046195c8262fadf9aac3738426e7816d8b7a7827
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101	   ADDRESS_SYMBOLIC
102	     A constant symbolic address, in the pc-relative literal pool.  */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
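/* For illustration only (not derived from this file), typical assembly
   forms that fall under each classification above:

     ADDRESS_REG_IMM	ldr  x0, [x1, #16]
     ADDRESS_REG_WB	ldr  x0, [x1, #16]!  or  ldr x0, [x1], #16
     ADDRESS_REG_REG	ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW	ldr  x0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW	ldr  x0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM	ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC	ldr  x0, <literal-pool label>  (pc-relative).  */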
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
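/* A worked example of the encoding above (illustrative, not taken from
   the callers): a vector constant that duplicates 0x00ab0000 into every
   32-bit lane is described as elt_mode = SImode, value = 0xab,
   insn = MOV, modifier = LSL, shift = 16, matching
   "movi v0.4s, #0xab, lsl #16".  An SVE series constant such as
   { 1, 3, 5, ... } instead uses the (value, step) constructor with
   value = 1 and step = 2.  */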
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
210 aarch64_addr_query_type);
211
212 /* Major revision number of the ARM Architecture implemented by the target. */
213 unsigned aarch64_architecture_version;
214
215 /* The processor for which instructions should be scheduled. */
216 enum aarch64_processor aarch64_tune = cortexa53;
217
218 /* Mask to specify which instruction scheduling options should be used. */
219 unsigned long aarch64_tune_flags = 0;
220
221 /* Global flag for PC relative loads. */
222 bool aarch64_pcrelative_literal_loads;
223
224 /* Global flag for whether frame pointer is enabled. */
225 bool aarch64_use_frame_pointer;
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 0, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
322 {
323 {
324 1, /* hi */
325 1, /* si */
326 1, /* di */
327 2, /* ti */
328 },
329 1, /* pre_modify */
330 1, /* post_modify */
331 3, /* register_offset */
332 4, /* register_sextend */
333 3, /* register_zextend */
334 2, /* imm_offset */
335 };
336
337 static const struct cpu_regmove_cost generic_regmove_cost =
338 {
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
345 };
346
347 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 {
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost. */
352 5, /* GP2FP */
353 5, /* FP2GP */
354 2 /* FP2FP */
355 };
356
357 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 {
359 1, /* GP2GP */
360 /* Avoid the use of slow int<->fp moves for spilling by setting
361 their cost higher than memmov_cost. */
362 5, /* GP2FP */
363 5, /* FP2GP */
364 2 /* FP2FP */
365 };
366
367 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 {
369 1, /* GP2GP */
370 /* Avoid the use of slow int<->fp moves for spilling by setting
371	     their cost higher than memmov_cost (the actual costs are 4 and 9).  */
372 9, /* GP2FP */
373 9, /* FP2GP */
374 1 /* FP2FP */
375 };
376
377 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 {
379 2, /* GP2GP */
380 2, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost. */
390 8, /* GP2FP */
391 8, /* FP2GP */
392 2 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 {
397 2, /* GP2GP */
398 /* Avoid the use of int<->fp moves for spilling. */
399 6, /* GP2FP */
400 6, /* FP2GP */
401 4 /* FP2FP */
402 };
403
404 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 {
406 1, /* GP2GP */
407 /* Avoid the use of int<->fp moves for spilling. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 4 /* FP2FP */
411 };
412
413 /* Generic costs for vector insn classes. */
414 static const struct cpu_vector_cost generic_vector_cost =
415 {
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 1, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 1, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 2, /* vec_permute_cost */
423 1, /* vec_to_scalar_cost */
424 1, /* scalar_to_vec_cost */
425 1, /* vec_align_load_cost */
426 1, /* vec_unalign_load_cost */
427 1, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 1 /* cond_not_taken_branch_cost */
431 };
432
433 /* ThunderX costs for vector insn classes. */
434 static const struct cpu_vector_cost thunderx_vector_cost =
435 {
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 3, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 4, /* vec_int_stmt_cost */
441 1, /* vec_fp_stmt_cost */
442 4, /* vec_permute_cost */
443 2, /* vec_to_scalar_cost */
444 2, /* scalar_to_vec_cost */
445 3, /* vec_align_load_cost */
446 5, /* vec_unalign_load_cost */
447 5, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 3, /* cond_taken_branch_cost */
450 3 /* cond_not_taken_branch_cost */
451 };
452
453	/* Costs for vector insn classes for Cortex-A57.  */
454 static const struct cpu_vector_cost cortexa57_vector_cost =
455 {
456 1, /* scalar_int_stmt_cost */
457 1, /* scalar_fp_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 2, /* vec_int_stmt_cost */
461 2, /* vec_fp_stmt_cost */
462 3, /* vec_permute_cost */
463 8, /* vec_to_scalar_cost */
464 8, /* scalar_to_vec_cost */
465 4, /* vec_align_load_cost */
466 4, /* vec_unalign_load_cost */
467 1, /* vec_unalign_store_cost */
468 1, /* vec_store_cost */
469 1, /* cond_taken_branch_cost */
470 1 /* cond_not_taken_branch_cost */
471 };
472
473 static const struct cpu_vector_cost exynosm1_vector_cost =
474 {
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 3, /* vec_int_stmt_cost */
480 3, /* vec_fp_stmt_cost */
481 3, /* vec_permute_cost */
482 3, /* vec_to_scalar_cost */
483 3, /* scalar_to_vec_cost */
484 5, /* vec_align_load_cost */
485 5, /* vec_unalign_load_cost */
486 1, /* vec_unalign_store_cost */
487 1, /* vec_store_cost */
488 1, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
490 };
491
492	/* Costs for vector insn classes for X-Gene 1.  */
493 static const struct cpu_vector_cost xgene1_vector_cost =
494 {
495 1, /* scalar_int_stmt_cost */
496 1, /* scalar_fp_stmt_cost */
497 5, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 2, /* vec_int_stmt_cost */
500 2, /* vec_fp_stmt_cost */
501 2, /* vec_permute_cost */
502 4, /* vec_to_scalar_cost */
503 4, /* scalar_to_vec_cost */
504 10, /* vec_align_load_cost */
505 10, /* vec_unalign_load_cost */
506 2, /* vec_unalign_store_cost */
507 2, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
510 };
511
512 /* Costs for vector insn classes for Vulcan. */
513 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 {
515 1, /* scalar_int_stmt_cost */
516 6, /* scalar_fp_stmt_cost */
517 4, /* scalar_load_cost */
518 1, /* scalar_store_cost */
519 5, /* vec_int_stmt_cost */
520 6, /* vec_fp_stmt_cost */
521 3, /* vec_permute_cost */
522 6, /* vec_to_scalar_cost */
523 5, /* scalar_to_vec_cost */
524 8, /* vec_align_load_cost */
525 8, /* vec_unalign_load_cost */
526 4, /* vec_unalign_store_cost */
527 4, /* vec_store_cost */
528 2, /* cond_taken_branch_cost */
529 1 /* cond_not_taken_branch_cost */
530 };
531
532 /* Generic costs for branch instructions. */
533 static const struct cpu_branch_cost generic_branch_cost =
534 {
535 1, /* Predictable. */
536 3 /* Unpredictable. */
537 };
538
539 /* Generic approximation modes. */
540 static const cpu_approx_modes generic_approx_modes =
541 {
542 AARCH64_APPROX_NONE, /* division */
543 AARCH64_APPROX_NONE, /* sqrt */
544 AARCH64_APPROX_NONE /* recip_sqrt */
545 };
546
547 /* Approximation modes for Exynos M1. */
548 static const cpu_approx_modes exynosm1_approx_modes =
549 {
550 AARCH64_APPROX_NONE, /* division */
551 AARCH64_APPROX_ALL, /* sqrt */
552 AARCH64_APPROX_ALL /* recip_sqrt */
553 };
554
555 /* Approximation modes for X-Gene 1. */
556 static const cpu_approx_modes xgene1_approx_modes =
557 {
558 AARCH64_APPROX_NONE, /* division */
559 AARCH64_APPROX_NONE, /* sqrt */
560 AARCH64_APPROX_ALL /* recip_sqrt */
561 };
562
563 /* Generic prefetch settings (which disable prefetch). */
564 static const cpu_prefetch_tune generic_prefetch_tune =
565 {
566 0, /* num_slots */
567 -1, /* l1_cache_size */
568 -1, /* l1_cache_line_size */
569 -1, /* l2_cache_size */
570 true, /* prefetch_dynamic_strides */
571 -1, /* minimum_stride */
572 -1 /* default_opt_level */
573 };
574
575 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 {
577 0, /* num_slots */
578 -1, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 -1, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 -1 /* default_opt_level */
584 };
585
586 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 {
588 4, /* num_slots */
589 32, /* l1_cache_size */
590 64, /* l1_cache_line_size */
591 512, /* l2_cache_size */
592 false, /* prefetch_dynamic_strides */
593 2048, /* minimum_stride */
594 3 /* default_opt_level */
595 };
596
597 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 {
599 8, /* num_slots */
600 32, /* l1_cache_size */
601 128, /* l1_cache_line_size */
602 16*1024, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 3 /* default_opt_level */
606 };
607
608 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 {
610 8, /* num_slots */
611 32, /* l1_cache_size */
612 128, /* l1_cache_line_size */
613 -1, /* l2_cache_size */
614 true, /* prefetch_dynamic_strides */
615 -1, /* minimum_stride */
616 -1 /* default_opt_level */
617 };
618
619 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 {
621 8, /* num_slots */
622 32, /* l1_cache_size */
623 64, /* l1_cache_line_size */
624 256, /* l2_cache_size */
625 true, /* prefetch_dynamic_strides */
626 -1, /* minimum_stride */
627 -1 /* default_opt_level */
628 };
629
630 static const struct tune_params generic_tunings =
631 {
632 &cortexa57_extra_costs,
633 &generic_addrcost_table,
634 &generic_regmove_cost,
635 &generic_vector_cost,
636 &generic_branch_cost,
637 &generic_approx_modes,
638 4, /* memmov_cost */
639 2, /* issue_rate */
640 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
641 "8", /* function_align. */
642 "4", /* jump_align. */
643 "8", /* loop_align. */
644 2, /* int_reassoc_width. */
645 4, /* fp_reassoc_width. */
646 1, /* vec_reassoc_width. */
647 2, /* min_div_recip_mul_sf. */
648 2, /* min_div_recip_mul_df. */
649 0, /* max_case_values. */
650 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
651 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
652 &generic_prefetch_tune
653 };
654
655 static const struct tune_params cortexa35_tunings =
656 {
657 &cortexa53_extra_costs,
658 &generic_addrcost_table,
659 &cortexa53_regmove_cost,
660 &generic_vector_cost,
661 &generic_branch_cost,
662 &generic_approx_modes,
663 4, /* memmov_cost */
664 1, /* issue_rate */
665 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
666 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
667 "16", /* function_align. */
668 "4", /* jump_align. */
669 "8", /* loop_align. */
670 2, /* int_reassoc_width. */
671 4, /* fp_reassoc_width. */
672 1, /* vec_reassoc_width. */
673 2, /* min_div_recip_mul_sf. */
674 2, /* min_div_recip_mul_df. */
675 0, /* max_case_values. */
676 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
677 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
678 &generic_prefetch_tune
679 };
680
681 static const struct tune_params cortexa53_tunings =
682 {
683 &cortexa53_extra_costs,
684 &generic_addrcost_table,
685 &cortexa53_regmove_cost,
686 &generic_vector_cost,
687 &generic_branch_cost,
688 &generic_approx_modes,
689 4, /* memmov_cost */
690 2, /* issue_rate */
691 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
692 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
693 "16", /* function_align. */
694 "4", /* jump_align. */
695 "8", /* loop_align. */
696 2, /* int_reassoc_width. */
697 4, /* fp_reassoc_width. */
698 1, /* vec_reassoc_width. */
699 2, /* min_div_recip_mul_sf. */
700 2, /* min_div_recip_mul_df. */
701 0, /* max_case_values. */
702 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
703 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
704 &generic_prefetch_tune
705 };
706
707 static const struct tune_params cortexa57_tunings =
708 {
709 &cortexa57_extra_costs,
710 &generic_addrcost_table,
711 &cortexa57_regmove_cost,
712 &cortexa57_vector_cost,
713 &generic_branch_cost,
714 &generic_approx_modes,
715 4, /* memmov_cost */
716 3, /* issue_rate */
717 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
718 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
719 "16", /* function_align. */
720 "4", /* jump_align. */
721 "8", /* loop_align. */
722 2, /* int_reassoc_width. */
723 4, /* fp_reassoc_width. */
724 1, /* vec_reassoc_width. */
725 2, /* min_div_recip_mul_sf. */
726 2, /* min_div_recip_mul_df. */
727 0, /* max_case_values. */
728 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
729 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
730 &generic_prefetch_tune
731 };
732
733 static const struct tune_params cortexa72_tunings =
734 {
735 &cortexa57_extra_costs,
736 &generic_addrcost_table,
737 &cortexa57_regmove_cost,
738 &cortexa57_vector_cost,
739 &generic_branch_cost,
740 &generic_approx_modes,
741 4, /* memmov_cost */
742 3, /* issue_rate */
743 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
744 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
745 "16", /* function_align. */
746 "4", /* jump_align. */
747 "8", /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
755 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
756 &generic_prefetch_tune
757 };
758
759 static const struct tune_params cortexa73_tunings =
760 {
761 &cortexa57_extra_costs,
762 &generic_addrcost_table,
763 &cortexa57_regmove_cost,
764 &cortexa57_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost. */
768 2, /* issue_rate. */
769 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
771 "16", /* function_align. */
772 "4", /* jump_align. */
773 "8", /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
781 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
782 &generic_prefetch_tune
783 };
784
785
786
787 static const struct tune_params exynosm1_tunings =
788 {
789 &exynosm1_extra_costs,
790 &exynosm1_addrcost_table,
791 &exynosm1_regmove_cost,
792 &exynosm1_vector_cost,
793 &generic_branch_cost,
794 &exynosm1_approx_modes,
795 4, /* memmov_cost */
796 3, /* issue_rate */
797 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
798 "4", /* function_align. */
799 "4", /* jump_align. */
800 "4", /* loop_align. */
801 2, /* int_reassoc_width. */
802 4, /* fp_reassoc_width. */
803 1, /* vec_reassoc_width. */
804 2, /* min_div_recip_mul_sf. */
805 2, /* min_div_recip_mul_df. */
806 48, /* max_case_values. */
807 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
808 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
809 &exynosm1_prefetch_tune
810 };
811
812 static const struct tune_params thunderxt88_tunings =
813 {
814 &thunderx_extra_costs,
815 &generic_addrcost_table,
816 &thunderx_regmove_cost,
817 &thunderx_vector_cost,
818 &generic_branch_cost,
819 &generic_approx_modes,
820 6, /* memmov_cost */
821 2, /* issue_rate */
822 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
823 "8", /* function_align. */
824 "8", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
834 &thunderxt88_prefetch_tune
835 };
836
837 static const struct tune_params thunderx_tunings =
838 {
839 &thunderx_extra_costs,
840 &generic_addrcost_table,
841 &thunderx_regmove_cost,
842 &thunderx_vector_cost,
843 &generic_branch_cost,
844 &generic_approx_modes,
845 6, /* memmov_cost */
846 2, /* issue_rate */
847 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
848 "8", /* function_align. */
849 "8", /* jump_align. */
850 "8", /* loop_align. */
851 2, /* int_reassoc_width. */
852 4, /* fp_reassoc_width. */
853 1, /* vec_reassoc_width. */
854 2, /* min_div_recip_mul_sf. */
855 2, /* min_div_recip_mul_df. */
856 0, /* max_case_values. */
857 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
858 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
859 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
860 &thunderx_prefetch_tune
861 };
862
863 static const struct tune_params xgene1_tunings =
864 {
865 &xgene1_extra_costs,
866 &xgene1_addrcost_table,
867 &xgene1_regmove_cost,
868 &xgene1_vector_cost,
869 &generic_branch_cost,
870 &xgene1_approx_modes,
871 6, /* memmov_cost */
872 4, /* issue_rate */
873 AARCH64_FUSE_NOTHING, /* fusible_ops */
874 "16", /* function_align. */
875 "8", /* jump_align. */
876 "16", /* loop_align. */
877 2, /* int_reassoc_width. */
878 4, /* fp_reassoc_width. */
879 1, /* vec_reassoc_width. */
880 2, /* min_div_recip_mul_sf. */
881 2, /* min_div_recip_mul_df. */
882 0, /* max_case_values. */
883 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
884 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
885 &generic_prefetch_tune
886 };
887
888 static const struct tune_params qdf24xx_tunings =
889 {
890 &qdf24xx_extra_costs,
891 &qdf24xx_addrcost_table,
892 &qdf24xx_regmove_cost,
893 &generic_vector_cost,
894 &generic_branch_cost,
895 &generic_approx_modes,
896 4, /* memmov_cost */
897 4, /* issue_rate */
898 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
899	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
900 "16", /* function_align. */
901 "8", /* jump_align. */
902 "16", /* loop_align. */
903 2, /* int_reassoc_width. */
904 4, /* fp_reassoc_width. */
905 1, /* vec_reassoc_width. */
906 2, /* min_div_recip_mul_sf. */
907 2, /* min_div_recip_mul_df. */
908 0, /* max_case_values. */
909 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
910 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
911 &qdf24xx_prefetch_tune
912 };
913
914 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
915 for now. */
916 static const struct tune_params saphira_tunings =
917 {
918 &generic_extra_costs,
919 &generic_addrcost_table,
920 &generic_regmove_cost,
921 &generic_vector_cost,
922 &generic_branch_cost,
923 &generic_approx_modes,
924 4, /* memmov_cost */
925 4, /* issue_rate */
926 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
927	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
928 "16", /* function_align. */
929 "8", /* jump_align. */
930 "16", /* loop_align. */
931 2, /* int_reassoc_width. */
932 4, /* fp_reassoc_width. */
933 1, /* vec_reassoc_width. */
934 2, /* min_div_recip_mul_sf. */
935 2, /* min_div_recip_mul_df. */
936 0, /* max_case_values. */
937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
939 &generic_prefetch_tune
940 };
941
942 static const struct tune_params thunderx2t99_tunings =
943 {
944 &thunderx2t99_extra_costs,
945 &thunderx2t99_addrcost_table,
946 &thunderx2t99_regmove_cost,
947 &thunderx2t99_vector_cost,
948 &generic_branch_cost,
949 &generic_approx_modes,
950 4, /* memmov_cost. */
951 4, /* issue_rate. */
952 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
953 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
954 "16", /* function_align. */
955 "8", /* jump_align. */
956 "16", /* loop_align. */
957 3, /* int_reassoc_width. */
958 2, /* fp_reassoc_width. */
959 2, /* vec_reassoc_width. */
960 2, /* min_div_recip_mul_sf. */
961 2, /* min_div_recip_mul_df. */
962 0, /* max_case_values. */
963 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
964 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
965 &thunderx2t99_prefetch_tune
966 };
967
968 /* Support for fine-grained override of the tuning structures. */
969 struct aarch64_tuning_override_function
970 {
971 const char* name;
972 void (*parse_override)(const char*, struct tune_params*);
973 };
974
975 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
976 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977
978 static const struct aarch64_tuning_override_function
979 aarch64_tuning_override_functions[] =
980 {
981 { "fuse", aarch64_parse_fuse_string },
982 { "tune", aarch64_parse_tune_string },
983 { NULL, NULL }
984 };
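/* These tables and parsers back the -moverride=<string> debugging
   option: the "fuse" and "tune" entries map the names defined in
   aarch64-fusion-pairs.def and aarch64-tuning-flags.def (plus the
   "none"/"all" shorthands above) onto the corresponding AARCH64_FUSE_*
   and AARCH64_EXTRA_TUNE_* bits in the active tune_params.  The exact
   override syntax is internal and may change between releases.  */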
985
986 /* A processor implementing AArch64. */
987 struct processor
988 {
989 const char *const name;
990 enum aarch64_processor ident;
991 enum aarch64_processor sched_core;
992 enum aarch64_arch arch;
993 unsigned architecture_version;
994 const unsigned long flags;
995 const struct tune_params *const tune;
996 };
997
998 /* Architectures implementing AArch64. */
999 static const struct processor all_architectures[] =
1000 {
1001 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1002 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1003 #include "aarch64-arches.def"
1004 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1005 };
1006
1007 /* Processor cores implementing AArch64. */
1008 static const struct processor all_cores[] =
1009 {
1010 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1011 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1012 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1013 FLAGS, &COSTS##_tunings},
1014 #include "aarch64-cores.def"
1015 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1016 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1017 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1018 };
1019
1020
1021 /* Target specification. These are populated by the -march, -mtune, -mcpu
1022 handling code or by target attributes. */
1023 static const struct processor *selected_arch;
1024 static const struct processor *selected_cpu;
1025 static const struct processor *selected_tune;
1026
1027 /* The current tuning set. */
1028 struct tune_params aarch64_tune_params = generic_tunings;
1029
1030 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031
1032 /* An ISA extension in the co-processor and main instruction set space. */
1033 struct aarch64_option_extension
1034 {
1035 const char *const name;
1036 const unsigned long flags_on;
1037 const unsigned long flags_off;
1038 };
1039
1040 typedef enum aarch64_cond_code
1041 {
1042 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1043 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1044 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 }
1046 aarch64_cc;
1047
1048 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049
1050 /* The condition codes of the processor, and the inverse function. */
1051 static const char * const aarch64_condition_codes[] =
1052 {
1053 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1054 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1055 };
1056
1057	/* Generate code to enable conditional branches in functions whose size exceeds the 1 MiB range of a conditional branch.  */
1058 const char *
1059 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1060 const char * branch_format)
1061 {
1062 rtx_code_label * tmp_label = gen_label_rtx ();
1063 char label_buf[256];
1064 char buffer[128];
1065 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1066 CODE_LABEL_NUMBER (tmp_label));
1067 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1068 rtx dest_label = operands[pos_label];
1069 operands[pos_label] = tmp_label;
1070
1071 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1072 output_asm_insn (buffer, operands);
1073
1074 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1075 operands[pos_label] = dest_label;
1076 output_asm_insn (buffer, operands);
1077 return "";
1078 }
1079
1080 void
1081 aarch64_err_no_fpadvsimd (machine_mode mode)
1082 {
1083 if (TARGET_GENERAL_REGS_ONLY)
1084 if (FLOAT_MODE_P (mode))
1085 error ("%qs is incompatible with the use of floating-point types",
1086 "-mgeneral-regs-only");
1087 else
1088 error ("%qs is incompatible with the use of vector types",
1089 "-mgeneral-regs-only");
1090 else
1091 if (FLOAT_MODE_P (mode))
1092 error ("%qs feature modifier is incompatible with the use of"
1093 " floating-point types", "+nofp");
1094 else
1095 error ("%qs feature modifier is incompatible with the use of"
1096 " vector types", "+nofp");
1097 }
1098
1099 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1100 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1101 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1102 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1103 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104	   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1105 cost results in bad allocations with many redundant int<->FP moves which
1106 are expensive on various cores.
1107 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1108 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1109 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1110 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1111 The result of this is that it is no longer inefficient to have a higher
1112 memory move cost than the register move cost.
1113 */
1114
1115 static reg_class_t
1116 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1117 reg_class_t best_class)
1118 {
1119 machine_mode mode;
1120
1121 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1122 || !reg_class_subset_p (FP_REGS, allocno_class))
1123 return allocno_class;
1124
1125 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1126 || !reg_class_subset_p (FP_REGS, best_class))
1127 return best_class;
1128
1129 mode = PSEUDO_REGNO_MODE (regno);
1130 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1131 }
1132
1133 static unsigned int
1134 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1135 {
1136 if (GET_MODE_UNIT_SIZE (mode) == 4)
1137 return aarch64_tune_params.min_div_recip_mul_sf;
1138 return aarch64_tune_params.min_div_recip_mul_df;
1139 }
1140
1141 /* Return the reassociation width of treeop OPC with mode MODE. */
1142 static int
1143 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1144 {
1145 if (VECTOR_MODE_P (mode))
1146 return aarch64_tune_params.vec_reassoc_width;
1147 if (INTEGRAL_MODE_P (mode))
1148 return aarch64_tune_params.int_reassoc_width;
1149 /* Avoid reassociating floating point addition so we emit more FMAs. */
1150 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1151 return aarch64_tune_params.fp_reassoc_width;
1152 return 1;
1153 }
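/* As an illustrative consequence of the generic tuning above
   (fp_reassoc_width == 4, int_reassoc_width == 2): a floating-point
   product chain a * b * c * d * e may be split into up to four
   independent sub-chains, whereas floating-point additions report a
   width of 1 so that chains like acc + x * y remain fusable into
   FMAs.  */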
1154
1155 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1156 unsigned
1157 aarch64_dbx_register_number (unsigned regno)
1158 {
1159 if (GP_REGNUM_P (regno))
1160 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1161 else if (regno == SP_REGNUM)
1162 return AARCH64_DWARF_SP;
1163 else if (FP_REGNUM_P (regno))
1164 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1165 else if (PR_REGNUM_P (regno))
1166 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1167 else if (regno == VG_REGNUM)
1168 return AARCH64_DWARF_VG;
1169
1170 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1171 equivalent DWARF register. */
1172 return DWARF_FRAME_REGISTERS;
1173 }
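/* Assuming the AARCH64_DWARF_* constants follow the AArch64 DWARF ABI
   as usual, this maps, for example, x0-x30 to 0-30, sp to 31,
   v0-v31 to 64-95, the SVE predicate registers p0-p15 to 48-63 and
   the VG pseudo register to 46.  */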
1174
1175 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1176 static bool
1177 aarch64_advsimd_struct_mode_p (machine_mode mode)
1178 {
1179 return (TARGET_SIMD
1180 && (mode == OImode || mode == CImode || mode == XImode));
1181 }
1182
1183 /* Return true if MODE is an SVE predicate mode. */
1184 static bool
1185 aarch64_sve_pred_mode_p (machine_mode mode)
1186 {
1187 return (TARGET_SVE
1188 && (mode == VNx16BImode
1189 || mode == VNx8BImode
1190 || mode == VNx4BImode
1191 || mode == VNx2BImode));
1192 }
1193
1194 /* Three mutually-exclusive flags describing a vector or predicate type. */
1195 const unsigned int VEC_ADVSIMD = 1;
1196 const unsigned int VEC_SVE_DATA = 2;
1197 const unsigned int VEC_SVE_PRED = 4;
1198 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1199 a structure of 2, 3 or 4 vectors. */
1200 const unsigned int VEC_STRUCT = 8;
1201 /* Useful combinations of the above. */
1202 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1203 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1204
1205 /* Return a set of flags describing the vector properties of mode MODE.
1206 Ignore modes that are not supported by the current target. */
1207 static unsigned int
1208 aarch64_classify_vector_mode (machine_mode mode)
1209 {
1210 if (aarch64_advsimd_struct_mode_p (mode))
1211 return VEC_ADVSIMD | VEC_STRUCT;
1212
1213 if (aarch64_sve_pred_mode_p (mode))
1214 return VEC_SVE_PRED;
1215
1216 scalar_mode inner = GET_MODE_INNER (mode);
1217 if (VECTOR_MODE_P (mode)
1218 && (inner == QImode
1219 || inner == HImode
1220 || inner == HFmode
1221 || inner == SImode
1222 || inner == SFmode
1223 || inner == DImode
1224 || inner == DFmode))
1225 {
1226 if (TARGET_SVE)
1227 {
1228 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1229 return VEC_SVE_DATA;
1230 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1232 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1233 return VEC_SVE_DATA | VEC_STRUCT;
1234 }
1235
1236 /* This includes V1DF but not V1DI (which doesn't exist). */
1237 if (TARGET_SIMD
1238 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1239 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1240 return VEC_ADVSIMD;
1241 }
1242
1243 return 0;
1244 }
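/* Some illustrative classifications under the rules above: V8QI is
   VEC_ADVSIMD; OImode (an Advanced SIMD 2-vector tuple) is
   VEC_ADVSIMD | VEC_STRUCT; VNx4SI is VEC_SVE_DATA; a 2-vector SVE
   tuple mode such as VNx8SI is VEC_SVE_DATA | VEC_STRUCT; VNx4BI is
   VEC_SVE_PRED; scalar modes such as DImode classify as 0.  */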
1245
1246 /* Return true if MODE is any of the data vector modes, including
1247 structure modes. */
1248 static bool
1249 aarch64_vector_data_mode_p (machine_mode mode)
1250 {
1251 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1252 }
1253
1254 /* Return true if MODE is an SVE data vector mode; either a single vector
1255 or a structure of vectors. */
1256 static bool
1257 aarch64_sve_data_mode_p (machine_mode mode)
1258 {
1259 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1260 }
1261
1262 /* Implement target hook TARGET_ARRAY_MODE. */
1263 static opt_machine_mode
1264 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1265 {
1266 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1267 && IN_RANGE (nelems, 2, 4))
1268 return mode_for_vector (GET_MODE_INNER (mode),
1269 GET_MODE_NUNITS (mode) * nelems);
1270
1271 return opt_machine_mode ();
1272 }
1273
1274 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1275 static bool
1276 aarch64_array_mode_supported_p (machine_mode mode,
1277 unsigned HOST_WIDE_INT nelems)
1278 {
1279 if (TARGET_SIMD
1280 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1281 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1282 && (nelems >= 2 && nelems <= 4))
1283 return true;
1284
1285 return false;
1286 }
1287
1288 /* Return the SVE predicate mode to use for elements that have
1289 ELEM_NBYTES bytes, if such a mode exists. */
1290
1291 opt_machine_mode
1292 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1293 {
1294 if (TARGET_SVE)
1295 {
1296 if (elem_nbytes == 1)
1297 return VNx16BImode;
1298 if (elem_nbytes == 2)
1299 return VNx8BImode;
1300 if (elem_nbytes == 4)
1301 return VNx4BImode;
1302 if (elem_nbytes == 8)
1303 return VNx2BImode;
1304 }
1305 return opt_machine_mode ();
1306 }
1307
1308 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1309
1310 static opt_machine_mode
1311 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1312 {
1313 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1314 {
1315 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1316 machine_mode pred_mode;
1317 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1318 return pred_mode;
1319 }
1320
1321 return default_get_mask_mode (nunits, nbytes);
1322 }
1323
1324 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1325 prefer to use the first arithmetic operand as the else value if
1326 the else value doesn't matter, since that exactly matches the SVE
1327 destructive merging form. For ternary operations we could either
1328 pick the first operand and use FMAD-like instructions or the last
1329 operand and use FMLA-like instructions; the latter seems more
1330 natural. */
1331
1332 static tree
1333 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1334 {
1335 return nops == 3 ? ops[2] : ops[0];
1336 }
1337
1338 /* Implement TARGET_HARD_REGNO_NREGS. */
1339
1340 static unsigned int
1341 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1342 {
1343 /* ??? Logically we should only need to provide a value when
1344 HARD_REGNO_MODE_OK says that the combination is valid,
1345 but at the moment we need to handle all modes. Just ignore
1346 any runtime parts for registers that can't store them. */
1347 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1348 switch (aarch64_regno_regclass (regno))
1349 {
1350 case FP_REGS:
1351 case FP_LO_REGS:
1352 if (aarch64_sve_data_mode_p (mode))
1353 return exact_div (GET_MODE_SIZE (mode),
1354 BYTES_PER_SVE_VECTOR).to_constant ();
1355 return CEIL (lowest_size, UNITS_PER_VREG);
1356 case PR_REGS:
1357 case PR_LO_REGS:
1358 case PR_HI_REGS:
1359 return 1;
1360 default:
1361 return CEIL (lowest_size, UNITS_PER_WORD);
1362 }
1363 gcc_unreachable ();
1364 }
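/* Worked examples of the above: a TImode value needs two X registers
   (CEIL (16, UNITS_PER_WORD)) but only one V register; an Advanced
   SIMD 3-vector tuple such as CImode occupies three consecutive V
   registers; and a single SVE data vector always reports one register,
   regardless of the runtime vector length.  */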
1365
1366 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1367
1368 static bool
1369 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1370 {
1371 if (GET_MODE_CLASS (mode) == MODE_CC)
1372 return regno == CC_REGNUM;
1373
1374 if (regno == VG_REGNUM)
1375 /* This must have the same size as _Unwind_Word. */
1376 return mode == DImode;
1377
1378 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1379 if (vec_flags & VEC_SVE_PRED)
1380 return PR_REGNUM_P (regno);
1381
1382 if (PR_REGNUM_P (regno))
1383 return 0;
1384
1385 if (regno == SP_REGNUM)
1386 /* The purpose of comparing with ptr_mode is to support the
1387 global register variable associated with the stack pointer
1388 register via the syntax of asm ("wsp") in ILP32. */
1389 return mode == Pmode || mode == ptr_mode;
1390
1391 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1392 return mode == Pmode;
1393
1394 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1395 return true;
1396
1397 if (FP_REGNUM_P (regno))
1398 {
1399 if (vec_flags & VEC_STRUCT)
1400 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1401 else
1402 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1403 }
1404
1405 return false;
1406 }
1407
1408 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1409 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1410 clobbers the top 64 bits when restoring the bottom 64 bits. */
1411
1412 static bool
1413 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1414 {
1415 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1416 }
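/* For example, a 128-bit value (say V4SFmode) that is live in one of
   the callee-saved registers v8-v15 across a call must still be saved
   and restored in full, because the callee is only required to
   preserve the low 64 bits of those registers.  */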
1417
1418 /* Implement REGMODE_NATURAL_SIZE. */
1419 poly_uint64
1420 aarch64_regmode_natural_size (machine_mode mode)
1421 {
1422 /* The natural size for SVE data modes is one SVE data vector,
1423 and similarly for predicates. We can't independently modify
1424 anything smaller than that. */
1425 /* ??? For now, only do this for variable-width SVE registers.
1426 Doing it for constant-sized registers breaks lower-subreg.c. */
1427 /* ??? And once that's fixed, we should probably have similar
1428 code for Advanced SIMD. */
1429 if (!aarch64_sve_vg.is_constant ())
1430 {
1431 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1432 if (vec_flags & VEC_SVE_PRED)
1433 return BYTES_PER_SVE_PRED;
1434 if (vec_flags & VEC_SVE_DATA)
1435 return BYTES_PER_SVE_VECTOR;
1436 }
1437 return UNITS_PER_WORD;
1438 }
1439
1440 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1441 machine_mode
1442 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1443 machine_mode mode)
1444 {
1445 /* The predicate mode determines which bits are significant and
1446 which are "don't care". Decreasing the number of lanes would
1447 lose data while increasing the number of lanes would make bits
1448 unnecessarily significant. */
1449 if (PR_REGNUM_P (regno))
1450 return mode;
1451 if (known_ge (GET_MODE_SIZE (mode), 4))
1452 return mode;
1453 else
1454 return SImode;
1455 }
1456
1457 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1458 that strcpy from constants will be faster. */
1459
1460 static HOST_WIDE_INT
1461 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1462 {
1463 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1464 return MAX (align, BITS_PER_WORD);
1465 return align;
1466 }
1467
1468 /* Return true if calls to DECL should be treated as
1469	   long-calls (i.e. called via a register).  */
1470 static bool
1471 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1472 {
1473 return false;
1474 }
1475
1476 /* Return true if calls to symbol-ref SYM should be treated as
1477	   long-calls (i.e. called via a register).  */
1478 bool
1479 aarch64_is_long_call_p (rtx sym)
1480 {
1481 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1482 }
1483
1484 /* Return true if calls to symbol-ref SYM should not go through
1485 plt stubs. */
1486
1487 bool
1488 aarch64_is_noplt_call_p (rtx sym)
1489 {
1490 const_tree decl = SYMBOL_REF_DECL (sym);
1491
1492 if (flag_pic
1493 && decl
1494 && (!flag_plt
1495 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1496 && !targetm.binds_local_p (decl))
1497 return true;
1498
1499 return false;
1500 }
1501
1502 /* Return true if the offsets to a zero/sign-extract operation
1503 represent an expression that matches an extend operation. The
1504	   operands represent the parameters from
1505
1506 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1507 bool
1508 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1509 rtx extract_imm)
1510 {
1511 HOST_WIDE_INT mult_val, extract_val;
1512
1513 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1514 return false;
1515
1516 mult_val = INTVAL (mult_imm);
1517 extract_val = INTVAL (extract_imm);
1518
1519 if (extract_val > 8
1520 && extract_val < GET_MODE_BITSIZE (mode)
1521 && exact_log2 (extract_val & ~7) > 0
1522 && (extract_val & 7) <= 4
1523 && mult_val == (1 << (extract_val & 7)))
1524 return true;
1525
1526 return false;
1527 }
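/* A worked example (illustrative only): extract_val == 34 with
   mult_val == 4 passes the checks above, since 34 & ~7 == 32 is a
   power of two, the low bits give a shift of 2, and 1 << 2 == 4.
   This corresponds to a 32-bit value extended and shifted left by 2,
   i.e. an extended-register operand such as "uxtw #2"/"sxtw #2".  */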
1528
1529 /* Emit an insn that's a simple single-set. Both the operands must be
1530 known to be valid. */
1531 inline static rtx_insn *
1532 emit_set_insn (rtx x, rtx y)
1533 {
1534 return emit_insn (gen_rtx_SET (x, y));
1535 }
1536
1537 /* X and Y are two things to compare using CODE. Emit the compare insn and
1538 return the rtx for register 0 in the proper mode. */
1539 rtx
1540 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1541 {
1542 machine_mode mode = SELECT_CC_MODE (code, x, y);
1543 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1544
1545 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1546 return cc_reg;
1547 }
1548
1549 /* Build the SYMBOL_REF for __tls_get_addr. */
1550
1551 static GTY(()) rtx tls_get_addr_libfunc;
1552
1553 rtx
1554 aarch64_tls_get_addr (void)
1555 {
1556 if (!tls_get_addr_libfunc)
1557 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1558 return tls_get_addr_libfunc;
1559 }
1560
1561 /* Return the TLS model to use for ADDR. */
1562
1563 static enum tls_model
1564 tls_symbolic_operand_type (rtx addr)
1565 {
1566 enum tls_model tls_kind = TLS_MODEL_NONE;
1567 if (GET_CODE (addr) == CONST)
1568 {
1569 poly_int64 addend;
1570 rtx sym = strip_offset (addr, &addend);
1571 if (GET_CODE (sym) == SYMBOL_REF)
1572 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1573 }
1574 else if (GET_CODE (addr) == SYMBOL_REF)
1575 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1576
1577 return tls_kind;
1578 }
1579
1580	/* We allow LO_SUMs in our legitimate addresses
1581	   so that combine can take care of combining addresses where
1582	   necessary, but for generation purposes we generate the address
1583	   as:
1584 RTL Absolute
1585 tmp = hi (symbol_ref); adrp x1, foo
1586 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1587 nop
1588
1589 PIC TLS
1590 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1591 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1592 bl __tls_get_addr
1593 nop
1594
1595 Load TLS symbol, depending on TLS mechanism and TLS access model.
1596
1597 Global Dynamic - Traditional TLS:
1598 adrp tmp, :tlsgd:imm
1599 add dest, tmp, #:tlsgd_lo12:imm
1600 bl __tls_get_addr
1601
1602 Global Dynamic - TLS Descriptors:
1603 adrp dest, :tlsdesc:imm
1604 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1605 add dest, dest, #:tlsdesc_lo12:imm
1606 blr tmp
1607 mrs tp, tpidr_el0
1608 add dest, dest, tp
1609
1610 Initial Exec:
1611 mrs tp, tpidr_el0
1612 adrp tmp, :gottprel:imm
1613 ldr dest, [tmp, #:gottprel_lo12:imm]
1614 add dest, dest, tp
1615
1616 Local Exec:
1617 mrs tp, tpidr_el0
1618 add t0, tp, #:tprel_hi12:imm, lsl #12
1619 add t0, t0, #:tprel_lo12_nc:imm
1620 */
1621
1622 static void
1623 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1624 enum aarch64_symbol_type type)
1625 {
1626 switch (type)
1627 {
1628 case SYMBOL_SMALL_ABSOLUTE:
1629 {
1630 /* In ILP32, the mode of dest can be either SImode or DImode. */
1631 rtx tmp_reg = dest;
1632 machine_mode mode = GET_MODE (dest);
1633
1634 gcc_assert (mode == Pmode || mode == ptr_mode);
1635
1636 if (can_create_pseudo_p ())
1637 tmp_reg = gen_reg_rtx (mode);
1638
1639 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1640 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1641 return;
1642 }
1643
1644 case SYMBOL_TINY_ABSOLUTE:
1645 emit_insn (gen_rtx_SET (dest, imm));
1646 return;
1647
1648 case SYMBOL_SMALL_GOT_28K:
1649 {
1650 machine_mode mode = GET_MODE (dest);
1651 rtx gp_rtx = pic_offset_table_rtx;
1652 rtx insn;
1653 rtx mem;
1654
1655	  /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1656	     here before RTL expansion.  Tree IVOPTS generates RTL patterns to
1657	     estimate rtx costs, in which case pic_offset_table_rtx is not yet
1658	     initialized.  In that case there is no need to generate the first adrp
1659	     instruction, as the final cost of a global variable access is
1660	     one instruction.  */
1661 if (gp_rtx != NULL)
1662 {
1663	      /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1664	         use the page base as the GOT base, the first page may be wasted;
1665	         in the worst case only 28K of GOT space remains).
1666
1667	         The generated instruction sequence for accessing a global variable
1668	         is:
1669
1670 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1671
1672	         Only one instruction is needed.  But we must initialize
1673	         pic_offset_table_rtx properly.  We generate an initialization insn for
1674	         every global access, and let CSE remove the redundant copies.
1675
1676	         The final instruction sequence will look like the following
1677	         for multiple global variable accesses.
1678
1679 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1680
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1683 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1684 ... */
1685
1686 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1687 crtl->uses_pic_offset_table = 1;
1688 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1689
1690 if (mode != GET_MODE (gp_rtx))
1691 gp_rtx = gen_lowpart (mode, gp_rtx);
1692
1693 }
1694
1695 if (mode == ptr_mode)
1696 {
1697 if (mode == DImode)
1698 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1699 else
1700 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1701
1702 mem = XVECEXP (SET_SRC (insn), 0, 0);
1703 }
1704 else
1705 {
1706 gcc_assert (mode == Pmode);
1707
1708 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1709 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1710 }
1711
1712	      /* The operand is expected to be a MEM.  Whenever the related insn
1713	         pattern changes, the code above which calculates MEM should be
1714	         updated.  */
1715 gcc_assert (GET_CODE (mem) == MEM);
1716 MEM_READONLY_P (mem) = 1;
1717 MEM_NOTRAP_P (mem) = 1;
1718 emit_insn (insn);
1719 return;
1720 }
1721
1722 case SYMBOL_SMALL_GOT_4G:
1723 {
1724 /* In ILP32, the mode of dest can be either SImode or DImode,
1725 while the got entry is always of SImode size. The mode of
1726 dest depends on how dest is used: if dest is assigned to a
1727 pointer (e.g. in the memory), it has SImode; it may have
1728	         DImode if dest is dereferenced to access the memory.
1729 This is why we have to handle three different ldr_got_small
1730 patterns here (two patterns for ILP32). */
1731
1732 rtx insn;
1733 rtx mem;
1734 rtx tmp_reg = dest;
1735 machine_mode mode = GET_MODE (dest);
1736
1737 if (can_create_pseudo_p ())
1738 tmp_reg = gen_reg_rtx (mode);
1739
1740 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1741 if (mode == ptr_mode)
1742 {
1743 if (mode == DImode)
1744 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1745 else
1746 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1747
1748 mem = XVECEXP (SET_SRC (insn), 0, 0);
1749 }
1750 else
1751 {
1752 gcc_assert (mode == Pmode);
1753
1754 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1755 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1756 }
1757
1758 gcc_assert (GET_CODE (mem) == MEM);
1759 MEM_READONLY_P (mem) = 1;
1760 MEM_NOTRAP_P (mem) = 1;
1761 emit_insn (insn);
1762 return;
1763 }
1764
1765 case SYMBOL_SMALL_TLSGD:
1766 {
1767 rtx_insn *insns;
1768 machine_mode mode = GET_MODE (dest);
1769 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1770
1771 start_sequence ();
1772 if (TARGET_ILP32)
1773 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1774 else
1775 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1776 insns = get_insns ();
1777 end_sequence ();
1778
1779 RTL_CONST_CALL_P (insns) = 1;
1780 emit_libcall_block (insns, dest, result, imm);
1781 return;
1782 }
1783
1784 case SYMBOL_SMALL_TLSDESC:
1785 {
1786 machine_mode mode = GET_MODE (dest);
1787 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1788 rtx tp;
1789
1790 gcc_assert (mode == Pmode || mode == ptr_mode);
1791
1792 /* In ILP32, the got entry is always of SImode size. Unlike
1793 small GOT, the dest is fixed at reg 0. */
1794 if (TARGET_ILP32)
1795 emit_insn (gen_tlsdesc_small_si (imm));
1796 else
1797 emit_insn (gen_tlsdesc_small_di (imm));
1798 tp = aarch64_load_tp (NULL);
1799
1800 if (mode != Pmode)
1801 tp = gen_lowpart (mode, tp);
1802
1803 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1804 if (REG_P (dest))
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1806 return;
1807 }
1808
1809 case SYMBOL_SMALL_TLSIE:
1810 {
1811 /* In ILP32, the mode of dest can be either SImode or DImode,
1812 while the got entry is always of SImode size. The mode of
1813 dest depends on how dest is used: if dest is assigned to a
1814 pointer (e.g. in the memory), it has SImode; it may have
1815	         DImode if dest is dereferenced to access the memory.
1816 This is why we have to handle three different tlsie_small
1817 patterns here (two patterns for ILP32). */
1818 machine_mode mode = GET_MODE (dest);
1819 rtx tmp_reg = gen_reg_rtx (mode);
1820 rtx tp = aarch64_load_tp (NULL);
1821
1822 if (mode == ptr_mode)
1823 {
1824 if (mode == DImode)
1825 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1826 else
1827 {
1828 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1829 tp = gen_lowpart (mode, tp);
1830 }
1831 }
1832 else
1833 {
1834 gcc_assert (mode == Pmode);
1835 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1836 }
1837
1838 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1839 if (REG_P (dest))
1840 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1841 return;
1842 }
1843
1844 case SYMBOL_TLSLE12:
1845 case SYMBOL_TLSLE24:
1846 case SYMBOL_TLSLE32:
1847 case SYMBOL_TLSLE48:
1848 {
1849 machine_mode mode = GET_MODE (dest);
1850 rtx tp = aarch64_load_tp (NULL);
1851
1852 if (mode != Pmode)
1853 tp = gen_lowpart (mode, tp);
1854
1855 switch (type)
1856 {
1857 case SYMBOL_TLSLE12:
1858 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1859 (dest, tp, imm));
1860 break;
1861 case SYMBOL_TLSLE24:
1862 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1863 (dest, tp, imm));
1864 break;
1865 case SYMBOL_TLSLE32:
1866 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1867 (dest, imm));
1868 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1869 (dest, dest, tp));
1870 break;
1871 case SYMBOL_TLSLE48:
1872 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1873 (dest, imm));
1874 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1875 (dest, dest, tp));
1876 break;
1877 default:
1878 gcc_unreachable ();
1879 }
1880
1881 if (REG_P (dest))
1882 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1883 return;
1884 }
1885
1886 case SYMBOL_TINY_GOT:
1887 emit_insn (gen_ldr_got_tiny (dest, imm));
1888 return;
1889
1890 case SYMBOL_TINY_TLSIE:
1891 {
1892 machine_mode mode = GET_MODE (dest);
1893 rtx tp = aarch64_load_tp (NULL);
1894
1895 if (mode == ptr_mode)
1896 {
1897 if (mode == DImode)
1898 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1899 else
1900 {
1901 tp = gen_lowpart (mode, tp);
1902 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1903 }
1904 }
1905 else
1906 {
1907 gcc_assert (mode == Pmode);
1908 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1909 }
1910
1911 if (REG_P (dest))
1912 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1913 return;
1914 }
1915
1916 default:
1917 gcc_unreachable ();
1918 }
1919 }
1920
1921 /* Emit a move from SRC to DEST. Assume that the move expanders can
1922 handle all moves if !can_create_pseudo_p (). The distinction is
1923 important because, unlike emit_move_insn, the move expanders know
1924 how to force Pmode objects into the constant pool even when the
1925 constant pool address is not itself legitimate. */
1926 static rtx
1927 aarch64_emit_move (rtx dest, rtx src)
1928 {
1929 return (can_create_pseudo_p ()
1930 ? emit_move_insn (dest, src)
1931 : emit_move_insn_1 (dest, src));
1932 }
1933
1934 /* Apply UNOPTAB to OP and store the result in DEST. */
1935
1936 static void
1937 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1938 {
1939 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1940 if (dest != tmp)
1941 emit_move_insn (dest, tmp);
1942 }
1943
1944 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1945
1946 static void
1947 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1948 {
1949 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1950 OPTAB_DIRECT);
1951 if (dest != tmp)
1952 emit_move_insn (dest, tmp);
1953 }
1954
1955 /* Split a 128-bit move operation into two 64-bit move operations,
1956 taking care to handle partial overlap of register to register
1957 copies. Special cases are needed when moving between GP regs and
1958 FP regs. SRC can be a register, constant or memory; DST a register
1959 or memory. If either operand is memory it must not have any side
1960 effects. */
1961 void
1962 aarch64_split_128bit_move (rtx dst, rtx src)
1963 {
1964 rtx dst_lo, dst_hi;
1965 rtx src_lo, src_hi;
1966
1967 machine_mode mode = GET_MODE (dst);
1968
1969 gcc_assert (mode == TImode || mode == TFmode);
1970 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1971 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1972
1973 if (REG_P (dst) && REG_P (src))
1974 {
1975 int src_regno = REGNO (src);
1976 int dst_regno = REGNO (dst);
1977
1978 /* Handle FP <-> GP regs. */
1979 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1980 {
1981 src_lo = gen_lowpart (word_mode, src);
1982 src_hi = gen_highpart (word_mode, src);
1983
1984 if (mode == TImode)
1985 {
1986 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1987 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1988 }
1989 else
1990 {
1991 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1992 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1993 }
1994 return;
1995 }
1996 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1997 {
1998 dst_lo = gen_lowpart (word_mode, dst);
1999 dst_hi = gen_highpart (word_mode, dst);
2000
2001 if (mode == TImode)
2002 {
2003 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
2004 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
2005 }
2006 else
2007 {
2008 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
2009 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
2010 }
2011 return;
2012 }
2013 }
2014
2015 dst_lo = gen_lowpart (word_mode, dst);
2016 dst_hi = gen_highpart (word_mode, dst);
2017 src_lo = gen_lowpart (word_mode, src);
2018 src_hi = gen_highpart_mode (word_mode, mode, src);
2019
2020 /* At most one pairing may overlap. */
2021 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2022 {
2023 aarch64_emit_move (dst_hi, src_hi);
2024 aarch64_emit_move (dst_lo, src_lo);
2025 }
2026 else
2027 {
2028 aarch64_emit_move (dst_lo, src_lo);
2029 aarch64_emit_move (dst_hi, src_hi);
2030 }
2031 }
2032
2033 bool
2034 aarch64_split_128bit_move_p (rtx dst, rtx src)
2035 {
2036 return (! REG_P (src)
2037 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2038 }
2039
2040 /* Split a complex SIMD combine. */
2041
2042 void
2043 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2044 {
2045 machine_mode src_mode = GET_MODE (src1);
2046 machine_mode dst_mode = GET_MODE (dst);
2047
2048 gcc_assert (VECTOR_MODE_P (dst_mode));
2049 gcc_assert (register_operand (dst, dst_mode)
2050 && register_operand (src1, src_mode)
2051 && register_operand (src2, src_mode));
2052
2053 rtx (*gen) (rtx, rtx, rtx);
2054
2055 switch (src_mode)
2056 {
2057 case E_V8QImode:
2058 gen = gen_aarch64_simd_combinev8qi;
2059 break;
2060 case E_V4HImode:
2061 gen = gen_aarch64_simd_combinev4hi;
2062 break;
2063 case E_V2SImode:
2064 gen = gen_aarch64_simd_combinev2si;
2065 break;
2066 case E_V4HFmode:
2067 gen = gen_aarch64_simd_combinev4hf;
2068 break;
2069 case E_V2SFmode:
2070 gen = gen_aarch64_simd_combinev2sf;
2071 break;
2072 case E_DImode:
2073 gen = gen_aarch64_simd_combinedi;
2074 break;
2075 case E_DFmode:
2076 gen = gen_aarch64_simd_combinedf;
2077 break;
2078 default:
2079 gcc_unreachable ();
2080 }
2081
2082 emit_insn (gen (dst, src1, src2));
2083 return;
2084 }
2085
2086 /* Split a complex SIMD move. */
2087
2088 void
2089 aarch64_split_simd_move (rtx dst, rtx src)
2090 {
2091 machine_mode src_mode = GET_MODE (src);
2092 machine_mode dst_mode = GET_MODE (dst);
2093
2094 gcc_assert (VECTOR_MODE_P (dst_mode));
2095
2096 if (REG_P (dst) && REG_P (src))
2097 {
2098 rtx (*gen) (rtx, rtx);
2099
2100 gcc_assert (VECTOR_MODE_P (src_mode));
2101
2102 switch (src_mode)
2103 {
2104 case E_V16QImode:
2105 gen = gen_aarch64_split_simd_movv16qi;
2106 break;
2107 case E_V8HImode:
2108 gen = gen_aarch64_split_simd_movv8hi;
2109 break;
2110 case E_V4SImode:
2111 gen = gen_aarch64_split_simd_movv4si;
2112 break;
2113 case E_V2DImode:
2114 gen = gen_aarch64_split_simd_movv2di;
2115 break;
2116 case E_V8HFmode:
2117 gen = gen_aarch64_split_simd_movv8hf;
2118 break;
2119 case E_V4SFmode:
2120 gen = gen_aarch64_split_simd_movv4sf;
2121 break;
2122 case E_V2DFmode:
2123 gen = gen_aarch64_split_simd_movv2df;
2124 break;
2125 default:
2126 gcc_unreachable ();
2127 }
2128
2129 emit_insn (gen (dst, src));
2130 return;
2131 }
2132 }
2133
2134 bool
2135 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2136 machine_mode ymode, rtx y)
2137 {
2138 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2139 gcc_assert (r != NULL);
2140 return rtx_equal_p (x, r);
2141 }
2142
2143
2144 static rtx
2145 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2146 {
2147 if (can_create_pseudo_p ())
2148 return force_reg (mode, value);
2149 else
2150 {
2151 gcc_assert (x);
2152 aarch64_emit_move (x, value);
2153 return x;
2154 }
2155 }
2156
2157 /* Return true if we can move VALUE into a register using a single
2158 CNT[BHWD] instruction. */
2159
2160 static bool
2161 aarch64_sve_cnt_immediate_p (poly_int64 value)
2162 {
2163 HOST_WIDE_INT factor = value.coeffs[0];
2164 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2165 return (value.coeffs[1] == factor
2166 && IN_RANGE (factor, 2, 16 * 16)
2167 && (factor & 1) == 0
2168 && factor <= 16 * (factor & -factor));
2169 }
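
/* As a worked illustration (writing VQ for the number of 128-bit
   quadwords in an SVE vector): a value of (8, 8), i.e. 8 * VQ,
   satisfies the test above and corresponds to a single CNTH, whereas
   (34, 34) is rejected because it would need CNTD with an out-of-range
   multiplier of 17.  */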
2170
2171 /* Likewise for rtx X. */
2172
2173 bool
2174 aarch64_sve_cnt_immediate_p (rtx x)
2175 {
2176 poly_int64 value;
2177 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2178 }
2179
2180 /* Return the asm string for an instruction with a CNT-like vector size
2181 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2182 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2183 first part of the operands template (the part that comes before the
2184    vector size itself).  FACTOR is the count per 128-bit quadword.
2185 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2186 If it is zero, we can use any element size. */
2187
2188 static char *
2189 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2190 unsigned int factor,
2191 unsigned int nelts_per_vq)
2192 {
2193 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2194
2195 if (nelts_per_vq == 0)
2196 /* There is some overlap in the ranges of the four CNT instructions.
2197 Here we always use the smallest possible element size, so that the
2198        multiplier is 1 wherever possible.  */
2199 nelts_per_vq = factor & -factor;
2200 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2201 gcc_assert (IN_RANGE (shift, 1, 4));
2202 char suffix = "dwhb"[shift - 1];
2203
2204 factor >>= shift;
2205 unsigned int written;
2206 if (factor == 1)
2207 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2208 prefix, suffix, operands);
2209 else
2210 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2211 prefix, suffix, operands, factor);
2212 gcc_assert (written < sizeof (buffer));
2213 return buffer;
2214 }
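
/* For instance, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 8 with
   NELTS_PER_VQ of 0 prints as "cnth\t%x0", while a FACTOR of 32 prints
   as "cntb\t%x0, all, mul #2".  */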
2215
2216 /* Return the asm string for an instruction with a CNT-like vector size
2217 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2218 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2219 first part of the operands template (the part that comes before the
2220 vector size itself). X is the value of the vector size operand,
2221 as a polynomial integer rtx. */
2222
2223 char *
2224 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2225 rtx x)
2226 {
2227 poly_int64 value = rtx_to_poly_int64 (x);
2228 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2229 return aarch64_output_sve_cnt_immediate (prefix, operands,
2230 value.coeffs[1], 0);
2231 }
2232
2233 /* Return true if we can add VALUE to a register using a single ADDVL
2234 or ADDPL instruction. */
2235
2236 static bool
2237 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2238 {
2239 HOST_WIDE_INT factor = value.coeffs[0];
2240 if (factor == 0 || value.coeffs[1] != factor)
2241 return false;
2242 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2243 and a value of 16 is one vector width. */
2244 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2245 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2246 }
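
/* For example, (16, 16) -- one full vector -- can be added with
   "addvl #1" and (2, 2) -- one predicate width -- with "addpl #1";
   the immediate range of both instructions is [-32, 31].  */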
2247
2248 /* Likewise for rtx X. */
2249
2250 bool
2251 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2252 {
2253 poly_int64 value;
2254 return (poly_int_rtx_p (x, &value)
2255 && aarch64_sve_addvl_addpl_immediate_p (value));
2256 }
2257
2258 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2259 and storing the result in operand 0. */
2260
2261 char *
2262 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2263 {
2264 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2265 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2266 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2267
2268 /* Use INC or DEC if possible. */
2269 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2270 {
2271 if (aarch64_sve_cnt_immediate_p (offset_value))
2272 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2273 offset_value.coeffs[1], 0);
2274 if (aarch64_sve_cnt_immediate_p (-offset_value))
2275 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2276 -offset_value.coeffs[1], 0);
2277 }
2278
2279 int factor = offset_value.coeffs[1];
2280 if ((factor & 15) == 0)
2281 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2282 else
2283 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2284 return buffer;
2285 }
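
/* For instance, adding one full vector, an offset of (16, 16), prints as
   "addvl\t%x0, %x1, #1", or simply as "incb\t%x0" when DEST and BASE are
   the same general register.  */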
2286
2287 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2288 instruction. If it is, store the number of elements in each vector
2289 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2290 factor in *FACTOR_OUT (if nonnull). */
2291
2292 bool
2293 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2294 unsigned int *nelts_per_vq_out)
2295 {
2296 rtx elt;
2297 poly_int64 value;
2298
2299 if (!const_vec_duplicate_p (x, &elt)
2300 || !poly_int_rtx_p (elt, &value))
2301 return false;
2302
2303 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2304 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2305 /* There's no vector INCB. */
2306 return false;
2307
2308 HOST_WIDE_INT factor = value.coeffs[0];
2309 if (value.coeffs[1] != factor)
2310 return false;
2311
2312 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2313 if ((factor % nelts_per_vq) != 0
2314 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2315 return false;
2316
2317 if (factor_out)
2318 *factor_out = factor;
2319 if (nelts_per_vq_out)
2320 *nelts_per_vq_out = nelts_per_vq;
2321 return true;
2322 }
2323
2324 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2325 instruction. */
2326
2327 bool
2328 aarch64_sve_inc_dec_immediate_p (rtx x)
2329 {
2330 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2331 }
2332
2333 /* Return the asm template for an SVE vector INC or DEC instruction.
2334 OPERANDS gives the operands before the vector count and X is the
2335 value of the vector count operand itself. */
2336
2337 char *
2338 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2339 {
2340 int factor;
2341 unsigned int nelts_per_vq;
2342 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2343 gcc_unreachable ();
2344 if (factor < 0)
2345 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2346 nelts_per_vq);
2347 else
2348 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2349 nelts_per_vq);
2350 }
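
/* For example, a VNx8HI constant in which every element is (16, 16) --
   twice the number of halfword elements in the vector -- prints as
   "inch\t<operands>, all, mul #2".  */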
2351
2352 static int
2353 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2354 scalar_int_mode mode)
2355 {
2356 int i;
2357 unsigned HOST_WIDE_INT val, val2, mask;
2358 int one_match, zero_match;
2359 int num_insns;
2360
2361 val = INTVAL (imm);
2362
2363 if (aarch64_move_imm (val, mode))
2364 {
2365 if (generate)
2366 emit_insn (gen_rtx_SET (dest, imm));
2367 return 1;
2368 }
2369
2370 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2371 (with XXXX non-zero). In that case check to see if the move can be done in
2372 a smaller mode. */
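  /* For instance, 0x00001234ffff5678 is built here as a 32-bit move of
     0xffff5678 (a single MOVN in SImode) followed by a MOVK of 0x1234
     into bits [47:32].  */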
2373 val2 = val & 0xffffffff;
2374 if (mode == DImode
2375 && aarch64_move_imm (val2, SImode)
2376 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2377 {
2378 if (generate)
2379 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2380
2381 /* Check if we have to emit a second instruction by checking to see
2382 if any of the upper 32 bits of the original DI mode value is set. */
2383 if (val == val2)
2384 return 1;
2385
2386 i = (val >> 48) ? 48 : 32;
2387
2388 if (generate)
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2390 GEN_INT ((val >> i) & 0xffff)));
2391
2392 return 2;
2393 }
2394
2395 if ((val >> 32) == 0 || mode == SImode)
2396 {
2397 if (generate)
2398 {
2399 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2400 if (mode == SImode)
2401 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2402 GEN_INT ((val >> 16) & 0xffff)));
2403 else
2404 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2405 GEN_INT ((val >> 16) & 0xffff)));
2406 }
2407 return 2;
2408 }
2409
2410 /* Remaining cases are all for DImode. */
2411
2412 mask = 0xffff;
2413 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2414 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2415 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2416 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2417
2418 if (zero_match != 2 && one_match != 2)
2419 {
2420 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2421 For a 64-bit bitmask try whether changing 16 bits to all ones or
2422 zeroes creates a valid bitmask. To check any repeated bitmask,
2423 try using 16 bits from the other 32-bit half of val. */
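      /* For instance, for 0x1234aaaaaaaaaaaa the loop below finds the
	 bitmask immediate 0xaaaaaaaaaaaaaaaa at I == 48, so the value is
	 built as that bitmask MOV followed by a MOVK of 0x1234 into
	 bits [63:48].  */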
2424
2425 for (i = 0; i < 64; i += 16, mask <<= 16)
2426 {
2427 val2 = val & ~mask;
2428 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2429 break;
2430 val2 = val | mask;
2431 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2432 break;
2433 val2 = val2 & ~mask;
2434 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2435 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2436 break;
2437 }
2438 if (i != 64)
2439 {
2440 if (generate)
2441 {
2442 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2443 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2444 GEN_INT ((val >> i) & 0xffff)));
2445 }
2446 return 2;
2447 }
2448 }
2449
2450 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2451 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2452 otherwise skip zero bits. */
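  /* For instance, 0xfff0ffffffffabcd has two all-ones halfwords
     (ONE_MATCH == 2), so we start from the MOVN-friendly value
     0xffffffffffffabcd and need just one MOVK, of 0xfff0 into bits
     [63:48], to fix up the remaining halfword.  */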
2453
2454 num_insns = 1;
2455 mask = 0xffff;
2456 val2 = one_match > zero_match ? ~val : val;
2457 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2458
2459 if (generate)
2460 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2461 ? (val | ~(mask << i))
2462 : (val & (mask << i)))));
2463 for (i += 16; i < 64; i += 16)
2464 {
2465 if ((val2 & (mask << i)) == 0)
2466 continue;
2467 if (generate)
2468 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2469 GEN_INT ((val >> i) & 0xffff)));
2470 num_insns ++;
2471 }
2472
2473 return num_insns;
2474 }
2475
2476 /* Return whether imm is a 128-bit immediate which is simple enough to
2477 expand inline. */
2478 bool
2479 aarch64_mov128_immediate (rtx imm)
2480 {
2481 if (GET_CODE (imm) == CONST_INT)
2482 return true;
2483
2484 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2485
2486 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2487 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2488
2489 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2490 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2491 }
2492
2493
2494 /* Return the number of temporary registers that aarch64_add_offset_1
2495 would need to add OFFSET to a register. */
2496
2497 static unsigned int
2498 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2499 {
2500 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2501 }
2502
2503 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2504 a non-polynomial OFFSET. MODE is the mode of the addition.
2505 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2506 be set and CFA adjustments added to the generated instructions.
2507
2508 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2509 temporary if register allocation is already complete. This temporary
2510 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2511 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2512 the immediate again.
2513
2514 Since this function may be used to adjust the stack pointer, we must
2515 ensure that it cannot cause transient stack deallocation (for example
2516 by first incrementing SP and then decrementing when adjusting by a
2517 large immediate). */
2518
2519 static void
2520 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2521 rtx src, HOST_WIDE_INT offset, rtx temp1,
2522 bool frame_related_p, bool emit_move_imm)
2523 {
2524 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2525 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2526
2527 HOST_WIDE_INT moffset = abs_hwi (offset);
2528 rtx_insn *insn;
2529
2530 if (!moffset)
2531 {
2532 if (!rtx_equal_p (dest, src))
2533 {
2534 insn = emit_insn (gen_rtx_SET (dest, src));
2535 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536 }
2537 return;
2538 }
2539
2540 /* Single instruction adjustment. */
2541 if (aarch64_uimm12_shift (moffset))
2542 {
2543 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2544 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2545 return;
2546 }
2547
2548 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2549 and either:
2550
2551 a) the offset cannot be loaded by a 16-bit move or
2552 b) there is no spare register into which we can move it. */
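  /* For example, an offset of 0x123456 is added as
     "add dest, src, #0x456" followed by "add dest, dest, #0x123000";
     both immediates fit the 12-bit (optionally 12-bit-shifted) form.  */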
2553 if (moffset < 0x1000000
2554 && ((!temp1 && !can_create_pseudo_p ())
2555 || !aarch64_move_imm (moffset, mode)))
2556 {
2557 HOST_WIDE_INT low_off = moffset & 0xfff;
2558
2559 low_off = offset < 0 ? -low_off : low_off;
2560 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2561 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2562 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2563 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2564 return;
2565 }
2566
2567 /* Emit a move immediate if required and an addition/subtraction. */
2568 if (emit_move_imm)
2569 {
2570 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2571 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2572 }
2573 insn = emit_insn (offset < 0
2574 ? gen_sub3_insn (dest, src, temp1)
2575 : gen_add3_insn (dest, src, temp1));
2576 if (frame_related_p)
2577 {
2578 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2579 rtx adj = plus_constant (mode, src, offset);
2580 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2581 }
2582 }
2583
2584 /* Return the number of temporary registers that aarch64_add_offset
2585 would need to move OFFSET into a register or add OFFSET to a register;
2586 ADD_P is true if we want the latter rather than the former. */
2587
2588 static unsigned int
2589 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2590 {
2591 /* This follows the same structure as aarch64_add_offset. */
2592 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2593 return 0;
2594
2595 unsigned int count = 0;
2596 HOST_WIDE_INT factor = offset.coeffs[1];
2597 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2598 poly_int64 poly_offset (factor, factor);
2599 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2600 /* Need one register for the ADDVL/ADDPL result. */
2601 count += 1;
2602 else if (factor != 0)
2603 {
2604 factor = abs (factor);
2605 if (factor > 16 * (factor & -factor))
2606 /* Need one register for the CNT result and one for the multiplication
2607 factor. If necessary, the second temporary can be reused for the
2608 constant part of the offset. */
2609 return 2;
2610 /* Need one register for the CNT result (which might then
2611 be shifted). */
2612 count += 1;
2613 }
2614 return count + aarch64_add_offset_1_temporaries (constant);
2615 }
2616
2617 /* If X can be represented as a poly_int64, return the number
2618 of temporaries that are required to add it to a register.
2619 Return -1 otherwise. */
2620
2621 int
2622 aarch64_add_offset_temporaries (rtx x)
2623 {
2624 poly_int64 offset;
2625 if (!poly_int_rtx_p (x, &offset))
2626 return -1;
2627 return aarch64_offset_temporaries (true, offset);
2628 }
2629
2630 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2631 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2632 be set and CFA adjustments added to the generated instructions.
2633
2634 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2635 temporary if register allocation is already complete. This temporary
2636 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2637 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2638 false to avoid emitting the immediate again.
2639
2640 TEMP2, if nonnull, is a second temporary register that doesn't
2641 overlap either DEST or REG.
2642
2643 Since this function may be used to adjust the stack pointer, we must
2644 ensure that it cannot cause transient stack deallocation (for example
2645 by first incrementing SP and then decrementing when adjusting by a
2646 large immediate). */
2647
2648 static void
2649 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2650 poly_int64 offset, rtx temp1, rtx temp2,
2651 bool frame_related_p, bool emit_move_imm = true)
2652 {
2653 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2654 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2655 gcc_assert (temp1 == NULL_RTX
2656 || !frame_related_p
2657 || !reg_overlap_mentioned_p (temp1, dest));
2658 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2659
2660 /* Try using ADDVL or ADDPL to add the whole value. */
2661 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2662 {
2663 rtx offset_rtx = gen_int_mode (offset, mode);
2664 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2665 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2666 return;
2667 }
2668
2669 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2670 SVE vector register, over and above the minimum size of 128 bits.
2671 This is equivalent to half the value returned by CNTD with a
2672 vector shape of ALL. */
2673 HOST_WIDE_INT factor = offset.coeffs[1];
2674 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
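  /* For example, a poly_int64 offset of (19, 3) splits into FACTOR == 3
     (the per-128-bit-block coefficient) and a constant part of 16.  */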
2675
2676 /* Try using ADDVL or ADDPL to add the VG-based part. */
2677 poly_int64 poly_offset (factor, factor);
2678 if (src != const0_rtx
2679 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2680 {
2681 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2682 if (frame_related_p)
2683 {
2684 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2685 RTX_FRAME_RELATED_P (insn) = true;
2686 src = dest;
2687 }
2688 else
2689 {
2690 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2691 src = aarch64_force_temporary (mode, temp1, addr);
2692 temp1 = temp2;
2693 temp2 = NULL_RTX;
2694 }
2695 }
2696 /* Otherwise use a CNT-based sequence. */
2697 else if (factor != 0)
2698 {
2699 /* Use a subtraction if we have a negative factor. */
2700 rtx_code code = PLUS;
2701 if (factor < 0)
2702 {
2703 factor = -factor;
2704 code = MINUS;
2705 }
2706
2707 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2708 into the multiplication. */
2709 rtx val;
2710 int shift = 0;
2711 if (factor & 1)
2712 /* Use a right shift by 1. */
2713 shift = -1;
2714 else
2715 factor /= 2;
2716 HOST_WIDE_INT low_bit = factor & -factor;
2717 if (factor <= 16 * low_bit)
2718 {
2719 if (factor > 16 * 8)
2720 {
2721 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2722 the value with the minimum multiplier and shift it into
2723 position. */
2724 int extra_shift = exact_log2 (low_bit);
2725 shift += extra_shift;
2726 factor >>= extra_shift;
2727 }
2728 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2729 }
2730 else
2731 {
2732 /* Use CNTD, then multiply it by FACTOR. */
2733 val = gen_int_mode (poly_int64 (2, 2), mode);
2734 val = aarch64_force_temporary (mode, temp1, val);
2735
2736 /* Go back to using a negative multiplication factor if we have
2737 no register from which to subtract. */
2738 if (code == MINUS && src == const0_rtx)
2739 {
2740 factor = -factor;
2741 code = PLUS;
2742 }
2743 rtx coeff1 = gen_int_mode (factor, mode);
2744 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2745 val = gen_rtx_MULT (mode, val, coeff1);
2746 }
2747
2748 if (shift > 0)
2749 {
2750 /* Multiply by 1 << SHIFT. */
2751 val = aarch64_force_temporary (mode, temp1, val);
2752 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2753 }
2754 else if (shift == -1)
2755 {
2756 /* Divide by 2. */
2757 val = aarch64_force_temporary (mode, temp1, val);
2758 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2759 }
2760
2761 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2762 if (src != const0_rtx)
2763 {
2764 val = aarch64_force_temporary (mode, temp1, val);
2765 val = gen_rtx_fmt_ee (code, mode, src, val);
2766 }
2767 else if (code == MINUS)
2768 {
2769 val = aarch64_force_temporary (mode, temp1, val);
2770 val = gen_rtx_NEG (mode, val);
2771 }
2772
2773 if (constant == 0 || frame_related_p)
2774 {
2775 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2776 if (frame_related_p)
2777 {
2778 RTX_FRAME_RELATED_P (insn) = true;
2779 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2780 gen_rtx_SET (dest, plus_constant (Pmode, src,
2781 poly_offset)));
2782 }
2783 src = dest;
2784 if (constant == 0)
2785 return;
2786 }
2787 else
2788 {
2789 src = aarch64_force_temporary (mode, temp1, val);
2790 temp1 = temp2;
2791 temp2 = NULL_RTX;
2792 }
2793
2794 emit_move_imm = true;
2795 }
2796
2797 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2798 frame_related_p, emit_move_imm);
2799 }
2800
2801 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2802 than a poly_int64. */
2803
2804 void
2805 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2806 rtx offset_rtx, rtx temp1, rtx temp2)
2807 {
2808 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2809 temp1, temp2, false);
2810 }
2811
2812 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2813 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2814 if TEMP1 already contains abs (DELTA). */
2815
2816 static inline void
2817 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2818 {
2819 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2820 temp1, temp2, true, emit_move_imm);
2821 }
2822
2823 /* Subtract DELTA from the stack pointer, marking the instructions
2824 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2825 if nonnull. */
2826
2827 static inline void
2828 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2829 {
2830 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2831 temp1, temp2, frame_related_p);
2832 }
2833
2834 /* Set DEST to (vec_series BASE STEP). */
2835
2836 static void
2837 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2838 {
2839 machine_mode mode = GET_MODE (dest);
2840 scalar_mode inner = GET_MODE_INNER (mode);
2841
2842 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2843 if (!aarch64_sve_index_immediate_p (base))
2844 base = force_reg (inner, base);
2845 if (!aarch64_sve_index_immediate_p (step))
2846 step = force_reg (inner, step);
2847
2848 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2849 }
2850
2851 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2852 integer of mode INT_MODE. Return true on success. */
2853
2854 static bool
2855 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2856 rtx src)
2857 {
2858 /* If the constant is smaller than 128 bits, we can do the move
2859 using a vector of SRC_MODEs. */
2860 if (src_mode != TImode)
2861 {
2862 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2863 GET_MODE_SIZE (src_mode));
2864 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2865 emit_move_insn (gen_lowpart (dup_mode, dest),
2866 gen_const_vec_duplicate (dup_mode, src));
2867 return true;
2868 }
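
  /* For example, replicating a 64-bit integer across a VNx16QI destination
     is handled above by viewing the destination as VNx2DI and duplicating
     the DImode value directly.  */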
2869
2870 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2871 src = force_const_mem (src_mode, src);
2872 if (!src)
2873 return false;
2874
2875 /* Make sure that the address is legitimate. */
2876 if (!aarch64_sve_ld1r_operand_p (src))
2877 {
2878 rtx addr = force_reg (Pmode, XEXP (src, 0));
2879 src = replace_equiv_address (src, addr);
2880 }
2881
2882 machine_mode mode = GET_MODE (dest);
2883 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2884 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2885 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2886 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2887 emit_insn (gen_rtx_SET (dest, src));
2888 return true;
2889 }
2890
2891 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2892 isn't a simple duplicate or series. */
2893
2894 static void
2895 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2896 {
2897 machine_mode mode = GET_MODE (src);
2898 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2899 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2900 gcc_assert (npatterns > 1);
2901
2902 if (nelts_per_pattern == 1)
2903 {
2904       /* The constant is a repeating sequence of at least two elements,
2905 where the repeating elements occupy no more than 128 bits.
2906 Get an integer representation of the replicated value. */
2907 scalar_int_mode int_mode;
2908 if (BYTES_BIG_ENDIAN)
2909 /* For now, always use LD1RQ to load the value on big-endian
2910 targets, since the handling of smaller integers includes a
2911 subreg that is semantically an element reverse. */
2912 int_mode = TImode;
2913 else
2914 {
2915 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2916 gcc_assert (int_bits <= 128);
2917 int_mode = int_mode_for_size (int_bits, 0).require ();
2918 }
2919 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2920 if (int_value
2921 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2922 return;
2923 }
2924
2925 /* Expand each pattern individually. */
2926 rtx_vector_builder builder;
2927 auto_vec<rtx, 16> vectors (npatterns);
2928 for (unsigned int i = 0; i < npatterns; ++i)
2929 {
2930 builder.new_vector (mode, 1, nelts_per_pattern);
2931 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2932 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2933 vectors.quick_push (force_reg (mode, builder.build ()));
2934 }
2935
2936 /* Use permutes to interleave the separate vectors. */
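  /* For example, with four patterns P0, P1, P2 and P3 the loop below
     computes ZIP1 (ZIP1 (P0, P2), ZIP1 (P1, P3)), which restores the
     required element order P0[0], P1[0], P2[0], P3[0], P0[1], ...  */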
2937 while (npatterns > 1)
2938 {
2939 npatterns /= 2;
2940 for (unsigned int i = 0; i < npatterns; ++i)
2941 {
2942 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2943 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2944 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2945 vectors[i] = tmp;
2946 }
2947 }
2948 gcc_assert (vectors[0] == dest);
2949 }
2950
2951 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2952 is a pattern that can be used to set DEST to a replicated scalar
2953 element. */
2954
2955 void
2956 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2957 rtx (*gen_vec_duplicate) (rtx, rtx))
2958 {
2959 machine_mode mode = GET_MODE (dest);
2960
2961 /* Check on what type of symbol it is. */
2962 scalar_int_mode int_mode;
2963 if ((GET_CODE (imm) == SYMBOL_REF
2964 || GET_CODE (imm) == LABEL_REF
2965 || GET_CODE (imm) == CONST
2966 || GET_CODE (imm) == CONST_POLY_INT)
2967 && is_a <scalar_int_mode> (mode, &int_mode))
2968 {
2969 rtx mem;
2970 poly_int64 offset;
2971 HOST_WIDE_INT const_offset;
2972 enum aarch64_symbol_type sty;
2973
2974 /* If we have (const (plus symbol offset)), separate out the offset
2975 before we start classifying the symbol. */
2976 rtx base = strip_offset (imm, &offset);
2977
2978 /* We must always add an offset involving VL separately, rather than
2979 folding it into the relocation. */
2980 if (!offset.is_constant (&const_offset))
2981 {
2982 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2983 emit_insn (gen_rtx_SET (dest, imm));
2984 else
2985 {
2986 /* Do arithmetic on 32-bit values if the result is smaller
2987 than that. */
2988 if (partial_subreg_p (int_mode, SImode))
2989 {
2990 /* It is invalid to do symbol calculations in modes
2991 narrower than SImode. */
2992 gcc_assert (base == const0_rtx);
2993 dest = gen_lowpart (SImode, dest);
2994 int_mode = SImode;
2995 }
2996 if (base != const0_rtx)
2997 {
2998 base = aarch64_force_temporary (int_mode, dest, base);
2999 aarch64_add_offset (int_mode, dest, base, offset,
3000 NULL_RTX, NULL_RTX, false);
3001 }
3002 else
3003 aarch64_add_offset (int_mode, dest, base, offset,
3004 dest, NULL_RTX, false);
3005 }
3006 return;
3007 }
3008
3009 sty = aarch64_classify_symbol (base, const_offset);
3010 switch (sty)
3011 {
3012 case SYMBOL_FORCE_TO_MEM:
3013 if (const_offset != 0
3014 && targetm.cannot_force_const_mem (int_mode, imm))
3015 {
3016 gcc_assert (can_create_pseudo_p ());
3017 base = aarch64_force_temporary (int_mode, dest, base);
3018 aarch64_add_offset (int_mode, dest, base, const_offset,
3019 NULL_RTX, NULL_RTX, false);
3020 return;
3021 }
3022
3023 mem = force_const_mem (ptr_mode, imm);
3024 gcc_assert (mem);
3025
3026 /* If we aren't generating PC relative literals, then
3027 we need to expand the literal pool access carefully.
3028 This is something that needs to be done in a number
3029 of places, so could well live as a separate function. */
3030 if (!aarch64_pcrelative_literal_loads)
3031 {
3032 gcc_assert (can_create_pseudo_p ());
3033 base = gen_reg_rtx (ptr_mode);
3034 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3035 if (ptr_mode != Pmode)
3036 base = convert_memory_address (Pmode, base);
3037 mem = gen_rtx_MEM (ptr_mode, base);
3038 }
3039
3040 if (int_mode != ptr_mode)
3041 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3042
3043 emit_insn (gen_rtx_SET (dest, mem));
3044
3045 return;
3046
3047 case SYMBOL_SMALL_TLSGD:
3048 case SYMBOL_SMALL_TLSDESC:
3049 case SYMBOL_SMALL_TLSIE:
3050 case SYMBOL_SMALL_GOT_28K:
3051 case SYMBOL_SMALL_GOT_4G:
3052 case SYMBOL_TINY_GOT:
3053 case SYMBOL_TINY_TLSIE:
3054 if (const_offset != 0)
3055 {
3056 	    gcc_assert (can_create_pseudo_p ());
3057 base = aarch64_force_temporary (int_mode, dest, base);
3058 aarch64_add_offset (int_mode, dest, base, const_offset,
3059 NULL_RTX, NULL_RTX, false);
3060 return;
3061 }
3062 /* FALLTHRU */
3063
3064 case SYMBOL_SMALL_ABSOLUTE:
3065 case SYMBOL_TINY_ABSOLUTE:
3066 case SYMBOL_TLSLE12:
3067 case SYMBOL_TLSLE24:
3068 case SYMBOL_TLSLE32:
3069 case SYMBOL_TLSLE48:
3070 aarch64_load_symref_appropriately (dest, imm, sty);
3071 return;
3072
3073 default:
3074 gcc_unreachable ();
3075 }
3076 }
3077
3078 if (!CONST_INT_P (imm))
3079 {
3080 rtx base, step, value;
3081 if (GET_CODE (imm) == HIGH
3082 || aarch64_simd_valid_immediate (imm, NULL))
3083 emit_insn (gen_rtx_SET (dest, imm));
3084 else if (const_vec_series_p (imm, &base, &step))
3085 aarch64_expand_vec_series (dest, base, step);
3086 else if (const_vec_duplicate_p (imm, &value))
3087 {
3088 /* If the constant is out of range of an SVE vector move,
3089 load it from memory if we can, otherwise move it into
3090 a register and use a DUP. */
3091 scalar_mode inner_mode = GET_MODE_INNER (mode);
3092 rtx op = force_const_mem (inner_mode, value);
3093 if (!op)
3094 op = force_reg (inner_mode, value);
3095 else if (!aarch64_sve_ld1r_operand_p (op))
3096 {
3097 rtx addr = force_reg (Pmode, XEXP (op, 0));
3098 op = replace_equiv_address (op, addr);
3099 }
3100 emit_insn (gen_vec_duplicate (dest, op));
3101 }
3102 else if (GET_CODE (imm) == CONST_VECTOR
3103 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3104 aarch64_expand_sve_const_vector (dest, imm);
3105 else
3106 {
3107 rtx mem = force_const_mem (mode, imm);
3108 gcc_assert (mem);
3109 emit_move_insn (dest, mem);
3110 }
3111
3112 return;
3113 }
3114
3115 aarch64_internal_mov_immediate (dest, imm, true,
3116 as_a <scalar_int_mode> (mode));
3117 }
3118
3119 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3120 that is known to contain PTRUE. */
3121
3122 void
3123 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3124 {
3125 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3126 gen_rtvec (2, pred, src),
3127 UNSPEC_MERGE_PTRUE)));
3128 }
3129
3130 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3131 operand is in memory. In this case we need to use the predicated LD1
3132 and ST1 instead of LDR and STR, both for correctness on big-endian
3133 targets and because LD1 and ST1 support a wider range of addressing modes.
3134 PRED_MODE is the mode of the predicate.
3135
3136 See the comment at the head of aarch64-sve.md for details about the
3137 big-endian handling. */
3138
3139 void
3140 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3141 {
3142 machine_mode mode = GET_MODE (dest);
3143 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3144 if (!register_operand (src, mode)
3145 && !register_operand (dest, mode))
3146 {
3147 rtx tmp = gen_reg_rtx (mode);
3148 if (MEM_P (src))
3149 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3150 else
3151 emit_move_insn (tmp, src);
3152 src = tmp;
3153 }
3154 aarch64_emit_sve_pred_move (dest, ptrue, src);
3155 }
3156
3157 /* Called only on big-endian targets. See whether an SVE vector move
3158 from SRC to DEST is effectively a REV[BHW] instruction, because at
3159 least one operand is a subreg of an SVE vector that has wider or
3160 narrower elements. Return true and emit the instruction if so.
3161
3162 For example:
3163
3164 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3165
3166 represents a VIEW_CONVERT between the following vectors, viewed
3167 in memory order:
3168
3169 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3170 R1: { [0], [1], [2], [3], ... }
3171
3172 The high part of lane X in R2 should therefore correspond to lane X*2
3173 of R1, but the register representations are:
3174
3175 msb lsb
3176 R2: ...... [1].high [1].low [0].high [0].low
3177 R1: ...... [3] [2] [1] [0]
3178
3179 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3180 We therefore need a reverse operation to swap the high and low values
3181 around.
3182
3183 This is purely an optimization. Without it we would spill the
3184 subreg operand to the stack in one mode and reload it in the
3185 other mode, which has the same effect as the REV. */
3186
3187 bool
3188 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3189 {
3190 gcc_assert (BYTES_BIG_ENDIAN);
3191 if (GET_CODE (dest) == SUBREG)
3192 dest = SUBREG_REG (dest);
3193 if (GET_CODE (src) == SUBREG)
3194 src = SUBREG_REG (src);
3195
3196 /* The optimization handles two single SVE REGs with different element
3197 sizes. */
3198 if (!REG_P (dest)
3199 || !REG_P (src)
3200 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3201 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3202 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3203 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3204 return false;
3205
3206 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3207 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3208 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3209 UNSPEC_REV_SUBREG);
3210 emit_insn (gen_rtx_SET (dest, unspec));
3211 return true;
3212 }
3213
3214 /* Return a copy of X with mode MODE, without changing its other
3215 attributes. Unlike gen_lowpart, this doesn't care whether the
3216 mode change is valid. */
3217
3218 static rtx
3219 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3220 {
3221 if (GET_MODE (x) == mode)
3222 return x;
3223
3224 x = shallow_copy_rtx (x);
3225 set_mode_and_regno (x, mode, REGNO (x));
3226 return x;
3227 }
3228
3229 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3230 operands. */
3231
3232 void
3233 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3234 {
3235 /* Decide which REV operation we need. The mode with narrower elements
3236 determines the mode of the operands and the mode with the wider
3237 elements determines the reverse width. */
3238 machine_mode mode_with_wider_elts = GET_MODE (dest);
3239 machine_mode mode_with_narrower_elts = GET_MODE (src);
3240 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3241 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3242 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3243
3244 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3245 unsigned int unspec;
3246 if (wider_bytes == 8)
3247 unspec = UNSPEC_REV64;
3248 else if (wider_bytes == 4)
3249 unspec = UNSPEC_REV32;
3250 else if (wider_bytes == 2)
3251 unspec = UNSPEC_REV16;
3252 else
3253 gcc_unreachable ();
3254 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3255
3256 /* Emit:
3257
3258 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3259 UNSPEC_MERGE_PTRUE))
3260
3261 with the appropriate modes. */
3262 ptrue = gen_lowpart (pred_mode, ptrue);
3263 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3264 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3265 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3266 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3267 UNSPEC_MERGE_PTRUE);
3268 emit_insn (gen_rtx_SET (dest, src));
3269 }
3270
3271 static bool
3272 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3273 tree exp ATTRIBUTE_UNUSED)
3274 {
3275 /* Currently, always true. */
3276 return true;
3277 }
3278
3279 /* Implement TARGET_PASS_BY_REFERENCE. */
3280
3281 static bool
3282 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3283 machine_mode mode,
3284 const_tree type,
3285 bool named ATTRIBUTE_UNUSED)
3286 {
3287 HOST_WIDE_INT size;
3288 machine_mode dummymode;
3289 int nregs;
3290
3291 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3292 if (mode == BLKmode && type)
3293 size = int_size_in_bytes (type);
3294 else
3295 /* No frontends can create types with variable-sized modes, so we
3296 shouldn't be asked to pass or return them. */
3297 size = GET_MODE_SIZE (mode).to_constant ();
3298
3299 /* Aggregates are passed by reference based on their size. */
3300 if (type && AGGREGATE_TYPE_P (type))
3301 {
3302 size = int_size_in_bytes (type);
3303 }
3304
3305 /* Variable sized arguments are always returned by reference. */
3306 if (size < 0)
3307 return true;
3308
3309 /* Can this be a candidate to be passed in fp/simd register(s)? */
3310 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3311 &dummymode, &nregs,
3312 NULL))
3313 return false;
3314
3315 /* Arguments which are variable sized or larger than 2 registers are
3316      passed by reference unless they are a homogeneous floating-point
3317 aggregate. */
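  /* For example, a plain 24-byte structure is passed by reference, whereas
     a 24-byte aggregate of three doubles is caught by the HFA check above
     and so is passed by value (in SIMD/FP registers or on the stack).  */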
3318 return size > 2 * UNITS_PER_WORD;
3319 }
3320
3321 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3322 static bool
3323 aarch64_return_in_msb (const_tree valtype)
3324 {
3325 machine_mode dummy_mode;
3326 int dummy_int;
3327
3328 /* Never happens in little-endian mode. */
3329 if (!BYTES_BIG_ENDIAN)
3330 return false;
3331
3332 /* Only composite types smaller than or equal to 16 bytes can
3333 be potentially returned in registers. */
3334 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3335 || int_size_in_bytes (valtype) <= 0
3336 || int_size_in_bytes (valtype) > 16)
3337 return false;
3338
3339 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3340 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3341 is always passed/returned in the least significant bits of fp/simd
3342 register(s). */
3343 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3344 &dummy_mode, &dummy_int, NULL))
3345 return false;
3346
3347 return true;
3348 }
3349
3350 /* Implement TARGET_FUNCTION_VALUE.
3351 Define how to find the value returned by a function. */
3352
3353 static rtx
3354 aarch64_function_value (const_tree type, const_tree func,
3355 bool outgoing ATTRIBUTE_UNUSED)
3356 {
3357 machine_mode mode;
3358 int unsignedp;
3359 int count;
3360 machine_mode ag_mode;
3361
3362 mode = TYPE_MODE (type);
3363 if (INTEGRAL_TYPE_P (type))
3364 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3365
3366 if (aarch64_return_in_msb (type))
3367 {
3368 HOST_WIDE_INT size = int_size_in_bytes (type);
3369
3370 if (size % UNITS_PER_WORD != 0)
3371 {
3372 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3373 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3374 }
3375 }
3376
3377 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3378 &ag_mode, &count, NULL))
3379 {
3380 if (!aarch64_composite_type_p (type, mode))
3381 {
3382 gcc_assert (count == 1 && mode == ag_mode);
3383 return gen_rtx_REG (mode, V0_REGNUM);
3384 }
3385 else
3386 {
3387 int i;
3388 rtx par;
3389
3390 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3391 for (i = 0; i < count; i++)
3392 {
3393 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3394 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3395 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3396 XVECEXP (par, 0, i) = tmp;
3397 }
3398 return par;
3399 }
3400 }
3401 else
3402 return gen_rtx_REG (mode, R0_REGNUM);
3403 }
3404
3405 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3406 Return true if REGNO is the number of a hard register in which the values
3407 of called function may come back. */
3408
3409 static bool
3410 aarch64_function_value_regno_p (const unsigned int regno)
3411 {
3412   /* At most 16 bytes can be returned in the general registers.  Examples
3413 of 16-byte return values are: 128-bit integers and 16-byte small
3414 structures (excluding homogeneous floating-point aggregates). */
3415 if (regno == R0_REGNUM || regno == R1_REGNUM)
3416 return true;
3417
3418 /* Up to four fp/simd registers can return a function value, e.g. a
3419 homogeneous floating-point aggregate having four members. */
3420 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3421 return TARGET_FLOAT;
3422
3423 return false;
3424 }
3425
3426 /* Implement TARGET_RETURN_IN_MEMORY.
3427
3428 If the type T of the result of a function is such that
3429 void func (T arg)
3430 would require that arg be passed as a value in a register (or set of
3431 registers) according to the parameter passing rules, then the result
3432 is returned in the same registers as would be used for such an
3433 argument. */
3434
3435 static bool
3436 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3437 {
3438 HOST_WIDE_INT size;
3439 machine_mode ag_mode;
3440 int count;
3441
3442 if (!AGGREGATE_TYPE_P (type)
3443 && TREE_CODE (type) != COMPLEX_TYPE
3444 && TREE_CODE (type) != VECTOR_TYPE)
3445     /* Simple scalar types are always returned in registers.  */
3446 return false;
3447
3448 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3449 type,
3450 &ag_mode,
3451 &count,
3452 NULL))
3453 return false;
3454
3455   /* Types larger than 2 registers are returned in memory.  */
3456 size = int_size_in_bytes (type);
3457 return (size < 0 || size > 2 * UNITS_PER_WORD);
3458 }
3459
3460 static bool
3461 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3462 const_tree type, int *nregs)
3463 {
3464 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3465 return aarch64_vfp_is_call_or_return_candidate (mode,
3466 type,
3467 &pcum->aapcs_vfp_rmode,
3468 nregs,
3469 NULL);
3470 }
3471
3472 /* Given MODE and TYPE of a function argument, return the alignment in
3473 bits. The idea is to suppress any stronger alignment requested by
3474 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3475 This is a helper function for local use only. */
3476
3477 static unsigned int
3478 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3479 {
3480 if (!type)
3481 return GET_MODE_ALIGNMENT (mode);
3482
3483 if (integer_zerop (TYPE_SIZE (type)))
3484 return 0;
3485
3486 gcc_assert (TYPE_MODE (type) == mode);
3487
3488 if (!AGGREGATE_TYPE_P (type))
3489 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3490
3491 if (TREE_CODE (type) == ARRAY_TYPE)
3492 return TYPE_ALIGN (TREE_TYPE (type));
3493
3494 unsigned int alignment = 0;
3495 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3496 if (TREE_CODE (field) == FIELD_DECL)
3497 alignment = std::max (alignment, DECL_ALIGN (field));
3498
3499 return alignment;
3500 }
3501
3502 /* Layout a function argument according to the AAPCS64 rules. The rule
3503 numbers refer to the rule numbers in the AAPCS64. */
3504
3505 static void
3506 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3507 const_tree type,
3508 bool named ATTRIBUTE_UNUSED)
3509 {
3510 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3511 int ncrn, nvrn, nregs;
3512 bool allocate_ncrn, allocate_nvrn;
3513 HOST_WIDE_INT size;
3514
3515 /* We need to do this once per argument. */
3516 if (pcum->aapcs_arg_processed)
3517 return;
3518
3519 pcum->aapcs_arg_processed = true;
3520
3521   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
3522 if (type)
3523 size = int_size_in_bytes (type);
3524 else
3525 /* No frontends can create types with variable-sized modes, so we
3526 shouldn't be asked to pass or return them. */
3527 size = GET_MODE_SIZE (mode).to_constant ();
3528 size = ROUND_UP (size, UNITS_PER_WORD);
3529
3530 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3531 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3532 mode,
3533 type,
3534 &nregs);
3535
3536   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3537 The following code thus handles passing by SIMD/FP registers first. */
3538
3539 nvrn = pcum->aapcs_nvrn;
3540
3541   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3542      and homogeneous short-vector aggregates (HVA).  */
3543 if (allocate_nvrn)
3544 {
3545 if (!TARGET_FLOAT)
3546 aarch64_err_no_fpadvsimd (mode);
3547
3548 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3549 {
3550 pcum->aapcs_nextnvrn = nvrn + nregs;
3551 if (!aarch64_composite_type_p (type, mode))
3552 {
3553 gcc_assert (nregs == 1);
3554 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3555 }
3556 else
3557 {
3558 rtx par;
3559 int i;
3560 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3561 for (i = 0; i < nregs; i++)
3562 {
3563 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3564 V0_REGNUM + nvrn + i);
3565 rtx offset = gen_int_mode
3566 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3567 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3568 XVECEXP (par, 0, i) = tmp;
3569 }
3570 pcum->aapcs_reg = par;
3571 }
3572 return;
3573 }
3574 else
3575 {
3576 /* C.3 NSRN is set to 8. */
3577 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3578 goto on_stack;
3579 }
3580 }
3581
3582 ncrn = pcum->aapcs_ncrn;
3583 nregs = size / UNITS_PER_WORD;
3584
3585   /* C6 - C9, though the sign and zero extension semantics are
3586      handled elsewhere.  This is the case where the argument fits
3587      entirely in general registers.  */
3588 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3589 {
3590
3591 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3592
3593       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3594 	 rounded up to the next even number.  */
3595 if (nregs == 2
3596 && ncrn % 2
3597 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3598 comparison is there because for > 16 * BITS_PER_UNIT
3599 alignment nregs should be > 2 and therefore it should be
3600 	     passed by reference rather than by value.  */
3601 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3602 {
3603 ++ncrn;
3604 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3605 }
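
      /* For example, a 16-byte, 16-byte-aligned argument such as __int128
	 that would otherwise start at an odd-numbered register is moved up
	 to the next even register pair, as required by rule C.8.  */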
3606
3607 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3608 A reg is still generated for it, but the caller should be smart
3609 enough not to use it. */
3610 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3611 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3612 else
3613 {
3614 rtx par;
3615 int i;
3616
3617 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3618 for (i = 0; i < nregs; i++)
3619 {
3620 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3621 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3622 GEN_INT (i * UNITS_PER_WORD));
3623 XVECEXP (par, 0, i) = tmp;
3624 }
3625 pcum->aapcs_reg = par;
3626 }
3627
3628 pcum->aapcs_nextncrn = ncrn + nregs;
3629 return;
3630 }
3631
3632 /* C.11 */
3633 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3634
3635   /* The argument is passed on the stack; record the needed number of words for
3636 this argument and align the total size if necessary. */
3637 on_stack:
3638 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3639
3640 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3641 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3642 16 / UNITS_PER_WORD);
3643 return;
3644 }
3645
3646 /* Implement TARGET_FUNCTION_ARG. */
3647
3648 static rtx
3649 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3650 const_tree type, bool named)
3651 {
3652 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3653 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3654
3655 if (mode == VOIDmode)
3656 return NULL_RTX;
3657
3658 aarch64_layout_arg (pcum_v, mode, type, named);
3659 return pcum->aapcs_reg;
3660 }
3661
3662 void
3663 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3664 const_tree fntype ATTRIBUTE_UNUSED,
3665 rtx libname ATTRIBUTE_UNUSED,
3666 const_tree fndecl ATTRIBUTE_UNUSED,
3667 unsigned n_named ATTRIBUTE_UNUSED)
3668 {
3669 pcum->aapcs_ncrn = 0;
3670 pcum->aapcs_nvrn = 0;
3671 pcum->aapcs_nextncrn = 0;
3672 pcum->aapcs_nextnvrn = 0;
3673 pcum->pcs_variant = ARM_PCS_AAPCS64;
3674 pcum->aapcs_reg = NULL_RTX;
3675 pcum->aapcs_arg_processed = false;
3676 pcum->aapcs_stack_words = 0;
3677 pcum->aapcs_stack_size = 0;
3678
3679 if (!TARGET_FLOAT
3680 && fndecl && TREE_PUBLIC (fndecl)
3681 && fntype && fntype != error_mark_node)
3682 {
3683 const_tree type = TREE_TYPE (fntype);
3684 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3685 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3686 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3687 &mode, &nregs, NULL))
3688 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3689 }
3690 return;
3691 }
3692
3693 static void
3694 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3695 machine_mode mode,
3696 const_tree type,
3697 bool named)
3698 {
3699 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3700 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3701 {
3702 aarch64_layout_arg (pcum_v, mode, type, named);
3703 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3704 != (pcum->aapcs_stack_words != 0));
3705 pcum->aapcs_arg_processed = false;
3706 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3707 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3708 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3709 pcum->aapcs_stack_words = 0;
3710 pcum->aapcs_reg = NULL_RTX;
3711 }
3712 }
3713
3714 bool
3715 aarch64_function_arg_regno_p (unsigned regno)
3716 {
3717 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3718 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3719 }
3720
3721 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3722 PARM_BOUNDARY bits of alignment, but will be given anything up
3723 to STACK_BOUNDARY bits if the type requires it. This makes sure
3724 that both before and after the layout of each argument, the Next
3725 Stacked Argument Address (NSAA) will have a minimum alignment of
3726 8 bytes. */
3727
3728 static unsigned int
3729 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3730 {
3731 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3732 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3733 }
3734
3735 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3736
3737 static fixed_size_mode
3738 aarch64_get_reg_raw_mode (int regno)
3739 {
3740 if (TARGET_SVE && FP_REGNUM_P (regno))
3741 /* Don't use the SVE part of the register for __builtin_apply and
3742 __builtin_return. The SVE registers aren't used by the normal PCS,
3743 so using them there would be a waste of time. The PCS extensions
3744 for SVE types are fundamentally incompatible with the
3745 __builtin_return/__builtin_apply interface. */
3746 return as_a <fixed_size_mode> (V16QImode);
3747 return default_get_reg_raw_mode (regno);
3748 }
3749
3750 /* Implement TARGET_FUNCTION_ARG_PADDING.
3751
3752 Small aggregate types are placed in the lowest memory address.
3753
3754 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3755
3756 static pad_direction
3757 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3758 {
3759 /* On little-endian targets, the least significant byte of every stack
3760 argument is passed at the lowest byte address of the stack slot. */
3761 if (!BYTES_BIG_ENDIAN)
3762 return PAD_UPWARD;
3763
3764 /* Otherwise, integral, floating-point and pointer types are padded downward:
3765 the least significant byte of a stack argument is passed at the highest
3766 byte address of the stack slot. */
3767 if (type
3768 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3769 || POINTER_TYPE_P (type))
3770 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3771 return PAD_DOWNWARD;
3772
3773 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3774 return PAD_UPWARD;
3775 }
3776
3777 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3778
3779    It specifies the padding for the last (and possibly the only)
3780    element of a block move between registers and memory.  Assuming
3781    the block is in memory, padding upward means that the last
3782    element is padded after its most significant byte, while with
3783    downward padding the last element is padded at its least
3784    significant byte side.
3785
3786 Small aggregates and small complex types are always padded
3787 upwards.
3788
3789 We don't need to worry about homogeneous floating-point or
3790 short-vector aggregates; their move is not affected by the
3791 padding direction determined here. Regardless of endianness,
3792 each element of such an aggregate is put in the least
3793 significant bits of a fp/simd register.
3794
3795 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3796 register has useful data, and return the opposite if the most
3797 significant byte does. */
3798
3799 bool
3800 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3801 bool first ATTRIBUTE_UNUSED)
3802 {
3803
3804 /* Small composite types are always padded upward. */
3805 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3806 {
3807 HOST_WIDE_INT size;
3808 if (type)
3809 size = int_size_in_bytes (type);
3810 else
3811 /* No frontends can create types with variable-sized modes, so we
3812 shouldn't be asked to pass or return them. */
3813 size = GET_MODE_SIZE (mode).to_constant ();
3814 if (size < 2 * UNITS_PER_WORD)
3815 return true;
3816 }
3817
3818 /* Otherwise, use the default padding. */
3819 return !BYTES_BIG_ENDIAN;
3820 }
3821
3822 static scalar_int_mode
3823 aarch64_libgcc_cmp_return_mode (void)
3824 {
3825 return SImode;
3826 }
3827
3828 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3829
3830 /* We use the 12-bit shifted immediate arithmetic instructions so values
3831    must be a multiple of (1 << 12), i.e. 4096.  */
3832 #define ARITH_FACTOR 4096
3833
3834 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3835 #error Cannot use simple address calculation for stack probing
3836 #endif
3837
3838 /* The pair of scratch registers used for stack probing. */
3839 #define PROBE_STACK_FIRST_REG 9
3840 #define PROBE_STACK_SECOND_REG 10
3841
3842 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3843 inclusive. These are offsets from the current stack pointer. */
3844
3845 static void
3846 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3847 {
3848 HOST_WIDE_INT size;
3849 if (!poly_size.is_constant (&size))
3850 {
3851 sorry ("stack probes for SVE frames");
3852 return;
3853 }
3854
3855 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3856
3857 /* See the same assertion on PROBE_INTERVAL above. */
3858 gcc_assert ((first % ARITH_FACTOR) == 0);
3859
3860 /* See if we have a constant small number of probes to generate. If so,
3861 that's the easy case. */
3862 if (size <= PROBE_INTERVAL)
3863 {
3864 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3865
3866 emit_set_insn (reg1,
3867 plus_constant (Pmode,
3868 stack_pointer_rtx, -(first + base)));
3869 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3870 }
3871
3872   /* The run-time loop is made up of 8 insns in the generic case while the
3873      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
3874 else if (size <= 4 * PROBE_INTERVAL)
3875 {
3876 HOST_WIDE_INT i, rem;
3877
3878 emit_set_insn (reg1,
3879 plus_constant (Pmode,
3880 stack_pointer_rtx,
3881 -(first + PROBE_INTERVAL)));
3882 emit_stack_probe (reg1);
3883
3884 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3885 it exceeds SIZE. If only two probes are needed, this will not
3886 generate any code. Then probe at FIRST + SIZE. */
3887 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3888 {
3889 emit_set_insn (reg1,
3890 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3891 emit_stack_probe (reg1);
3892 }
3893
3894 rem = size - (i - PROBE_INTERVAL);
3895 if (rem > 256)
3896 {
3897 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3898
3899 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3900 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3901 }
3902 else
3903 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3904 }
3905
3906 /* Otherwise, do the same as above, but in a loop. Note that we must be
3907 extra careful with variables wrapping around because we might be at
3908 the very top (or the very bottom) of the address space and we have
3909 to be able to handle this case properly; in particular, we use an
3910 equality test for the loop condition. */
3911 else
3912 {
3913 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3914
3915 /* Step 1: round SIZE to the previous multiple of the interval. */
3916
3917 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3918
3919
3920 /* Step 2: compute initial and final value of the loop counter. */
3921
3922 /* TEST_ADDR = SP + FIRST. */
3923 emit_set_insn (reg1,
3924 plus_constant (Pmode, stack_pointer_rtx, -first));
3925
3926 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3927 HOST_WIDE_INT adjustment = - (first + rounded_size);
3928 if (! aarch64_uimm12_shift (adjustment))
3929 {
3930 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3931 true, Pmode);
3932 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3933 }
3934 else
3935 emit_set_insn (reg2,
3936 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3937
3938 /* Step 3: the loop
3939
3940 do
3941 {
3942 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3943 probe at TEST_ADDR
3944 }
3945 while (TEST_ADDR != LAST_ADDR)
3946
3947 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3948 until it is equal to ROUNDED_SIZE. */
3949
3950 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3951
3952
3953 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3954 that SIZE is equal to ROUNDED_SIZE. */
3955
3956 if (size != rounded_size)
3957 {
3958 HOST_WIDE_INT rem = size - rounded_size;
3959
3960 if (rem > 256)
3961 {
3962 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3963
3964 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3965 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3966 }
3967 else
3968 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3969 }
3970 }
3971
3972 /* Make sure nothing is scheduled before we are done. */
3973 emit_insn (gen_blockage ());
3974 }
3975
3976 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3977 absolute addresses. */
3978
3979 const char *
3980 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3981 {
3982 static int labelno = 0;
3983 char loop_lab[32];
3984 rtx xops[2];
3985
3986 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3987
3988 /* Loop. */
3989 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3990
3991 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3992 xops[0] = reg1;
3993 xops[1] = GEN_INT (PROBE_INTERVAL);
3994 output_asm_insn ("sub\t%0, %0, %1", xops);
3995
3996 /* Probe at TEST_ADDR. */
3997 output_asm_insn ("str\txzr, [%0]", xops);
3998
3999 /* Test if TEST_ADDR == LAST_ADDR. */
4000 xops[1] = reg2;
4001 output_asm_insn ("cmp\t%0, %1", xops);
4002
4003 /* Branch. */
4004 fputs ("\tb.ne\t", asm_out_file);
4005 assemble_name_raw (asm_out_file, loop_lab);
4006 fputc ('\n', asm_out_file);
4007
4008 return "";
4009 }
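/* For reference (illustrative only, not emitted verbatim): with the usual
   PROBE_INTERVAL of 4096 bytes (1 << 12) and the scratch registers x9/x10
   chosen above, the sequence printed by aarch64_output_probe_stack_range
   is roughly:

	.LPSRL0:
		sub	x9, x9, #4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/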
4010
4011 /* Determine whether a frame chain needs to be generated. */
4012 static bool
4013 aarch64_needs_frame_chain (void)
4014 {
4015 /* Force a frame chain for EH returns so the return address is at FP+8. */
4016 if (frame_pointer_needed || crtl->calls_eh_return)
4017 return true;
4018
4019 /* A leaf function cannot have calls or write LR. */
4020 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4021
4022 /* Don't use a frame chain in leaf functions if leaf frame pointers
4023 are disabled. */
4024 if (flag_omit_leaf_frame_pointer && is_leaf)
4025 return false;
4026
4027 return aarch64_use_frame_pointer;
4028 }
4029
4030 /* Mark the registers that need to be saved by the callee and calculate
4031 the size of the callee-saved registers area and frame record (both FP
4032 and LR may be omitted). */
4033 static void
4034 aarch64_layout_frame (void)
4035 {
4036 HOST_WIDE_INT offset = 0;
4037 int regno, last_fp_reg = INVALID_REGNUM;
4038
4039 if (reload_completed && cfun->machine->frame.laid_out)
4040 return;
4041
4042 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4043
4044 #define SLOT_NOT_REQUIRED (-2)
4045 #define SLOT_REQUIRED (-1)
4046
4047 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4048 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4049
4050 /* First mark all the registers that really need to be saved... */
4051 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4052 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4053
4054 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4055 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4056
4057 /* ... that includes the eh data registers (if needed)... */
4058 if (crtl->calls_eh_return)
4059 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4060 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4061 = SLOT_REQUIRED;
4062
4063 /* ... and any callee saved register that dataflow says is live. */
4064 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4065 if (df_regs_ever_live_p (regno)
4066 && (regno == R30_REGNUM
4067 || !call_used_regs[regno]))
4068 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4069
4070 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4071 if (df_regs_ever_live_p (regno)
4072 && !call_used_regs[regno])
4073 {
4074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4075 last_fp_reg = regno;
4076 }
4077
4078 if (cfun->machine->frame.emit_frame_chain)
4079 {
4080 /* FP and LR are placed in the linkage record. */
4081 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4082 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4083 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4084 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4085 offset = 2 * UNITS_PER_WORD;
4086 }
4087
4088 /* Now assign stack slots for them. */
4089 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4090 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4091 {
4092 cfun->machine->frame.reg_offset[regno] = offset;
4093 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4094 cfun->machine->frame.wb_candidate1 = regno;
4095 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4096 cfun->machine->frame.wb_candidate2 = regno;
4097 offset += UNITS_PER_WORD;
4098 }
4099
4100 HOST_WIDE_INT max_int_offset = offset;
4101 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4102 bool has_align_gap = offset != max_int_offset;
4103
4104 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4105 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4106 {
4107 /* If there is an alignment gap between integer and fp callee-saves,
4108 allocate the last fp register to it if possible. */
4109 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4110 {
4111 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4112 break;
4113 }
4114
4115 cfun->machine->frame.reg_offset[regno] = offset;
4116 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4117 cfun->machine->frame.wb_candidate1 = regno;
4118 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4119 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4120 cfun->machine->frame.wb_candidate2 = regno;
4121 offset += UNITS_PER_WORD;
4122 }
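  /* Illustrative example (added commentary): with a frame chain plus one
     integer callee-save (say x19), the integer area ends at offset 24 and
     is padded to 32, leaving an 8-byte gap.  If exactly one FP register
     (say d8) still needs a slot, the code above drops it into that gap at
     offset 24 instead of growing the save area by another 16 bytes.  */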
4123
4124 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4125
4126 cfun->machine->frame.saved_regs_size = offset;
4127
4128 HOST_WIDE_INT varargs_and_saved_regs_size
4129 = offset + cfun->machine->frame.saved_varargs_size;
4130
4131 cfun->machine->frame.hard_fp_offset
4132 = aligned_upper_bound (varargs_and_saved_regs_size
4133 + get_frame_size (),
4134 STACK_BOUNDARY / BITS_PER_UNIT);
4135
4136 /* Both these values are already aligned. */
4137 gcc_assert (multiple_p (crtl->outgoing_args_size,
4138 STACK_BOUNDARY / BITS_PER_UNIT));
4139 cfun->machine->frame.frame_size
4140 = (cfun->machine->frame.hard_fp_offset
4141 + crtl->outgoing_args_size);
4142
4143 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4144
4145 cfun->machine->frame.initial_adjust = 0;
4146 cfun->machine->frame.final_adjust = 0;
4147 cfun->machine->frame.callee_adjust = 0;
4148 cfun->machine->frame.callee_offset = 0;
4149
4150 HOST_WIDE_INT max_push_offset = 0;
4151 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4152 max_push_offset = 512;
4153 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4154 max_push_offset = 256;
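  /* Added note (an assumption about the encodings, not original text):
     512 and 256 roughly correspond to the pre-index writeback ranges of
     an STP of two X/D registers (7-bit signed immediate scaled by 8) and
     of a single STR with writeback (9-bit signed immediate).  */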
4155
4156 HOST_WIDE_INT const_size, const_fp_offset;
4157 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4158 && const_size < max_push_offset
4159 && known_eq (crtl->outgoing_args_size, 0))
4160 {
4161 /* Simple, small frame with no outgoing arguments:
4162 stp reg1, reg2, [sp, -frame_size]!
4163 stp reg3, reg4, [sp, 16] */
4164 cfun->machine->frame.callee_adjust = const_size;
4165 }
4166 else if (known_lt (crtl->outgoing_args_size
4167 + cfun->machine->frame.saved_regs_size, 512)
4168 && !(cfun->calls_alloca
4169 && known_lt (cfun->machine->frame.hard_fp_offset,
4170 max_push_offset)))
4171 {
4172 /* Frame with small outgoing arguments:
4173 sub sp, sp, frame_size
4174 stp reg1, reg2, [sp, outgoing_args_size]
4175 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4176 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4177 cfun->machine->frame.callee_offset
4178 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4179 }
4180 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4181 && const_fp_offset < max_push_offset)
4182 {
4183 /* Frame with large outgoing arguments but a small local area:
4184 stp reg1, reg2, [sp, -hard_fp_offset]!
4185 stp reg3, reg4, [sp, 16]
4186 sub sp, sp, outgoing_args_size */
4187 cfun->machine->frame.callee_adjust = const_fp_offset;
4188 cfun->machine->frame.final_adjust
4189 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4190 }
4191 else
4192 {
4193 /* Frame with large local area and outgoing arguments using frame pointer:
4194 sub sp, sp, hard_fp_offset
4195 stp x29, x30, [sp, 0]
4196 add x29, sp, 0
4197 stp reg3, reg4, [sp, 16]
4198 sub sp, sp, outgoing_args_size */
4199 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4200 cfun->machine->frame.final_adjust
4201 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4202 }
4203
4204 cfun->machine->frame.laid_out = true;
4205 }
4206
4207 /* Return true if the register REGNO is saved on entry to
4208 the current function. */
4209
4210 static bool
4211 aarch64_register_saved_on_entry (int regno)
4212 {
4213 return cfun->machine->frame.reg_offset[regno] >= 0;
4214 }
4215
4216 /* Return the next register, from REGNO up to LIMIT, that the callee
4217    needs to save.  */
4218
4219 static unsigned
4220 aarch64_next_callee_save (unsigned regno, unsigned limit)
4221 {
4222 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4223 regno ++;
4224 return regno;
4225 }
4226
4227 /* Push the register number REGNO of mode MODE to the stack with write-back
4228 adjusting the stack by ADJUSTMENT. */
4229
4230 static void
4231 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4232 HOST_WIDE_INT adjustment)
4233 {
4234 rtx base_rtx = stack_pointer_rtx;
4235 rtx insn, reg, mem;
4236
4237 reg = gen_rtx_REG (mode, regno);
4238 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4239 plus_constant (Pmode, base_rtx, -adjustment));
4240 mem = gen_frame_mem (mode, mem);
4241
4242 insn = emit_move_insn (mem, reg);
4243 RTX_FRAME_RELATED_P (insn) = 1;
4244 }
4245
4246 /* Generate and return an instruction to store the pair of registers
4247 REG and REG2 of mode MODE to location BASE with write-back adjusting
4248 the stack location BASE by ADJUSTMENT. */
4249
4250 static rtx
4251 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4252 HOST_WIDE_INT adjustment)
4253 {
4254 switch (mode)
4255 {
4256 case E_DImode:
4257 return gen_storewb_pairdi_di (base, base, reg, reg2,
4258 GEN_INT (-adjustment),
4259 GEN_INT (UNITS_PER_WORD - adjustment));
4260 case E_DFmode:
4261 return gen_storewb_pairdf_di (base, base, reg, reg2,
4262 GEN_INT (-adjustment),
4263 GEN_INT (UNITS_PER_WORD - adjustment));
4264 default:
4265 gcc_unreachable ();
4266 }
4267 }
4268
4269 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4270 stack pointer by ADJUSTMENT. */
4271
4272 static void
4273 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4274 {
4275 rtx_insn *insn;
4276 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4277
4278 if (regno2 == INVALID_REGNUM)
4279 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4280
4281 rtx reg1 = gen_rtx_REG (mode, regno1);
4282 rtx reg2 = gen_rtx_REG (mode, regno2);
4283
4284 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4285 reg2, adjustment));
4286 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4287 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4288 RTX_FRAME_RELATED_P (insn) = 1;
4289 }
4290
4291 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4292 adjusting it by ADJUSTMENT afterwards. */
4293
4294 static rtx
4295 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4296 HOST_WIDE_INT adjustment)
4297 {
4298 switch (mode)
4299 {
4300 case E_DImode:
4301 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4302 GEN_INT (UNITS_PER_WORD));
4303 case E_DFmode:
4304 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4305 GEN_INT (UNITS_PER_WORD));
4306 default:
4307 gcc_unreachable ();
4308 }
4309 }
4310
4311 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4312 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4313 into CFI_OPS. */
4314
4315 static void
4316 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4317 rtx *cfi_ops)
4318 {
4319 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4320 rtx reg1 = gen_rtx_REG (mode, regno1);
4321
4322 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4323
4324 if (regno2 == INVALID_REGNUM)
4325 {
4326 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4327 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4328 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4329 }
4330 else
4331 {
4332 rtx reg2 = gen_rtx_REG (mode, regno2);
4333 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4334 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4335 reg2, adjustment));
4336 }
4337 }
4338
4339 /* Generate and return a store pair instruction of mode MODE to store
4340 register REG1 to MEM1 and register REG2 to MEM2. */
4341
4342 static rtx
4343 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4344 rtx reg2)
4345 {
4346 switch (mode)
4347 {
4348 case E_DImode:
4349 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4350
4351 case E_DFmode:
4352 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4353
4354 default:
4355 gcc_unreachable ();
4356 }
4357 }
4358
4359 /* Generate and return a load pair instruction of mode MODE to load register
4360 REG1 from MEM1 and register REG2 from MEM2. */
4361
4362 static rtx
4363 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4364 rtx mem2)
4365 {
4366 switch (mode)
4367 {
4368 case E_DImode:
4369 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4370
4371 case E_DFmode:
4372 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4373
4374 default:
4375 gcc_unreachable ();
4376 }
4377 }
4378
4379 /* Return TRUE if return address signing should be enabled for the current
4380 function, otherwise return FALSE. */
4381
4382 bool
4383 aarch64_return_address_signing_enabled (void)
4384 {
4385   /* This function should only be called after the frame is laid out.  */
4386 gcc_assert (cfun->machine->frame.laid_out);
4387
4388   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4389      function if its LR is pushed onto the stack.  */
4390 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4391 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4392 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4393 }
4394
4395 /* Emit code to save the callee-saved registers from register number START
4396 to LIMIT to the stack at the location starting at offset START_OFFSET,
4397 skipping any write-back candidates if SKIP_WB is true. */
4398
4399 static void
4400 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4401 unsigned start, unsigned limit, bool skip_wb)
4402 {
4403 rtx_insn *insn;
4404 unsigned regno;
4405 unsigned regno2;
4406
4407 for (regno = aarch64_next_callee_save (start, limit);
4408 regno <= limit;
4409 regno = aarch64_next_callee_save (regno + 1, limit))
4410 {
4411 rtx reg, mem;
4412 poly_int64 offset;
4413
4414 if (skip_wb
4415 && (regno == cfun->machine->frame.wb_candidate1
4416 || regno == cfun->machine->frame.wb_candidate2))
4417 continue;
4418
4419 if (cfun->machine->reg_is_wrapped_separately[regno])
4420 continue;
4421
4422 reg = gen_rtx_REG (mode, regno);
4423 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4424 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4425 offset));
4426
4427 regno2 = aarch64_next_callee_save (regno + 1, limit);
4428
4429 if (regno2 <= limit
4430 && !cfun->machine->reg_is_wrapped_separately[regno2]
4431 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4432 == cfun->machine->frame.reg_offset[regno2]))
4433
4434 {
4435 rtx reg2 = gen_rtx_REG (mode, regno2);
4436 rtx mem2;
4437
4438 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4439 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4440 offset));
4441 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4442 reg2));
4443
4444 /* The first part of a frame-related parallel insn is
4445 always assumed to be relevant to the frame
4446 	     calculations; subsequent parts are only
4447 frame-related if explicitly marked. */
4448 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4449 regno = regno2;
4450 }
4451 else
4452 insn = emit_move_insn (mem, reg);
4453
4454 RTX_FRAME_RELATED_P (insn) = 1;
4455 }
4456 }
4457
4458 /* Emit code to restore the callee registers of mode MODE from register
4459 number START up to and including LIMIT. Restore from the stack offset
4460 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4461 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4462
4463 static void
4464 aarch64_restore_callee_saves (machine_mode mode,
4465 poly_int64 start_offset, unsigned start,
4466 unsigned limit, bool skip_wb, rtx *cfi_ops)
4467 {
4468 rtx base_rtx = stack_pointer_rtx;
4469 unsigned regno;
4470 unsigned regno2;
4471 poly_int64 offset;
4472
4473 for (regno = aarch64_next_callee_save (start, limit);
4474 regno <= limit;
4475 regno = aarch64_next_callee_save (regno + 1, limit))
4476 {
4477 if (cfun->machine->reg_is_wrapped_separately[regno])
4478 continue;
4479
4480 rtx reg, mem;
4481
4482 if (skip_wb
4483 && (regno == cfun->machine->frame.wb_candidate1
4484 || regno == cfun->machine->frame.wb_candidate2))
4485 continue;
4486
4487 reg = gen_rtx_REG (mode, regno);
4488 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4489 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4490
4491 regno2 = aarch64_next_callee_save (regno + 1, limit);
4492
4493 if (regno2 <= limit
4494 && !cfun->machine->reg_is_wrapped_separately[regno2]
4495 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4496 == cfun->machine->frame.reg_offset[regno2]))
4497 {
4498 rtx reg2 = gen_rtx_REG (mode, regno2);
4499 rtx mem2;
4500
4501 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4502 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4503 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4504
4505 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4506 regno = regno2;
4507 }
4508 else
4509 emit_move_insn (reg, mem);
4510 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4511 }
4512 }
4513
4514 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4515 of MODE. */
4516
4517 static inline bool
4518 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4519 {
4520 HOST_WIDE_INT multiple;
4521 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4522 && IN_RANGE (multiple, -8, 7));
4523 }
4524
4525 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4526 of MODE. */
4527
4528 static inline bool
4529 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4530 {
4531 HOST_WIDE_INT multiple;
4532 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4533 && IN_RANGE (multiple, 0, 63));
4534 }
4535
4536 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4537 of MODE. */
4538
4539 bool
4540 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4541 {
4542 HOST_WIDE_INT multiple;
4543 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4544 && IN_RANGE (multiple, -64, 63));
4545 }
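/* For instance (illustrative): with MODE == DImode the accepted offsets are
   the multiples of 8 in [-512, 504], so 504 (63 * 8) is accepted while 508
   (not a multiple of 8) and 512 (multiple 64, out of range) are not.  */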
4546
4547 /* Return true if OFFSET is a signed 9-bit value. */
4548
4549 static inline bool
4550 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4551 poly_int64 offset)
4552 {
4553 HOST_WIDE_INT const_offset;
4554 return (offset.is_constant (&const_offset)
4555 && IN_RANGE (const_offset, -256, 255));
4556 }
4557
4558 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4559 of MODE. */
4560
4561 static inline bool
4562 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4563 {
4564 HOST_WIDE_INT multiple;
4565 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4566 && IN_RANGE (multiple, -256, 255));
4567 }
4568
4569 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4570 of MODE. */
4571
4572 static inline bool
4573 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4574 {
4575 HOST_WIDE_INT multiple;
4576 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4577 && IN_RANGE (multiple, 0, 4095));
4578 }
4579
4580 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4581
4582 static sbitmap
4583 aarch64_get_separate_components (void)
4584 {
4585 aarch64_layout_frame ();
4586
4587 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4588 bitmap_clear (components);
4589
4590 /* The registers we need saved to the frame. */
4591 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4592 if (aarch64_register_saved_on_entry (regno))
4593 {
4594 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4595 if (!frame_pointer_needed)
4596 offset += cfun->machine->frame.frame_size
4597 - cfun->machine->frame.hard_fp_offset;
4598 /* Check that we can access the stack slot of the register with one
4599 direct load with no adjustments needed. */
4600 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4601 bitmap_set_bit (components, regno);
4602 }
4603
4604 /* Don't mess with the hard frame pointer. */
4605 if (frame_pointer_needed)
4606 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4607
4608 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4609 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4610 /* If aarch64_layout_frame has chosen registers to store/restore with
4611 writeback don't interfere with them to avoid having to output explicit
4612 stack adjustment instructions. */
4613 if (reg2 != INVALID_REGNUM)
4614 bitmap_clear_bit (components, reg2);
4615 if (reg1 != INVALID_REGNUM)
4616 bitmap_clear_bit (components, reg1);
4617
4618 bitmap_clear_bit (components, LR_REGNUM);
4619 bitmap_clear_bit (components, SP_REGNUM);
4620
4621 return components;
4622 }
4623
4624 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4625
4626 static sbitmap
4627 aarch64_components_for_bb (basic_block bb)
4628 {
4629 bitmap in = DF_LIVE_IN (bb);
4630 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4631 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4632
4633 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4634 bitmap_clear (components);
4635
4636 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4637 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4638 if ((!call_used_regs[regno])
4639 && (bitmap_bit_p (in, regno)
4640 || bitmap_bit_p (gen, regno)
4641 || bitmap_bit_p (kill, regno)))
4642 {
4643 unsigned regno2, offset, offset2;
4644 bitmap_set_bit (components, regno);
4645
4646 	/* If there is a callee-save at an adjacent offset, add it as well
4647 	   to increase the use of LDP/STP.  */
4648 offset = cfun->machine->frame.reg_offset[regno];
4649 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
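	/* Illustrative example (added commentary): if x24 was assigned slot
	   offset 16 above, its candidate partner is x25; when x25 sits at
	   offset 24 the two offsets differ only in bit 3, so both registers
	   are marked and can later be handled by a single LDP/STP.  */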
4650
4651 if (regno2 <= LAST_SAVED_REGNUM)
4652 {
4653 offset2 = cfun->machine->frame.reg_offset[regno2];
4654 if ((offset & ~8) == (offset2 & ~8))
4655 bitmap_set_bit (components, regno2);
4656 }
4657 }
4658
4659 return components;
4660 }
4661
4662 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4663 Nothing to do for aarch64. */
4664
4665 static void
4666 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4667 {
4668 }
4669
4670 /* Return the next set bit in BMP from START onwards. Return the total number
4671 of bits in BMP if no set bit is found at or after START. */
4672
4673 static unsigned int
4674 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4675 {
4676 unsigned int nbits = SBITMAP_SIZE (bmp);
4677 if (start == nbits)
4678 return start;
4679
4680 gcc_assert (start < nbits);
4681 for (unsigned int i = start; i < nbits; i++)
4682 if (bitmap_bit_p (bmp, i))
4683 return i;
4684
4685 return nbits;
4686 }
4687
4688 /* Do the work for aarch64_emit_prologue_components and
4689 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4690 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4691 for these components or the epilogue sequence. That is, it determines
4692 whether we should emit stores or loads and what kind of CFA notes to attach
4693 to the insns. Otherwise the logic for the two sequences is very
4694 similar. */
4695
4696 static void
4697 aarch64_process_components (sbitmap components, bool prologue_p)
4698 {
4699 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4700 ? HARD_FRAME_POINTER_REGNUM
4701 : STACK_POINTER_REGNUM);
4702
4703 unsigned last_regno = SBITMAP_SIZE (components);
4704 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4705 rtx_insn *insn = NULL;
4706
4707 while (regno != last_regno)
4708 {
4709 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4710 so DFmode for the vector registers is enough. */
4711 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4712 rtx reg = gen_rtx_REG (mode, regno);
4713 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4714 if (!frame_pointer_needed)
4715 offset += cfun->machine->frame.frame_size
4716 - cfun->machine->frame.hard_fp_offset;
4717 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4718 rtx mem = gen_frame_mem (mode, addr);
4719
4720 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4721 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4722 /* No more registers to handle after REGNO.
4723 Emit a single save/restore and exit. */
4724 if (regno2 == last_regno)
4725 {
4726 insn = emit_insn (set);
4727 RTX_FRAME_RELATED_P (insn) = 1;
4728 if (prologue_p)
4729 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4730 else
4731 add_reg_note (insn, REG_CFA_RESTORE, reg);
4732 break;
4733 }
4734
4735 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4736 /* The next register is not of the same class or its offset is not
4737 mergeable with the current one into a pair. */
4738 if (!satisfies_constraint_Ump (mem)
4739 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4740 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4741 GET_MODE_SIZE (mode)))
4742 {
4743 insn = emit_insn (set);
4744 RTX_FRAME_RELATED_P (insn) = 1;
4745 if (prologue_p)
4746 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4747 else
4748 add_reg_note (insn, REG_CFA_RESTORE, reg);
4749
4750 regno = regno2;
4751 continue;
4752 }
4753
4754 /* REGNO2 can be saved/restored in a pair with REGNO. */
4755 rtx reg2 = gen_rtx_REG (mode, regno2);
4756 if (!frame_pointer_needed)
4757 offset2 += cfun->machine->frame.frame_size
4758 - cfun->machine->frame.hard_fp_offset;
4759 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4760 rtx mem2 = gen_frame_mem (mode, addr2);
4761 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4762 : gen_rtx_SET (reg2, mem2);
4763
4764 if (prologue_p)
4765 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4766 else
4767 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4768
4769 RTX_FRAME_RELATED_P (insn) = 1;
4770 if (prologue_p)
4771 {
4772 add_reg_note (insn, REG_CFA_OFFSET, set);
4773 add_reg_note (insn, REG_CFA_OFFSET, set2);
4774 }
4775 else
4776 {
4777 add_reg_note (insn, REG_CFA_RESTORE, reg);
4778 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4779 }
4780
4781 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4782 }
4783 }
4784
4785 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4786
4787 static void
4788 aarch64_emit_prologue_components (sbitmap components)
4789 {
4790 aarch64_process_components (components, true);
4791 }
4792
4793 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4794
4795 static void
4796 aarch64_emit_epilogue_components (sbitmap components)
4797 {
4798 aarch64_process_components (components, false);
4799 }
4800
4801 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4802
4803 static void
4804 aarch64_set_handled_components (sbitmap components)
4805 {
4806 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4807 if (bitmap_bit_p (components, regno))
4808 cfun->machine->reg_is_wrapped_separately[regno] = true;
4809 }
4810
4811 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4812 is saved at BASE + OFFSET. */
4813
4814 static void
4815 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4816 rtx base, poly_int64 offset)
4817 {
4818 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4819 add_reg_note (insn, REG_CFA_EXPRESSION,
4820 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4821 }
4822
4823 /* AArch64 stack frames generated by this compiler look like:
4824
4825 +-------------------------------+
4826 | |
4827 | incoming stack arguments |
4828 | |
4829 +-------------------------------+
4830 | | <-- incoming stack pointer (aligned)
4831 | callee-allocated save area |
4832 | for register varargs |
4833 | |
4834 +-------------------------------+
4835 | local variables | <-- frame_pointer_rtx
4836 | |
4837 +-------------------------------+
4838 | padding0 | \
4839 +-------------------------------+ |
4840 | callee-saved registers | | frame.saved_regs_size
4841 +-------------------------------+ |
4842 | LR' | |
4843 +-------------------------------+ |
4844 | FP' | / <- hard_frame_pointer_rtx (aligned)
4845 +-------------------------------+
4846 | dynamic allocation |
4847 +-------------------------------+
4848 | padding |
4849 +-------------------------------+
4850 | outgoing stack arguments | <-- arg_pointer
4851 | |
4852 +-------------------------------+
4853 | | <-- stack_pointer_rtx (aligned)
4854
4855 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4856 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4857 unchanged. */
4858
4859 /* Generate the prologue instructions for entry into a function.
4860 Establish the stack frame by decreasing the stack pointer with a
4861 properly calculated size and, if necessary, create a frame record
4862 filled with the values of LR and previous frame pointer. The
4863 current FP is also set up if it is in use. */
4864
4865 void
4866 aarch64_expand_prologue (void)
4867 {
4868 aarch64_layout_frame ();
4869
4870 poly_int64 frame_size = cfun->machine->frame.frame_size;
4871 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4872 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4873 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4874 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4875 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4876 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4877 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4878 rtx_insn *insn;
4879
4880 /* Sign return address for functions. */
4881 if (aarch64_return_address_signing_enabled ())
4882 {
4883 insn = emit_insn (gen_pacisp ());
4884 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4885 RTX_FRAME_RELATED_P (insn) = 1;
4886 }
4887
4888 if (flag_stack_usage_info)
4889 current_function_static_stack_size = constant_lower_bound (frame_size);
4890
4891 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4892 {
4893 if (crtl->is_leaf && !cfun->calls_alloca)
4894 {
4895 if (maybe_gt (frame_size, PROBE_INTERVAL)
4896 && maybe_gt (frame_size, get_stack_check_protect ()))
4897 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4898 (frame_size
4899 - get_stack_check_protect ()));
4900 }
4901 else if (maybe_gt (frame_size, 0))
4902 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4903 }
4904
4905 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4906 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4907
4908 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4909
4910 if (callee_adjust != 0)
4911 aarch64_push_regs (reg1, reg2, callee_adjust);
4912
4913 if (emit_frame_chain)
4914 {
4915 poly_int64 reg_offset = callee_adjust;
4916 if (callee_adjust == 0)
4917 {
4918 reg1 = R29_REGNUM;
4919 reg2 = R30_REGNUM;
4920 reg_offset = callee_offset;
4921 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4922 }
4923 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4924 stack_pointer_rtx, callee_offset,
4925 ip1_rtx, ip0_rtx, frame_pointer_needed);
4926 if (frame_pointer_needed && !frame_size.is_constant ())
4927 {
4928 /* Variable-sized frames need to describe the save slot
4929 address using DW_CFA_expression rather than DW_CFA_offset.
4930 This means that, without taking further action, the
4931 locations of the registers that we've already saved would
4932 remain based on the stack pointer even after we redefine
4933 the CFA based on the frame pointer. We therefore need new
4934 DW_CFA_expressions to re-express the save slots with addresses
4935 based on the frame pointer. */
4936 rtx_insn *insn = get_last_insn ();
4937 gcc_assert (RTX_FRAME_RELATED_P (insn));
4938
4939 /* Add an explicit CFA definition if this was previously
4940 implicit. */
4941 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4942 {
4943 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4944 callee_offset);
4945 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4946 gen_rtx_SET (hard_frame_pointer_rtx, src));
4947 }
4948
4949 /* Change the save slot expressions for the registers that
4950 we've already saved. */
4951 reg_offset -= callee_offset;
4952 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4953 reg_offset + UNITS_PER_WORD);
4954 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4955 reg_offset);
4956 }
4957 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4958 }
4959
4960 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4961 callee_adjust != 0 || emit_frame_chain);
4962 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4963 callee_adjust != 0 || emit_frame_chain);
4964 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4965 }
4966
4967 /* Return TRUE if we can use a simple_return insn.
4968
4969    This function checks whether the callee-saved stack is empty, which
4970    means no restore actions are needed.  The pro_and_epilogue pass uses
4971    this to check whether the shrink-wrapping optimization is feasible.  */
4972
4973 bool
4974 aarch64_use_return_insn_p (void)
4975 {
4976 if (!reload_completed)
4977 return false;
4978
4979 if (crtl->profile)
4980 return false;
4981
4982 aarch64_layout_frame ();
4983
4984 return known_eq (cfun->machine->frame.frame_size, 0);
4985 }
4986
4987 /* Generate the epilogue instructions for returning from a function.
4988 This is almost exactly the reverse of the prolog sequence, except
4989 that we need to insert barriers to avoid scheduling loads that read
4990 from a deallocated stack, and we optimize the unwind records by
4991 emitting them all together if possible. */
4992 void
4993 aarch64_expand_epilogue (bool for_sibcall)
4994 {
4995 aarch64_layout_frame ();
4996
4997 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4998 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4999 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5000 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5001 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5002 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5003 rtx cfi_ops = NULL;
5004 rtx_insn *insn;
5005 /* A stack clash protection prologue may not have left IP0_REGNUM or
5006 IP1_REGNUM in a usable state. The same is true for allocations
5007 with an SVE component, since we then need both temporary registers
5008 for each allocation. */
5009 bool can_inherit_p = (initial_adjust.is_constant ()
5010 && final_adjust.is_constant ()
5011 && !flag_stack_clash_protection);
5012
5013   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
5014 bool need_barrier_p
5015 = maybe_ne (get_frame_size ()
5016 + cfun->machine->frame.saved_varargs_size, 0);
5017
5018 /* Emit a barrier to prevent loads from a deallocated stack. */
5019 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5020 || cfun->calls_alloca
5021 || crtl->calls_eh_return)
5022 {
5023 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5024 need_barrier_p = false;
5025 }
5026
5027 /* Restore the stack pointer from the frame pointer if it may not
5028 be the same as the stack pointer. */
5029 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5030 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5031 if (frame_pointer_needed
5032 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5033 /* If writeback is used when restoring callee-saves, the CFA
5034 is restored on the instruction doing the writeback. */
5035 aarch64_add_offset (Pmode, stack_pointer_rtx,
5036 hard_frame_pointer_rtx, -callee_offset,
5037 ip1_rtx, ip0_rtx, callee_adjust == 0);
5038 else
5039 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5040 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5041
5042 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5043 callee_adjust != 0, &cfi_ops);
5044 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5045 callee_adjust != 0, &cfi_ops);
5046
5047 if (need_barrier_p)
5048 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5049
5050 if (callee_adjust != 0)
5051 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5052
5053 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5054 {
5055 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5056 insn = get_last_insn ();
5057 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5058 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5059 RTX_FRAME_RELATED_P (insn) = 1;
5060 cfi_ops = NULL;
5061 }
5062
5063 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5064 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5065
5066 if (cfi_ops)
5067 {
5068 /* Emit delayed restores and reset the CFA to be SP. */
5069 insn = get_last_insn ();
5070 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5071 REG_NOTES (insn) = cfi_ops;
5072 RTX_FRAME_RELATED_P (insn) = 1;
5073 }
5074
5075   /* We prefer to emit the combined return/authenticate instruction RETAA;
5076      however, there are three cases in which we must instead emit an explicit
5077 authentication instruction.
5078
5079 1) Sibcalls don't return in a normal way, so if we're about to call one
5080 we must authenticate.
5081
5082 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5083 generating code for !TARGET_ARMV8_3 we can't use it and must
5084 explicitly authenticate.
5085
5086 3) On an eh_return path we make extra stack adjustments to update the
5087 canonical frame address to be the exception handler's CFA. We want
5088 to authenticate using the CFA of the function which calls eh_return.
5089 */
5090 if (aarch64_return_address_signing_enabled ()
5091 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5092 {
5093 insn = emit_insn (gen_autisp ());
5094 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5095 RTX_FRAME_RELATED_P (insn) = 1;
5096 }
5097
5098 /* Stack adjustment for exception handler. */
5099 if (crtl->calls_eh_return)
5100 {
5101 /* We need to unwind the stack by the offset computed by
5102 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5103 to be SP; letting the CFA move during this adjustment
5104 is just as correct as retaining the CFA from the body
5105 of the function. Therefore, do nothing special. */
5106 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5107 }
5108
5109 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5110 if (!for_sibcall)
5111 emit_jump_insn (ret_rtx);
5112 }
5113
5114 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5115 normally or return to a previous frame after unwinding.
5116
5117 An EH return uses a single shared return sequence. The epilogue is
5118 exactly like a normal epilogue except that it has an extra input
5119 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5120 that must be applied after the frame has been destroyed. An extra label
5121 is inserted before the epilogue which initializes this register to zero,
5122 and this is the entry point for a normal return.
5123
5124 An actual EH return updates the return address, initializes the stack
5125 adjustment and jumps directly into the epilogue (bypassing the zeroing
5126 of the adjustment). Since the return address is typically saved on the
5127 stack when a function makes a call, the saved LR must be updated outside
5128 the epilogue.
5129
5130 This poses problems as the store is generated well before the epilogue,
5131 so the offset of LR is not known yet. Also optimizations will remove the
5132 store as it appears dead, even after the epilogue is generated (as the
5133 base or offset for loading LR is different in many cases).
5134
5135 To avoid these problems this implementation forces the frame pointer
5136 in eh_return functions so that the location of LR is fixed and known early.
5137 It also marks the store volatile, so no optimization is permitted to
5138 remove the store. */
5139 rtx
5140 aarch64_eh_return_handler_rtx (void)
5141 {
5142 rtx tmp = gen_frame_mem (Pmode,
5143 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5144
5145 /* Mark the store volatile, so no optimization is permitted to remove it. */
5146 MEM_VOLATILE_P (tmp) = true;
5147 return tmp;
5148 }
5149
5150 /* Output code to add DELTA to the first argument, and then jump
5151 to FUNCTION. Used for C++ multiple inheritance. */
5152 static void
5153 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5154 HOST_WIDE_INT delta,
5155 HOST_WIDE_INT vcall_offset,
5156 tree function)
5157 {
5158 /* The this pointer is always in x0. Note that this differs from
5159      Arm, where the this pointer may be bumped to r1 if r0 is required
5160 to return a pointer to an aggregate. On AArch64 a result value
5161 pointer will be in x8. */
5162 int this_regno = R0_REGNUM;
5163 rtx this_rtx, temp0, temp1, addr, funexp;
5164 rtx_insn *insn;
5165
5166 reload_completed = 1;
5167 emit_note (NOTE_INSN_PROLOGUE_END);
5168
5169 this_rtx = gen_rtx_REG (Pmode, this_regno);
5170 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5171 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5172
5173 if (vcall_offset == 0)
5174 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5175 else
5176 {
5177 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5178
5179 addr = this_rtx;
5180 if (delta != 0)
5181 {
5182 if (delta >= -256 && delta < 256)
5183 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5184 plus_constant (Pmode, this_rtx, delta));
5185 else
5186 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5187 temp1, temp0, false);
5188 }
5189
5190 if (Pmode == ptr_mode)
5191 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5192 else
5193 aarch64_emit_move (temp0,
5194 gen_rtx_ZERO_EXTEND (Pmode,
5195 gen_rtx_MEM (ptr_mode, addr)));
5196
5197 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5198 addr = plus_constant (Pmode, temp0, vcall_offset);
5199 else
5200 {
5201 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5202 Pmode);
5203 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5204 }
5205
5206 if (Pmode == ptr_mode)
5207 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5208 else
5209 aarch64_emit_move (temp1,
5210 gen_rtx_SIGN_EXTEND (Pmode,
5211 gen_rtx_MEM (ptr_mode, addr)));
5212
5213 emit_insn (gen_add2_insn (this_rtx, temp1));
5214 }
5215
5216 /* Generate a tail call to the target function. */
5217 if (!TREE_USED (function))
5218 {
5219 assemble_external (function);
5220 TREE_USED (function) = 1;
5221 }
5222 funexp = XEXP (DECL_RTL (function), 0);
5223 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5224 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5225 SIBLING_CALL_P (insn) = 1;
5226
5227 insn = get_insns ();
5228 shorten_branches (insn);
5229 final_start_function (insn, file, 1);
5230 final (insn, file, 1);
5231 final_end_function ();
5232
5233 /* Stop pretending to be a post-reload pass. */
5234 reload_completed = 0;
5235 }
5236
5237 static bool
5238 aarch64_tls_referenced_p (rtx x)
5239 {
5240 if (!TARGET_HAVE_TLS)
5241 return false;
5242 subrtx_iterator::array_type array;
5243 FOR_EACH_SUBRTX (iter, array, x, ALL)
5244 {
5245 const_rtx x = *iter;
5246 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5247 return true;
5248 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5249 TLS offsets, not real symbol references. */
5250 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5251 iter.skip_subrtxes ();
5252 }
5253 return false;
5254 }
5255
5256
5257 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5258 a left shift of 0 or 12 bits. */
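/* For example, 0xabc (shift 0) and 0xabc000 (shift 12) can be encoded,
   whereas 0xabc001 cannot, since its nonzero bits do not fit in a single
   12-bit field at either shift. */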
5259 bool
5260 aarch64_uimm12_shift (HOST_WIDE_INT val)
5261 {
5262 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5263 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5264 );
5265 }
5266
5267
5268 /* Return true if val is an immediate that can be loaded into a
5269 register by a MOVZ instruction. */
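/* For example, 0x12340000 has its only nonzero 16-bit chunk at bit 16 and
   so can be materialized by a single MOVZ, whereas 0x12340001 cannot. */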
5270 static bool
5271 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5272 {
5273 if (GET_MODE_SIZE (mode) > 4)
5274 {
5275 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5276 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5277 return true;
5278 }
5279 else
5280 {
5281 /* Ignore sign extension. */
5282 val &= (HOST_WIDE_INT) 0xffffffff;
5283 }
5284 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5285 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5286 }
5287
5288 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5289 64-bit (DImode) integer. */
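/* For example, a QImode value of 0xab is replicated to
   0xabababababababab. */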
5290
5291 static unsigned HOST_WIDE_INT
5292 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5293 {
5294 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5295 while (size < 64)
5296 {
5297 val &= (HOST_WIDE_INT_1U << size) - 1;
5298 val |= val << size;
5299 size *= 2;
5300 }
5301 return val;
5302 }
5303
5304 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5305
5306 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5307 {
5308 0x0000000100000001ull,
5309 0x0001000100010001ull,
5310 0x0101010101010101ull,
5311 0x1111111111111111ull,
5312 0x5555555555555555ull,
5313 };
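/* With a 32-bit int, __builtin_clz (32) == 26 and __builtin_clz (2) == 30,
   so the index __builtin_clz (bits) - 26 used in aarch64_bitmask_imm below
   maps the widths 32, 16, 8, 4 and 2 to entries 0 through 4 of this table. */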
5314
5315
5316 /* Return true if val is a valid bitmask immediate. */
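/* A bitmask immediate is a run of consecutive ones, rotated and then
   replicated across the register in elements of 2, 4, 8, 16, 32 or 64 bits,
   excluding all-zeros and all-ones.  For example, 0x00ff00ff00ff00ff is a
   valid bitmask immediate, while 0x1234 is not. */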
5317
5318 bool
5319 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5320 {
5321 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5322 int bits;
5323
5324 /* Check for a single sequence of one bits and return quickly if so.
5325 The special cases of all ones and all zeroes return false. */
5326 val = aarch64_replicate_bitmask_imm (val_in, mode);
5327 tmp = val + (val & -val);
5328
5329 if (tmp == (tmp & -tmp))
5330 return (val + 1) > 1;
5331
5332 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5333 if (mode == SImode)
5334 val = (val << 32) | (val & 0xffffffff);
5335
5336 /* Invert if the immediate doesn't start with a zero bit - this means we
5337 only need to search for sequences of one bits. */
5338 if (val & 1)
5339 val = ~val;
5340
5341 /* Find the first set bit and set tmp to val with the first sequence of one
5342 bits removed. Return success if there is a single sequence of ones. */
5343 first_one = val & -val;
5344 tmp = val & (val + first_one);
5345
5346 if (tmp == 0)
5347 return true;
5348
5349 /* Find the next set bit and compute the difference in bit position. */
5350 next_one = tmp & -tmp;
5351 bits = clz_hwi (first_one) - clz_hwi (next_one);
5352 mask = val ^ tmp;
5353
5354 /* Check that the bit position difference is a power of 2, and that the
5355 first sequence of one bits fits within 'bits' bits. */
5356 if ((mask >> bits) != 0 || bits != (bits & -bits))
5357 return false;
5358
5359 /* Check that the sequence of one bits is repeated 64/bits times. */
5360 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5361 }
5362
5363 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5364 Assumed precondition: VAL_IN is not zero. */
5365
5366 unsigned HOST_WIDE_INT
5367 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5368 {
5369 int lowest_bit_set = ctz_hwi (val_in);
5370 int highest_bit_set = floor_log2 (val_in);
5371 gcc_assert (val_in != 0);
5372
5373 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5374 (HOST_WIDE_INT_1U << lowest_bit_set));
5375 }
5376
5377 /* Create a constant in which all bits outside the range from the lowest
5378 set bit to the highest set bit of VAL_IN are set to 1. */
5379
5380 unsigned HOST_WIDE_INT
5381 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5382 {
5383 return val_in | ~aarch64_and_split_imm1 (val_in);
5384 }
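/* Note that aarch64_and_split_imm1 (v) & aarch64_and_split_imm2 (v) == v,
   so an AND with a constant that is not itself a bitmask immediate can be
   performed as two successive ANDs when both halves are valid bitmask
   immediates; aarch64_and_bitmask_imm below tests for that case. */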
5385
5386 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5387
5388 bool
5389 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5390 {
5391 scalar_int_mode int_mode;
5392 if (!is_a <scalar_int_mode> (mode, &int_mode))
5393 return false;
5394
5395 if (aarch64_bitmask_imm (val_in, int_mode))
5396 return false;
5397
5398 if (aarch64_move_imm (val_in, int_mode))
5399 return false;
5400
5401 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5402
5403 return aarch64_bitmask_imm (imm2, int_mode);
5404 }
5405
5406 /* Return true if val is an immediate that can be loaded into a
5407 register in a single instruction. */
5408 bool
5409 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5410 {
5411 scalar_int_mode int_mode;
5412 if (!is_a <scalar_int_mode> (mode, &int_mode))
5413 return false;
5414
5415 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5416 return true;
5417 return aarch64_bitmask_imm (val, int_mode);
5418 }
5419
5420 static bool
5421 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5422 {
5423 rtx base, offset;
5424
5425 if (GET_CODE (x) == HIGH)
5426 return true;
5427
5428 /* There's no way to calculate VL-based values using relocations. */
5429 subrtx_iterator::array_type array;
5430 FOR_EACH_SUBRTX (iter, array, x, ALL)
5431 if (GET_CODE (*iter) == CONST_POLY_INT)
5432 return true;
5433
5434 split_const (x, &base, &offset);
5435 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5436 {
5437 if (aarch64_classify_symbol (base, INTVAL (offset))
5438 != SYMBOL_FORCE_TO_MEM)
5439 return true;
5440 else
5441 /* Avoid generating a 64-bit relocation in ILP32; leave it
5442 to aarch64_expand_mov_immediate to handle properly. */
5443 return mode != ptr_mode;
5444 }
5445
5446 return aarch64_tls_referenced_p (x);
5447 }
5448
5449 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5450 The expansion for a table switch is quite expensive due to the number
5451 of instructions, the table lookup and the hard-to-predict indirect jump.
5452 When optimizing for speed at -O3 or higher, use the per-core tuning if
5453 set; otherwise use tables for > 16 cases as a trade-off between size and
5454 performance. When optimizing for size, use the default setting. */
5455
5456 static unsigned int
5457 aarch64_case_values_threshold (void)
5458 {
5459 /* Use the specified limit for the number of cases before using jump
5460 tables at higher optimization levels. */
5461 if (optimize > 2
5462 && selected_cpu->tune->max_case_values != 0)
5463 return selected_cpu->tune->max_case_values;
5464 else
5465 return optimize_size ? default_case_values_threshold () : 17;
5466 }
5467
5468 /* Return true if register REGNO is a valid index register.
5469 STRICT_P is true if REG_OK_STRICT is in effect. */
5470
5471 bool
5472 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5473 {
5474 if (!HARD_REGISTER_NUM_P (regno))
5475 {
5476 if (!strict_p)
5477 return true;
5478
5479 if (!reg_renumber)
5480 return false;
5481
5482 regno = reg_renumber[regno];
5483 }
5484 return GP_REGNUM_P (regno);
5485 }
5486
5487 /* Return true if register REGNO is a valid base register.
5488 STRICT_P is true if REG_OK_STRICT is in effect. */
5489
5490 bool
5491 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5492 {
5493 if (!HARD_REGISTER_NUM_P (regno))
5494 {
5495 if (!strict_p)
5496 return true;
5497
5498 if (!reg_renumber)
5499 return false;
5500
5501 regno = reg_renumber[regno];
5502 }
5503
5504 /* The fake registers will be eliminated to either the stack or
5505 hard frame pointer, both of which are usually valid base registers.
5506 Reload deals with the cases where the eliminated form isn't valid. */
5507 return (GP_REGNUM_P (regno)
5508 || regno == SP_REGNUM
5509 || regno == FRAME_POINTER_REGNUM
5510 || regno == ARG_POINTER_REGNUM);
5511 }
5512
5513 /* Return true if X is a valid base register.
5514 STRICT_P is true if REG_OK_STRICT is in effect. */
5515
5516 static bool
5517 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5518 {
5519 if (!strict_p
5520 && GET_CODE (x) == SUBREG
5521 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5522 x = SUBREG_REG (x);
5523
5524 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5525 }
5526
5527 /* Return true if the address offset X is a valid index for mode MODE. If so,
5528 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5529
5530 static bool
5531 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5532 machine_mode mode, bool strict_p)
5533 {
5534 enum aarch64_address_type type;
5535 rtx index;
5536 int shift;
5537
5538 /* (reg:P) */
5539 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5540 && GET_MODE (x) == Pmode)
5541 {
5542 type = ADDRESS_REG_REG;
5543 index = x;
5544 shift = 0;
5545 }
5546 /* (sign_extend:DI (reg:SI)) */
5547 else if ((GET_CODE (x) == SIGN_EXTEND
5548 || GET_CODE (x) == ZERO_EXTEND)
5549 && GET_MODE (x) == DImode
5550 && GET_MODE (XEXP (x, 0)) == SImode)
5551 {
5552 type = (GET_CODE (x) == SIGN_EXTEND)
5553 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5554 index = XEXP (x, 0);
5555 shift = 0;
5556 }
5557 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5558 else if (GET_CODE (x) == MULT
5559 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5560 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5561 && GET_MODE (XEXP (x, 0)) == DImode
5562 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5563 && CONST_INT_P (XEXP (x, 1)))
5564 {
5565 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5566 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5567 index = XEXP (XEXP (x, 0), 0);
5568 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5569 }
5570 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5571 else if (GET_CODE (x) == ASHIFT
5572 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5573 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5574 && GET_MODE (XEXP (x, 0)) == DImode
5575 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5576 && CONST_INT_P (XEXP (x, 1)))
5577 {
5578 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5579 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5580 index = XEXP (XEXP (x, 0), 0);
5581 shift = INTVAL (XEXP (x, 1));
5582 }
5583 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5584 else if ((GET_CODE (x) == SIGN_EXTRACT
5585 || GET_CODE (x) == ZERO_EXTRACT)
5586 && GET_MODE (x) == DImode
5587 && GET_CODE (XEXP (x, 0)) == MULT
5588 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5589 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5590 {
5591 type = (GET_CODE (x) == SIGN_EXTRACT)
5592 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5593 index = XEXP (XEXP (x, 0), 0);
5594 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5595 if (INTVAL (XEXP (x, 1)) != 32 + shift
5596 || INTVAL (XEXP (x, 2)) != 0)
5597 shift = -1;
5598 }
5599 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5600 (const_int 0xffffffff<<shift)) */
5601 else if (GET_CODE (x) == AND
5602 && GET_MODE (x) == DImode
5603 && GET_CODE (XEXP (x, 0)) == MULT
5604 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5605 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5606 && CONST_INT_P (XEXP (x, 1)))
5607 {
5608 type = ADDRESS_REG_UXTW;
5609 index = XEXP (XEXP (x, 0), 0);
5610 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5611 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5612 shift = -1;
5613 }
5614 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5615 else if ((GET_CODE (x) == SIGN_EXTRACT
5616 || GET_CODE (x) == ZERO_EXTRACT)
5617 && GET_MODE (x) == DImode
5618 && GET_CODE (XEXP (x, 0)) == ASHIFT
5619 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5620 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5621 {
5622 type = (GET_CODE (x) == SIGN_EXTRACT)
5623 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5624 index = XEXP (XEXP (x, 0), 0);
5625 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5626 if (INTVAL (XEXP (x, 1)) != 32 + shift
5627 || INTVAL (XEXP (x, 2)) != 0)
5628 shift = -1;
5629 }
5630 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5631 (const_int 0xffffffff<<shift)) */
5632 else if (GET_CODE (x) == AND
5633 && GET_MODE (x) == DImode
5634 && GET_CODE (XEXP (x, 0)) == ASHIFT
5635 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5636 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5637 && CONST_INT_P (XEXP (x, 1)))
5638 {
5639 type = ADDRESS_REG_UXTW;
5640 index = XEXP (XEXP (x, 0), 0);
5641 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5642 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5643 shift = -1;
5644 }
5645 /* (mult:P (reg:P) (const_int scale)) */
5646 else if (GET_CODE (x) == MULT
5647 && GET_MODE (x) == Pmode
5648 && GET_MODE (XEXP (x, 0)) == Pmode
5649 && CONST_INT_P (XEXP (x, 1)))
5650 {
5651 type = ADDRESS_REG_REG;
5652 index = XEXP (x, 0);
5653 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5654 }
5655 /* (ashift:P (reg:P) (const_int shift)) */
5656 else if (GET_CODE (x) == ASHIFT
5657 && GET_MODE (x) == Pmode
5658 && GET_MODE (XEXP (x, 0)) == Pmode
5659 && CONST_INT_P (XEXP (x, 1)))
5660 {
5661 type = ADDRESS_REG_REG;
5662 index = XEXP (x, 0);
5663 shift = INTVAL (XEXP (x, 1));
5664 }
5665 else
5666 return false;
5667
5668 if (!strict_p
5669 && GET_CODE (index) == SUBREG
5670 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5671 index = SUBREG_REG (index);
5672
5673 if (aarch64_sve_data_mode_p (mode))
5674 {
5675 if (type != ADDRESS_REG_REG
5676 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5677 return false;
5678 }
5679 else
5680 {
5681 if (shift != 0
5682 && !(IN_RANGE (shift, 1, 3)
5683 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5684 return false;
5685 }
5686
5687 if (REG_P (index)
5688 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5689 {
5690 info->type = type;
5691 info->offset = index;
5692 info->shift = shift;
5693 return true;
5694 }
5695
5696 return false;
5697 }
5698
5699 /* Return true if MODE is one of the modes for which we
5700 support LDP/STP operations. */
5701
5702 static bool
5703 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5704 {
5705 return mode == SImode || mode == DImode
5706 || mode == SFmode || mode == DFmode
5707 || (aarch64_vector_mode_supported_p (mode)
5708 && (known_eq (GET_MODE_SIZE (mode), 8)
5709 || (known_eq (GET_MODE_SIZE (mode), 16)
5710 && (aarch64_tune_params.extra_tuning_flags
5711 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5712 }
5713
5714 /* Return true if REGNO is a virtual pointer register, or an eliminable
5715 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5716 include stack_pointer or hard_frame_pointer. */
5717 static bool
5718 virt_or_elim_regno_p (unsigned regno)
5719 {
5720 return ((regno >= FIRST_VIRTUAL_REGISTER
5721 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5722 || regno == FRAME_POINTER_REGNUM
5723 || regno == ARG_POINTER_REGNUM);
5724 }
5725
5726 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5727 If it is, fill in INFO appropriately. STRICT_P is true if
5728 REG_OK_STRICT is in effect. */
5729
5730 static bool
5731 aarch64_classify_address (struct aarch64_address_info *info,
5732 rtx x, machine_mode mode, bool strict_p,
5733 aarch64_addr_query_type type = ADDR_QUERY_M)
5734 {
5735 enum rtx_code code = GET_CODE (x);
5736 rtx op0, op1;
5737 poly_int64 offset;
5738
5739 HOST_WIDE_INT const_size;
5740
5741 /* On BE, we use load/store pair for all large int mode load/stores.
5742 TI/TFmode may also use a load/store pair. */
5743 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5744 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5745 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5746 || type == ADDR_QUERY_LDP_STP_N
5747 || mode == TImode
5748 || mode == TFmode
5749 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5750
5751 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
5752 to the actual size of the memory being loaded/stored, while the mode used
5753 for the address calculation is half of that size. */
5754 if (type == ADDR_QUERY_LDP_STP_N
5755 && known_eq (GET_MODE_SIZE (mode), 16))
5756 mode = DFmode;
5757
5758 bool allow_reg_index_p = (!load_store_pair_p
5759 && (known_lt (GET_MODE_SIZE (mode), 16)
5760 || vec_flags == VEC_ADVSIMD
5761 || vec_flags == VEC_SVE_DATA));
5762
5763 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5764 [Rn, #offset, MUL VL]. */
5765 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5766 && (code != REG && code != PLUS))
5767 return false;
5768
5769 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5770 REG addressing. */
5771 if (advsimd_struct_p
5772 && !BYTES_BIG_ENDIAN
5773 && (code != POST_INC && code != REG))
5774 return false;
5775
5776 gcc_checking_assert (GET_MODE (x) == VOIDmode
5777 || SCALAR_INT_MODE_P (GET_MODE (x)));
5778
5779 switch (code)
5780 {
5781 case REG:
5782 case SUBREG:
5783 info->type = ADDRESS_REG_IMM;
5784 info->base = x;
5785 info->offset = const0_rtx;
5786 info->const_offset = 0;
5787 return aarch64_base_register_rtx_p (x, strict_p);
5788
5789 case PLUS:
5790 op0 = XEXP (x, 0);
5791 op1 = XEXP (x, 1);
5792
5793 if (! strict_p
5794 && REG_P (op0)
5795 && virt_or_elim_regno_p (REGNO (op0))
5796 && poly_int_rtx_p (op1, &offset))
5797 {
5798 info->type = ADDRESS_REG_IMM;
5799 info->base = op0;
5800 info->offset = op1;
5801 info->const_offset = offset;
5802
5803 return true;
5804 }
5805
5806 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5807 && aarch64_base_register_rtx_p (op0, strict_p)
5808 && poly_int_rtx_p (op1, &offset))
5809 {
5810 info->type = ADDRESS_REG_IMM;
5811 info->base = op0;
5812 info->offset = op1;
5813 info->const_offset = offset;
5814
5815 /* TImode and TFmode values are allowed in both pairs of X
5816 registers and individual Q registers. The available
5817 address modes are:
5818 X,X: 7-bit signed scaled offset
5819 Q: 9-bit signed offset
5820 We conservatively require an offset representable in either mode.
5821 When performing the check for pairs of X registers, i.e. LDP/STP,
5822 pass down DImode since that is the natural size of the LDP/STP
5823 instruction's memory accesses.
5824 if (mode == TImode || mode == TFmode)
5825 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5826 && (offset_9bit_signed_unscaled_p (mode, offset)
5827 || offset_12bit_unsigned_scaled_p (mode, offset)));
5828
5829 /* A 7-bit offset check because OImode will emit an ldp/stp
5830 instruction (only big endian will get here).
5831 For ldp/stp instructions, the offset is scaled for the size of a
5832 single element of the pair. */
5833 if (mode == OImode)
5834 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5835
5836 /* Three 9/12-bit offset checks because CImode will emit three
5837 ldr/str instructions (only big endian will get here). */
5838 if (mode == CImode)
5839 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5840 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5841 || offset_12bit_unsigned_scaled_p (V16QImode,
5842 offset + 32)));
5843
5844 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5845 instructions (only big endian will get here). */
5846 if (mode == XImode)
5847 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5848 && aarch64_offset_7bit_signed_scaled_p (TImode,
5849 offset + 32));
5850
5851 /* Make "m" use the LD1 offset range for SVE data modes, so
5852 that pre-RTL optimizers like ivopts will work to that
5853 instead of the wider LDR/STR range. */
5854 if (vec_flags == VEC_SVE_DATA)
5855 return (type == ADDR_QUERY_M
5856 ? offset_4bit_signed_scaled_p (mode, offset)
5857 : offset_9bit_signed_scaled_p (mode, offset));
5858
5859 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5860 {
5861 poly_int64 end_offset = (offset
5862 + GET_MODE_SIZE (mode)
5863 - BYTES_PER_SVE_VECTOR);
5864 return (type == ADDR_QUERY_M
5865 ? offset_4bit_signed_scaled_p (mode, offset)
5866 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5867 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5868 end_offset)));
5869 }
5870
5871 if (vec_flags == VEC_SVE_PRED)
5872 return offset_9bit_signed_scaled_p (mode, offset);
5873
5874 if (load_store_pair_p)
5875 return ((known_eq (GET_MODE_SIZE (mode), 4)
5876 || known_eq (GET_MODE_SIZE (mode), 8)
5877 || known_eq (GET_MODE_SIZE (mode), 16))
5878 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5879 else
5880 return (offset_9bit_signed_unscaled_p (mode, offset)
5881 || offset_12bit_unsigned_scaled_p (mode, offset));
5882 }
5883
5884 if (allow_reg_index_p)
5885 {
5886 /* Look for base + (scaled/extended) index register. */
5887 if (aarch64_base_register_rtx_p (op0, strict_p)
5888 && aarch64_classify_index (info, op1, mode, strict_p))
5889 {
5890 info->base = op0;
5891 return true;
5892 }
5893 if (aarch64_base_register_rtx_p (op1, strict_p)
5894 && aarch64_classify_index (info, op0, mode, strict_p))
5895 {
5896 info->base = op1;
5897 return true;
5898 }
5899 }
5900
5901 return false;
5902
5903 case POST_INC:
5904 case POST_DEC:
5905 case PRE_INC:
5906 case PRE_DEC:
5907 info->type = ADDRESS_REG_WB;
5908 info->base = XEXP (x, 0);
5909 info->offset = NULL_RTX;
5910 return aarch64_base_register_rtx_p (info->base, strict_p);
5911
5912 case POST_MODIFY:
5913 case PRE_MODIFY:
5914 info->type = ADDRESS_REG_WB;
5915 info->base = XEXP (x, 0);
5916 if (GET_CODE (XEXP (x, 1)) == PLUS
5917 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5918 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5919 && aarch64_base_register_rtx_p (info->base, strict_p))
5920 {
5921 info->offset = XEXP (XEXP (x, 1), 1);
5922 info->const_offset = offset;
5923
5924 /* TImode and TFmode values are allowed in both pairs of X
5925 registers and individual Q registers. The available
5926 address modes are:
5927 X,X: 7-bit signed scaled offset
5928 Q: 9-bit signed offset
5929 We conservatively require an offset representable in either mode.
5930 */
5931 if (mode == TImode || mode == TFmode)
5932 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5933 && offset_9bit_signed_unscaled_p (mode, offset));
5934
5935 if (load_store_pair_p)
5936 return ((known_eq (GET_MODE_SIZE (mode), 4)
5937 || known_eq (GET_MODE_SIZE (mode), 8)
5938 || known_eq (GET_MODE_SIZE (mode), 16))
5939 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5940 else
5941 return offset_9bit_signed_unscaled_p (mode, offset);
5942 }
5943 return false;
5944
5945 case CONST:
5946 case SYMBOL_REF:
5947 case LABEL_REF:
5948 /* load literal: pc-relative constant pool entry. Only supported
5949 for SI mode or larger. */
5950 info->type = ADDRESS_SYMBOLIC;
5951
5952 if (!load_store_pair_p
5953 && GET_MODE_SIZE (mode).is_constant (&const_size)
5954 && const_size >= 4)
5955 {
5956 rtx sym, addend;
5957
5958 split_const (x, &sym, &addend);
5959 return ((GET_CODE (sym) == LABEL_REF
5960 || (GET_CODE (sym) == SYMBOL_REF
5961 && CONSTANT_POOL_ADDRESS_P (sym)
5962 && aarch64_pcrelative_literal_loads)));
5963 }
5964 return false;
5965
5966 case LO_SUM:
5967 info->type = ADDRESS_LO_SUM;
5968 info->base = XEXP (x, 0);
5969 info->offset = XEXP (x, 1);
5970 if (allow_reg_index_p
5971 && aarch64_base_register_rtx_p (info->base, strict_p))
5972 {
5973 rtx sym, offs;
5974 split_const (info->offset, &sym, &offs);
5975 if (GET_CODE (sym) == SYMBOL_REF
5976 && (aarch64_classify_symbol (sym, INTVAL (offs))
5977 == SYMBOL_SMALL_ABSOLUTE))
5978 {
5979 /* The symbol and offset must be aligned to the access size. */
5980 unsigned int align;
5981
5982 if (CONSTANT_POOL_ADDRESS_P (sym))
5983 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5984 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5985 {
5986 tree exp = SYMBOL_REF_DECL (sym);
5987 align = TYPE_ALIGN (TREE_TYPE (exp));
5988 align = aarch64_constant_alignment (exp, align);
5989 }
5990 else if (SYMBOL_REF_DECL (sym))
5991 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5992 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5993 && SYMBOL_REF_BLOCK (sym) != NULL)
5994 align = SYMBOL_REF_BLOCK (sym)->alignment;
5995 else
5996 align = BITS_PER_UNIT;
5997
5998 poly_int64 ref_size = GET_MODE_SIZE (mode);
5999 if (known_eq (ref_size, 0))
6000 ref_size = GET_MODE_SIZE (DImode);
6001
6002 return (multiple_p (INTVAL (offs), ref_size)
6003 && multiple_p (align / BITS_PER_UNIT, ref_size));
6004 }
6005 }
6006 return false;
6007
6008 default:
6009 return false;
6010 }
6011 }
6012
6013 /* Return true if the address X is valid for a PRFM instruction.
6014 STRICT_P is true if we should do strict checking with
6015 aarch64_classify_address. */
6016
6017 bool
6018 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6019 {
6020 struct aarch64_address_info addr;
6021
6022 /* PRFM accepts the same addresses as DImode... */
6023 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6024 if (!res)
6025 return false;
6026
6027 /* ... except writeback forms. */
6028 return addr.type != ADDRESS_REG_WB;
6029 }
6030
6031 bool
6032 aarch64_symbolic_address_p (rtx x)
6033 {
6034 rtx offset;
6035
6036 split_const (x, &x, &offset);
6037 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6038 }
6039
6040 /* Classify the base of symbolic expression X. */
6041
6042 enum aarch64_symbol_type
6043 aarch64_classify_symbolic_expression (rtx x)
6044 {
6045 rtx offset;
6046
6047 split_const (x, &x, &offset);
6048 return aarch64_classify_symbol (x, INTVAL (offset));
6049 }
6050
6051
6052 /* Return TRUE if X is a legitimate address for accessing memory in
6053 mode MODE. */
6054 static bool
6055 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6056 {
6057 struct aarch64_address_info addr;
6058
6059 return aarch64_classify_address (&addr, x, mode, strict_p);
6060 }
6061
6062 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6063 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6064 bool
6065 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6066 aarch64_addr_query_type type)
6067 {
6068 struct aarch64_address_info addr;
6069
6070 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6071 }
6072
6073 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6074
6075 static bool
6076 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6077 poly_int64 orig_offset,
6078 machine_mode mode)
6079 {
6080 HOST_WIDE_INT size;
6081 if (GET_MODE_SIZE (mode).is_constant (&size))
6082 {
6083 HOST_WIDE_INT const_offset, second_offset;
6084
6085 /* A general SVE offset is A * VQ + B. Remove the A component from
6086 coefficient 0 in order to get the constant B. */
6087 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6088
6089 /* Split an out-of-range address displacement into a base and
6090 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6091 range otherwise to increase opportunities for sharing the base
6092 address of different sizes. Unaligned accesses use the signed
6093 9-bit range, TImode/TFmode use the intersection of signed
6094 scaled 7-bit and signed 9-bit offset. */
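/* For example, with MODE == DImode an aligned ORIG_OFFSET of 0x8008
   gives second_offset = 0x8008 & 0x3ffc = 8, so the offset is split
   into an anchor adjustment of 0x8000 plus a load/store offset of 8. */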
6095 if (mode == TImode || mode == TFmode)
6096 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6097 else if ((const_offset & (size - 1)) != 0)
6098 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6099 else
6100 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6101
6102 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6103 return false;
6104
6105 /* Split the offset into second_offset and the rest. */
6106 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6107 *offset2 = gen_int_mode (second_offset, Pmode);
6108 return true;
6109 }
6110 else
6111 {
6112 /* Get the mode we should use as the basis of the range. For structure
6113 modes this is the mode of one vector. */
6114 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6115 machine_mode step_mode
6116 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6117
6118 /* Get the "mul vl" multiplier we'd like to use. */
6119 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6120 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6121 if (vec_flags & VEC_SVE_DATA)
6122 /* LDR supports a 9-bit range, but the move patterns for
6123 structure modes require all vectors to be in range of the
6124 same base. The simplest way of accommodating that while still
6125 promoting reuse of anchor points between different modes is
6126 to use an 8-bit range unconditionally. */
6127 vnum = ((vnum + 128) & 255) - 128;
6128 else
6129 /* Predicates are only handled singly, so we might as well use
6130 the full range. */
6131 vnum = ((vnum + 256) & 511) - 256;
6132 if (vnum == 0)
6133 return false;
6134
6135 /* Convert the "mul vl" multiplier into a byte offset. */
6136 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6137 if (known_eq (second_offset, orig_offset))
6138 return false;
6139
6140 /* Split the offset into second_offset and the rest. */
6141 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6142 *offset2 = gen_int_mode (second_offset, Pmode);
6143 return true;
6144 }
6145 }
6146
6147 /* Return the binary representation of floating point constant VALUE in INTVAL.
6148 If the value cannot be converted, return false without setting INTVAL.
6149 The conversion is done in the mode of VALUE. */
6150 bool
6151 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6152 {
6153
6154 /* We make a general exception for 0. */
6155 if (aarch64_float_const_zero_rtx_p (value))
6156 {
6157 *intval = 0;
6158 return true;
6159 }
6160
6161 scalar_float_mode mode;
6162 if (GET_CODE (value) != CONST_DOUBLE
6163 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6164 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6165 /* Only support up to DF mode. */
6166 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6167 return false;
6168
6169 unsigned HOST_WIDE_INT ival = 0;
6170
6171 long res[2];
6172 real_to_target (res,
6173 CONST_DOUBLE_REAL_VALUE (value),
6174 REAL_MODE_FORMAT (mode));
6175
6176 if (mode == DFmode)
6177 {
6178 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6179 ival = zext_hwi (res[order], 32);
6180 ival |= (zext_hwi (res[1 - order], 32) << 32);
6181 }
6182 else
6183 ival = zext_hwi (res[0], 32);
6184
6185 *intval = ival;
6186 return true;
6187 }
6188
6189 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6190 single MOV(+MOVK) followed by an FMOV. */
6191 bool
6192 aarch64_float_const_rtx_p (rtx x)
6193 {
6194 machine_mode mode = GET_MODE (x);
6195 if (mode == VOIDmode)
6196 return false;
6197
6198 /* Determine whether it's cheaper to write float constants as
6199 mov/movk pairs rather than as ldr/adrp pairs. */
6200 unsigned HOST_WIDE_INT ival;
6201
6202 if (GET_CODE (x) == CONST_DOUBLE
6203 && SCALAR_FLOAT_MODE_P (mode)
6204 && aarch64_reinterpret_float_as_int (x, &ival))
6205 {
6206 scalar_int_mode imode = (mode == HFmode
6207 ? SImode
6208 : int_mode_for_mode (mode).require ());
6209 int num_instr = aarch64_internal_mov_immediate
6210 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6211 return num_instr < 3;
6212 }
6213
6214 return false;
6215 }
6216
6217 /* Return TRUE if rtx X is the immediate constant 0.0. */
6218 bool
6219 aarch64_float_const_zero_rtx_p (rtx x)
6220 {
6221 if (GET_MODE (x) == VOIDmode)
6222 return false;
6223
6224 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6225 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6226 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6227 }
6228
6229 /* Return TRUE if rtx X is an immediate constant that fits in a single
6230 MOVI immediate operation. */
6231 bool
6232 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6233 {
6234 if (!TARGET_SIMD)
6235 return false;
6236
6237 machine_mode vmode;
6238 scalar_int_mode imode;
6239 unsigned HOST_WIDE_INT ival;
6240
6241 if (GET_CODE (x) == CONST_DOUBLE
6242 && SCALAR_FLOAT_MODE_P (mode))
6243 {
6244 if (!aarch64_reinterpret_float_as_int (x, &ival))
6245 return false;
6246
6247 /* We make a general exception for 0. */
6248 if (aarch64_float_const_zero_rtx_p (x))
6249 return true;
6250
6251 imode = int_mode_for_mode (mode).require ();
6252 }
6253 else if (GET_CODE (x) == CONST_INT
6254 && is_a <scalar_int_mode> (mode, &imode))
6255 ival = INTVAL (x);
6256 else
6257 return false;
6258
6259 /* Use a 64-bit container mode for everything except DI/DF mode, where we
6260 use a 128-bit vector mode. */
6261 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6262
6263 vmode = aarch64_simd_container_mode (imode, width);
6264 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6265
6266 return aarch64_simd_valid_immediate (v_op, NULL);
6267 }
6268
6269
6270 /* Return the fixed registers used for condition codes. */
6271
6272 static bool
6273 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6274 {
6275 *p1 = CC_REGNUM;
6276 *p2 = INVALID_REGNUM;
6277 return true;
6278 }
6279
6280 /* This function is used by the call expanders of the machine description.
6281 RESULT is the register in which the result is returned. It's NULL for
6282 "call" and "sibcall".
6283 MEM is the location of the function call.
6284 SIBCALL indicates whether this function call is a normal call or a sibling
6285 call. A different pattern is generated accordingly. */
6286
6287 void
6288 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6289 {
6290 rtx call, callee, tmp;
6291 rtvec vec;
6292 machine_mode mode;
6293
6294 gcc_assert (MEM_P (mem));
6295 callee = XEXP (mem, 0);
6296 mode = GET_MODE (callee);
6297 gcc_assert (mode == Pmode);
6298
6299 /* Decide if we should generate indirect calls by loading the
6300 address of the callee into a register before performing
6301 the branch-and-link. */
6302 if (SYMBOL_REF_P (callee)
6303 ? (aarch64_is_long_call_p (callee)
6304 || aarch64_is_noplt_call_p (callee))
6305 : !REG_P (callee))
6306 XEXP (mem, 0) = force_reg (mode, callee);
6307
6308 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6309
6310 if (result != NULL_RTX)
6311 call = gen_rtx_SET (result, call);
6312
6313 if (sibcall)
6314 tmp = ret_rtx;
6315 else
6316 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6317
6318 vec = gen_rtvec (2, call, tmp);
6319 call = gen_rtx_PARALLEL (VOIDmode, vec);
6320
6321 aarch64_emit_call_insn (call);
6322 }
6323
6324 /* Emit call insn with PAT and do aarch64-specific handling. */
6325
6326 void
6327 aarch64_emit_call_insn (rtx pat)
6328 {
6329 rtx insn = emit_call_insn (pat);
6330
6331 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6332 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6333 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6334 }
6335
6336 machine_mode
6337 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6338 {
6339 /* All floating point compares return CCFP if it is an equality
6340 comparison, and CCFPE otherwise. */
6341 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6342 {
6343 switch (code)
6344 {
6345 case EQ:
6346 case NE:
6347 case UNORDERED:
6348 case ORDERED:
6349 case UNLT:
6350 case UNLE:
6351 case UNGT:
6352 case UNGE:
6353 case UNEQ:
6354 return CCFPmode;
6355
6356 case LT:
6357 case LE:
6358 case GT:
6359 case GE:
6360 case LTGT:
6361 return CCFPEmode;
6362
6363 default:
6364 gcc_unreachable ();
6365 }
6366 }
6367
6368 /* Equality comparisons of short modes against zero can be performed
6369 using the TST instruction with the appropriate bitmask. */
6370 if (y == const0_rtx && REG_P (x)
6371 && (code == EQ || code == NE)
6372 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6373 return CC_NZmode;
6374
6375 /* Similarly, comparisons of zero_extends from shorter modes can
6376 be performed using an ANDS with an immediate mask. */
6377 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6378 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6379 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6380 && (code == EQ || code == NE))
6381 return CC_NZmode;
6382
6383 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6384 && y == const0_rtx
6385 && (code == EQ || code == NE || code == LT || code == GE)
6386 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6387 || GET_CODE (x) == NEG
6388 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6389 && CONST_INT_P (XEXP (x, 2)))))
6390 return CC_NZmode;
6391
6392 /* A compare with a shifted operand. Because of canonicalization,
6393 the comparison will have to be swapped when we emit the assembly
6394 code. */
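/* For example, (compare (ashift x 2) y) is output as "cmp y, x, lsl 2",
   since only the second source operand of CMP can carry a shift, so a GT
   test must be emitted as LT, and so on (see the CC_SWPmode handling in
   aarch64_get_condition_code_1). */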
6395 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6396 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6397 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6398 || GET_CODE (x) == LSHIFTRT
6399 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6400 return CC_SWPmode;
6401
6402 /* Similarly for a negated operand, but we can only do this for
6403 equalities. */
6404 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6405 && (REG_P (y) || GET_CODE (y) == SUBREG)
6406 && (code == EQ || code == NE)
6407 && GET_CODE (x) == NEG)
6408 return CC_Zmode;
6409
6410 /* A test for unsigned overflow. */
6411 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6412 && code == NE
6413 && GET_CODE (x) == PLUS
6414 && GET_CODE (y) == ZERO_EXTEND)
6415 return CC_Cmode;
6416
6417 /* For everything else, return CCmode. */
6418 return CCmode;
6419 }
6420
6421 static int
6422 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6423
6424 int
6425 aarch64_get_condition_code (rtx x)
6426 {
6427 machine_mode mode = GET_MODE (XEXP (x, 0));
6428 enum rtx_code comp_code = GET_CODE (x);
6429
6430 if (GET_MODE_CLASS (mode) != MODE_CC)
6431 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6432 return aarch64_get_condition_code_1 (mode, comp_code);
6433 }
6434
6435 static int
6436 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6437 {
6438 switch (mode)
6439 {
6440 case E_CCFPmode:
6441 case E_CCFPEmode:
6442 switch (comp_code)
6443 {
6444 case GE: return AARCH64_GE;
6445 case GT: return AARCH64_GT;
6446 case LE: return AARCH64_LS;
6447 case LT: return AARCH64_MI;
6448 case NE: return AARCH64_NE;
6449 case EQ: return AARCH64_EQ;
6450 case ORDERED: return AARCH64_VC;
6451 case UNORDERED: return AARCH64_VS;
6452 case UNLT: return AARCH64_LT;
6453 case UNLE: return AARCH64_LE;
6454 case UNGT: return AARCH64_HI;
6455 case UNGE: return AARCH64_PL;
6456 default: return -1;
6457 }
6458 break;
6459
6460 case E_CCmode:
6461 switch (comp_code)
6462 {
6463 case NE: return AARCH64_NE;
6464 case EQ: return AARCH64_EQ;
6465 case GE: return AARCH64_GE;
6466 case GT: return AARCH64_GT;
6467 case LE: return AARCH64_LE;
6468 case LT: return AARCH64_LT;
6469 case GEU: return AARCH64_CS;
6470 case GTU: return AARCH64_HI;
6471 case LEU: return AARCH64_LS;
6472 case LTU: return AARCH64_CC;
6473 default: return -1;
6474 }
6475 break;
6476
6477 case E_CC_SWPmode:
6478 switch (comp_code)
6479 {
6480 case NE: return AARCH64_NE;
6481 case EQ: return AARCH64_EQ;
6482 case GE: return AARCH64_LE;
6483 case GT: return AARCH64_LT;
6484 case LE: return AARCH64_GE;
6485 case LT: return AARCH64_GT;
6486 case GEU: return AARCH64_LS;
6487 case GTU: return AARCH64_CC;
6488 case LEU: return AARCH64_CS;
6489 case LTU: return AARCH64_HI;
6490 default: return -1;
6491 }
6492 break;
6493
6494 case E_CC_NZmode:
6495 switch (comp_code)
6496 {
6497 case NE: return AARCH64_NE;
6498 case EQ: return AARCH64_EQ;
6499 case GE: return AARCH64_PL;
6500 case LT: return AARCH64_MI;
6501 default: return -1;
6502 }
6503 break;
6504
6505 case E_CC_Zmode:
6506 switch (comp_code)
6507 {
6508 case NE: return AARCH64_NE;
6509 case EQ: return AARCH64_EQ;
6510 default: return -1;
6511 }
6512 break;
6513
6514 case E_CC_Cmode:
6515 switch (comp_code)
6516 {
6517 case NE: return AARCH64_CS;
6518 case EQ: return AARCH64_CC;
6519 default: return -1;
6520 }
6521 break;
6522
6523 default:
6524 return -1;
6525 }
6526
6527 return -1;
6528 }
6529
6530 bool
6531 aarch64_const_vec_all_same_in_range_p (rtx x,
6532 HOST_WIDE_INT minval,
6533 HOST_WIDE_INT maxval)
6534 {
6535 rtx elt;
6536 return (const_vec_duplicate_p (x, &elt)
6537 && CONST_INT_P (elt)
6538 && IN_RANGE (INTVAL (elt), minval, maxval));
6539 }
6540
6541 bool
6542 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6543 {
6544 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6545 }
6546
6547 /* Return true if VEC is a constant in which every element is in the range
6548 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6549
6550 static bool
6551 aarch64_const_vec_all_in_range_p (rtx vec,
6552 HOST_WIDE_INT minval,
6553 HOST_WIDE_INT maxval)
6554 {
6555 if (GET_CODE (vec) != CONST_VECTOR
6556 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6557 return false;
6558
6559 int nunits;
6560 if (!CONST_VECTOR_STEPPED_P (vec))
6561 nunits = const_vector_encoded_nelts (vec);
6562 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6563 return false;
6564
6565 for (int i = 0; i < nunits; i++)
6566 {
6567 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6568 if (!CONST_INT_P (vec_elem)
6569 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6570 return false;
6571 }
6572 return true;
6573 }
6574
6575 /* N Z C V. */
6576 #define AARCH64_CC_V 1
6577 #define AARCH64_CC_C (1 << 1)
6578 #define AARCH64_CC_Z (1 << 2)
6579 #define AARCH64_CC_N (1 << 3)
6580
6581 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6582 static const int aarch64_nzcv_codes[] =
6583 {
6584 0, /* EQ, Z == 1. */
6585 AARCH64_CC_Z, /* NE, Z == 0. */
6586 0, /* CS, C == 1. */
6587 AARCH64_CC_C, /* CC, C == 0. */
6588 0, /* MI, N == 1. */
6589 AARCH64_CC_N, /* PL, N == 0. */
6590 0, /* VS, V == 1. */
6591 AARCH64_CC_V, /* VC, V == 0. */
6592 0, /* HI, C == 1 && Z == 0. */
6593 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6594 AARCH64_CC_V, /* GE, N == V. */
6595 0, /* LT, N != V. */
6596 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6597 0, /* LE, !(Z == 0 && N == V). */
6598 0, /* AL, Any. */
6599 0 /* NV, Any. */
6600 };
6601
6602 /* Print floating-point vector immediate operand X to F, negating it
6603 first if NEGATE is true. Return true on success, false if it isn't
6604 a constant we can handle. */
6605
6606 static bool
6607 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6608 {
6609 rtx elt;
6610
6611 if (!const_vec_duplicate_p (x, &elt))
6612 return false;
6613
6614 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6615 if (negate)
6616 r = real_value_negate (&r);
6617
6618 /* We only handle the SVE single-bit immediates here. */
6619 if (real_equal (&r, &dconst0))
6620 asm_fprintf (f, "0.0");
6621 else if (real_equal (&r, &dconst1))
6622 asm_fprintf (f, "1.0");
6623 else if (real_equal (&r, &dconsthalf))
6624 asm_fprintf (f, "0.5");
6625 else
6626 return false;
6627
6628 return true;
6629 }
6630
6631 /* Return the equivalent letter for size. */
6632 static char
6633 sizetochar (int size)
6634 {
6635 switch (size)
6636 {
6637 case 64: return 'd';
6638 case 32: return 's';
6639 case 16: return 'h';
6640 case 8 : return 'b';
6641 default: gcc_unreachable ();
6642 }
6643 }
6644
6645 /* Print operand X to file F in a target specific manner according to CODE.
6646 The acceptable formatting commands given by CODE are:
6647 'c': An integer or symbol address without a preceding #
6648 sign.
6649 'C': Take the duplicated element in a vector constant
6650 and print it in hex.
6651 'D': Take the duplicated element in a vector constant
6652 and print it as an unsigned integer, in decimal.
6653 'e': Print the sign/zero-extend size as a character 8->b,
6654 16->h, 32->w.
6655 'p': Prints N such that 2^N == X (X must be power of 2 and
6656 const int).
6657 'P': Print the number of non-zero bits in X (a const_int).
6658 'H': Print the higher numbered register of a pair (TImode)
6659 of regs.
6660 'm': Print a condition (eq, ne, etc).
6661 'M': Same as 'm', but invert condition.
6662 'N': Take the duplicated element in a vector constant
6663 and print the negative of it in decimal.
6664 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6665 'S/T/U/V': Print a FP/SIMD register name for a register list.
6666 The register printed is the FP/SIMD register name
6667 of X + 0/1/2/3 for S/T/U/V.
6668 'R': Print a scalar FP/SIMD register name + 1.
6669 'X': Print bottom 16 bits of integer constant in hex.
6670 'w/x': Print a general register name or the zero register
6671 (32-bit or 64-bit).
6672 '0': Print a normal operand; if it's a general register,
6673 then we assume DImode.
6674 'k': Print NZCV for conditional compare instructions.
6675 'A': Output address constant representing the first
6676 argument of X, specifying a relocation offset
6677 if appropriate.
6678 'L': Output constant address specified by X
6679 with a relocation offset if appropriate.
6680 'G': Prints address of X, specifying a PC relative
6681 relocation mode if appropriate.
6682 'y': Output address of LDP or STP - this is used for
6683 some LDP/STPs which don't use a PARALLEL in their
6684 pattern (so the mode needs to be adjusted).
6685 'z': Output address of a typical LDP or STP. */
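/* As an illustrative example, an output template along the lines of
   "add\t%x0, %x1, %2" would print operands 0 and 1 as 64-bit general
   register names and operand 2 as a normal operand. */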
6686
6687 static void
6688 aarch64_print_operand (FILE *f, rtx x, int code)
6689 {
6690 rtx elt;
6691 switch (code)
6692 {
6693 case 'c':
6694 switch (GET_CODE (x))
6695 {
6696 case CONST_INT:
6697 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6698 break;
6699
6700 case SYMBOL_REF:
6701 output_addr_const (f, x);
6702 break;
6703
6704 case CONST:
6705 if (GET_CODE (XEXP (x, 0)) == PLUS
6706 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6707 {
6708 output_addr_const (f, x);
6709 break;
6710 }
6711 /* Fall through. */
6712
6713 default:
6714 output_operand_lossage ("unsupported operand for code '%c'", code);
6715 }
6716 break;
6717
6718 case 'e':
6719 {
6720 int n;
6721
6722 if (!CONST_INT_P (x)
6723 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6724 {
6725 output_operand_lossage ("invalid operand for '%%%c'", code);
6726 return;
6727 }
6728
6729 switch (n)
6730 {
6731 case 3:
6732 fputc ('b', f);
6733 break;
6734 case 4:
6735 fputc ('h', f);
6736 break;
6737 case 5:
6738 fputc ('w', f);
6739 break;
6740 default:
6741 output_operand_lossage ("invalid operand for '%%%c'", code);
6742 return;
6743 }
6744 }
6745 break;
6746
6747 case 'p':
6748 {
6749 int n;
6750
6751 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6752 {
6753 output_operand_lossage ("invalid operand for '%%%c'", code);
6754 return;
6755 }
6756
6757 asm_fprintf (f, "%d", n);
6758 }
6759 break;
6760
6761 case 'P':
6762 if (!CONST_INT_P (x))
6763 {
6764 output_operand_lossage ("invalid operand for '%%%c'", code);
6765 return;
6766 }
6767
6768 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6769 break;
6770
6771 case 'H':
6772 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6773 {
6774 output_operand_lossage ("invalid operand for '%%%c'", code);
6775 return;
6776 }
6777
6778 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6779 break;
6780
6781 case 'M':
6782 case 'm':
6783 {
6784 int cond_code;
6785 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6786 if (x == const_true_rtx)
6787 {
6788 if (code == 'M')
6789 fputs ("nv", f);
6790 return;
6791 }
6792
6793 if (!COMPARISON_P (x))
6794 {
6795 output_operand_lossage ("invalid operand for '%%%c'", code);
6796 return;
6797 }
6798
6799 cond_code = aarch64_get_condition_code (x);
6800 gcc_assert (cond_code >= 0);
6801 if (code == 'M')
6802 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6803 fputs (aarch64_condition_codes[cond_code], f);
6804 }
6805 break;
6806
6807 case 'N':
6808 if (!const_vec_duplicate_p (x, &elt))
6809 {
6810 output_operand_lossage ("invalid vector constant");
6811 return;
6812 }
6813
6814 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6815 asm_fprintf (f, "%wd", -INTVAL (elt));
6816 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6817 && aarch64_print_vector_float_operand (f, x, true))
6818 ;
6819 else
6820 {
6821 output_operand_lossage ("invalid vector constant");
6822 return;
6823 }
6824 break;
6825
6826 case 'b':
6827 case 'h':
6828 case 's':
6829 case 'd':
6830 case 'q':
6831 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6832 {
6833 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6834 return;
6835 }
6836 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6837 break;
6838
6839 case 'S':
6840 case 'T':
6841 case 'U':
6842 case 'V':
6843 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6844 {
6845 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6846 return;
6847 }
6848 asm_fprintf (f, "%c%d",
6849 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6850 REGNO (x) - V0_REGNUM + (code - 'S'));
6851 break;
6852
6853 case 'R':
6854 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6855 {
6856 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6857 return;
6858 }
6859 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6860 break;
6861
6862 case 'X':
6863 if (!CONST_INT_P (x))
6864 {
6865 output_operand_lossage ("invalid operand for '%%%c'", code);
6866 return;
6867 }
6868 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6869 break;
6870
6871 case 'C':
6872 {
6873 /* Print a replicated constant in hex. */
6874 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6875 {
6876 output_operand_lossage ("invalid operand for '%%%c'", code);
6877 return;
6878 }
6879 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6880 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6881 }
6882 break;
6883
6884 case 'D':
6885 {
6886 /* Print a replicated constant in decimal, treating it as
6887 unsigned. */
6888 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6889 {
6890 output_operand_lossage ("invalid operand for '%%%c'", code);
6891 return;
6892 }
6893 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6894 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6895 }
6896 break;
6897
6898 case 'w':
6899 case 'x':
6900 if (x == const0_rtx
6901 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6902 {
6903 asm_fprintf (f, "%czr", code);
6904 break;
6905 }
6906
6907 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6908 {
6909 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6910 break;
6911 }
6912
6913 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6914 {
6915 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6916 break;
6917 }
6918
6919 /* Fall through */
6920
6921 case 0:
6922 if (x == NULL)
6923 {
6924 output_operand_lossage ("missing operand");
6925 return;
6926 }
6927
6928 switch (GET_CODE (x))
6929 {
6930 case REG:
6931 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6932 {
6933 if (REG_NREGS (x) == 1)
6934 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6935 else
6936 {
6937 char suffix
6938 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6939 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6940 REGNO (x) - V0_REGNUM, suffix,
6941 END_REGNO (x) - V0_REGNUM - 1, suffix);
6942 }
6943 }
6944 else
6945 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6946 break;
6947
6948 case MEM:
6949 output_address (GET_MODE (x), XEXP (x, 0));
6950 break;
6951
6952 case LABEL_REF:
6953 case SYMBOL_REF:
6954 output_addr_const (asm_out_file, x);
6955 break;
6956
6957 case CONST_INT:
6958 asm_fprintf (f, "%wd", INTVAL (x));
6959 break;
6960
6961 case CONST:
6962 if (!VECTOR_MODE_P (GET_MODE (x)))
6963 {
6964 output_addr_const (asm_out_file, x);
6965 break;
6966 }
6967 /* fall through */
6968
6969 case CONST_VECTOR:
6970 if (!const_vec_duplicate_p (x, &elt))
6971 {
6972 output_operand_lossage ("invalid vector constant");
6973 return;
6974 }
6975
6976 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6977 asm_fprintf (f, "%wd", INTVAL (elt));
6978 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6979 && aarch64_print_vector_float_operand (f, x, false))
6980 ;
6981 else
6982 {
6983 output_operand_lossage ("invalid vector constant");
6984 return;
6985 }
6986 break;
6987
6988 case CONST_DOUBLE:
6989 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6990 be getting CONST_DOUBLEs holding integers. */
6991 gcc_assert (GET_MODE (x) != VOIDmode);
6992 if (aarch64_float_const_zero_rtx_p (x))
6993 {
6994 fputc ('0', f);
6995 break;
6996 }
6997 else if (aarch64_float_const_representable_p (x))
6998 {
6999 #define buf_size 20
7000 char float_buf[buf_size] = {'\0'};
7001 real_to_decimal_for_mode (float_buf,
7002 CONST_DOUBLE_REAL_VALUE (x),
7003 buf_size, buf_size,
7004 1, GET_MODE (x));
7005 asm_fprintf (asm_out_file, "%s", float_buf);
7006 break;
7007 #undef buf_size
7008 }
7009 output_operand_lossage ("invalid constant");
7010 return;
7011 default:
7012 output_operand_lossage ("invalid operand");
7013 return;
7014 }
7015 break;
7016
7017 case 'A':
7018 if (GET_CODE (x) == HIGH)
7019 x = XEXP (x, 0);
7020
7021 switch (aarch64_classify_symbolic_expression (x))
7022 {
7023 case SYMBOL_SMALL_GOT_4G:
7024 asm_fprintf (asm_out_file, ":got:");
7025 break;
7026
7027 case SYMBOL_SMALL_TLSGD:
7028 asm_fprintf (asm_out_file, ":tlsgd:");
7029 break;
7030
7031 case SYMBOL_SMALL_TLSDESC:
7032 asm_fprintf (asm_out_file, ":tlsdesc:");
7033 break;
7034
7035 case SYMBOL_SMALL_TLSIE:
7036 asm_fprintf (asm_out_file, ":gottprel:");
7037 break;
7038
7039 case SYMBOL_TLSLE24:
7040 asm_fprintf (asm_out_file, ":tprel:");
7041 break;
7042
7043 case SYMBOL_TINY_GOT:
7044 gcc_unreachable ();
7045 break;
7046
7047 default:
7048 break;
7049 }
7050 output_addr_const (asm_out_file, x);
7051 break;
7052
7053 case 'L':
7054 switch (aarch64_classify_symbolic_expression (x))
7055 {
7056 case SYMBOL_SMALL_GOT_4G:
7057 asm_fprintf (asm_out_file, ":lo12:");
7058 break;
7059
7060 case SYMBOL_SMALL_TLSGD:
7061 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7062 break;
7063
7064 case SYMBOL_SMALL_TLSDESC:
7065 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7066 break;
7067
7068 case SYMBOL_SMALL_TLSIE:
7069 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7070 break;
7071
7072 case SYMBOL_TLSLE12:
7073 asm_fprintf (asm_out_file, ":tprel_lo12:");
7074 break;
7075
7076 case SYMBOL_TLSLE24:
7077 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7078 break;
7079
7080 case SYMBOL_TINY_GOT:
7081 asm_fprintf (asm_out_file, ":got:");
7082 break;
7083
7084 case SYMBOL_TINY_TLSIE:
7085 asm_fprintf (asm_out_file, ":gottprel:");
7086 break;
7087
7088 default:
7089 break;
7090 }
7091 output_addr_const (asm_out_file, x);
7092 break;
7093
7094 case 'G':
7095 switch (aarch64_classify_symbolic_expression (x))
7096 {
7097 case SYMBOL_TLSLE24:
7098 asm_fprintf (asm_out_file, ":tprel_hi12:");
7099 break;
7100 default:
7101 break;
7102 }
7103 output_addr_const (asm_out_file, x);
7104 break;
7105
7106 case 'k':
7107 {
7108 HOST_WIDE_INT cond_code;
7109
7110 if (!CONST_INT_P (x))
7111 {
7112 output_operand_lossage ("invalid operand for '%%%c'", code);
7113 return;
7114 }
7115
7116 cond_code = INTVAL (x);
7117 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7118 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7119 }
7120 break;
7121
7122 case 'y':
7123 case 'z':
7124 {
7125 machine_mode mode = GET_MODE (x);
7126
7127 if (GET_CODE (x) != MEM
7128 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7129 {
7130 output_operand_lossage ("invalid operand for '%%%c'", code);
7131 return;
7132 }
7133
7134 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7135 code == 'y'
7136 ? ADDR_QUERY_LDP_STP_N
7137 : ADDR_QUERY_LDP_STP))
7138 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7139 }
7140 break;
7141
7142 default:
7143 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7144 return;
7145 }
7146 }
7147
7148 /* Print address 'x' of a memory access with mode 'mode'.
7149 TYPE is the aarch64_addr_query_type context required by
7150 aarch64_classify_address, e.g. ADDR_QUERY_M for a normal memory access
or ADDR_QUERY_LDP_STP for an LDP/STP address. */
7151 static bool
7152 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7153 aarch64_addr_query_type type)
7154 {
7155 struct aarch64_address_info addr;
7156 unsigned int size;
7157
7158 /* Check all addresses are Pmode - including ILP32. */
7159 if (GET_MODE (x) != Pmode)
7160 output_operand_lossage ("invalid address mode");
7161
7162 if (aarch64_classify_address (&addr, x, mode, true, type))
7163 switch (addr.type)
7164 {
7165 case ADDRESS_REG_IMM:
7166 if (known_eq (addr.const_offset, 0))
7167 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7168 else if (aarch64_sve_data_mode_p (mode))
7169 {
7170 HOST_WIDE_INT vnum
7171 = exact_div (addr.const_offset,
7172 BYTES_PER_SVE_VECTOR).to_constant ();
7173 asm_fprintf (f, "[%s, #%wd, mul vl]",
7174 reg_names[REGNO (addr.base)], vnum);
7175 }
7176 else if (aarch64_sve_pred_mode_p (mode))
7177 {
7178 HOST_WIDE_INT vnum
7179 = exact_div (addr.const_offset,
7180 BYTES_PER_SVE_PRED).to_constant ();
7181 asm_fprintf (f, "[%s, #%wd, mul vl]",
7182 reg_names[REGNO (addr.base)], vnum);
7183 }
7184 else
7185 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7186 INTVAL (addr.offset));
7187 return true;
7188
7189 case ADDRESS_REG_REG:
7190 if (addr.shift == 0)
7191 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7192 reg_names [REGNO (addr.offset)]);
7193 else
7194 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7195 reg_names [REGNO (addr.offset)], addr.shift);
7196 return true;
7197
7198 case ADDRESS_REG_UXTW:
7199 if (addr.shift == 0)
7200 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7201 REGNO (addr.offset) - R0_REGNUM);
7202 else
7203 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7204 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7205 return true;
7206
7207 case ADDRESS_REG_SXTW:
7208 if (addr.shift == 0)
7209 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7210 REGNO (addr.offset) - R0_REGNUM);
7211 else
7212 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7213 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7214 return true;
7215
7216 case ADDRESS_REG_WB:
7217 /* Writeback is only supported for fixed-width modes. */
7218 size = GET_MODE_SIZE (mode).to_constant ();
7219 switch (GET_CODE (x))
7220 {
7221 case PRE_INC:
7222 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7223 return true;
7224 case POST_INC:
7225 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7226 return true;
7227 case PRE_DEC:
7228 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7229 return true;
7230 case POST_DEC:
7231 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7232 return true;
7233 case PRE_MODIFY:
7234 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7235 INTVAL (addr.offset));
7236 return true;
7237 case POST_MODIFY:
7238 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7239 INTVAL (addr.offset));
7240 return true;
7241 default:
7242 break;
7243 }
7244 break;
7245
7246 case ADDRESS_LO_SUM:
7247 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7248 output_addr_const (f, addr.offset);
7249 asm_fprintf (f, "]");
7250 return true;
7251
7252 case ADDRESS_SYMBOLIC:
7253 output_addr_const (f, x);
7254 return true;
7255 }
7256
7257 return false;
7258 }
7259
7260 /* Print address 'x' of a memory access with mode 'mode'. */
7261 static void
7262 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7263 {
7264 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7265 output_addr_const (f, x);
7266 }
7267
7268 bool
7269 aarch64_label_mentioned_p (rtx x)
7270 {
7271 const char *fmt;
7272 int i;
7273
7274 if (GET_CODE (x) == LABEL_REF)
7275 return true;
7276
7277 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7278 referencing instruction, but they are constant offsets, not
7279 symbols. */
7280 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7281 return false;
7282
7283 fmt = GET_RTX_FORMAT (GET_CODE (x));
7284 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7285 {
7286 if (fmt[i] == 'E')
7287 {
7288 int j;
7289
7290 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7291 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7292 return 1;
7293 }
7294 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7295 return 1;
7296 }
7297
7298 return 0;
7299 }
7300
7301 /* Implement REGNO_REG_CLASS. */
7302
7303 enum reg_class
7304 aarch64_regno_regclass (unsigned regno)
7305 {
7306 if (GP_REGNUM_P (regno))
7307 return GENERAL_REGS;
7308
7309 if (regno == SP_REGNUM)
7310 return STACK_REG;
7311
7312 if (regno == FRAME_POINTER_REGNUM
7313 || regno == ARG_POINTER_REGNUM)
7314 return POINTER_REGS;
7315
7316 if (FP_REGNUM_P (regno))
7317 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7318
7319 if (PR_REGNUM_P (regno))
7320 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7321
7322 return NO_REGS;
7323 }
7324
7325 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7326 If OFFSET is out of range, return an offset of an anchor point
7327 that is in range. Return 0 otherwise. */
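/* Worked example (illustration only, derived from the cases below): for a
   4-byte access at offset 0x5004, the offset is a multiple of the access
   size but too large for the scaled 12-bit range, so the final case
   applies and 0x5004 & (~0xfff * 4) == 0x4000 is returned as the anchor,
   leaving a residual offset of 0x1004 that a normal LDR/STR immediate
   can encode.  */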
7328
7329 static HOST_WIDE_INT
7330 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7331 machine_mode mode)
7332 {
7333 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7334 if (size > 16)
7335 return (offset + 0x400) & ~0x7f0;
7336
7337 /* For offsets that aren't a multiple of the access size, the limit is
7338 -256...255. */
7339 if (offset & (size - 1))
7340 {
7341 /* BLKmode typically uses LDP of X-registers. */
7342 if (mode == BLKmode)
7343 return (offset + 512) & ~0x3ff;
7344 return (offset + 0x100) & ~0x1ff;
7345 }
7346
7347 /* Small negative offsets are supported. */
7348 if (IN_RANGE (offset, -256, 0))
7349 return 0;
7350
7351 if (mode == TImode || mode == TFmode)
7352 return (offset + 0x100) & ~0x1ff;
7353
7354 /* Otherwise use the unsigned 12-bit offset scaled by the access size. */
7355 return offset & (~0xfff * size);
7356 }
7357
7358 static rtx
7359 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7360 {
7361 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7362 where mask is selected by alignment and size of the offset.
7363 We try to pick as large a range for the offset as possible to
7364 maximize the chance of a CSE. However, for aligned addresses
7365 we limit the range to 4k so that structures with different sized
7366 elements are likely to use the same base. We need to be careful
7367 not to split a CONST for some forms of address expression, otherwise
7368 it will generate sub-optimal code. */
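/* For instance (a sketch, not part of the original comment): an SImode
   access to reg + 0x5004 is split into tmp = reg + 0x4000 followed by a
   memory access at tmp + 0x1004, so a neighbouring access such as
   reg + 0x5008 can CSE the same tmp.  */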
7369
7370 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7371 {
7372 rtx base = XEXP (x, 0);
7373 rtx offset_rtx = XEXP (x, 1);
7374 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7375
7376 if (GET_CODE (base) == PLUS)
7377 {
7378 rtx op0 = XEXP (base, 0);
7379 rtx op1 = XEXP (base, 1);
7380
7381 /* Force any scaling into a temp for CSE. */
7382 op0 = force_reg (Pmode, op0);
7383 op1 = force_reg (Pmode, op1);
7384
7385 /* Let the pointer register be in op0. */
7386 if (REG_POINTER (op1))
7387 std::swap (op0, op1);
7388
7389 /* If the pointer is virtual or frame related, then we know that
7390 virtual register instantiation or register elimination is going
7391 to apply a second constant. We want the two constants folded
7392 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7393 if (virt_or_elim_regno_p (REGNO (op0)))
7394 {
7395 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7396 NULL_RTX, true, OPTAB_DIRECT);
7397 return gen_rtx_PLUS (Pmode, base, op1);
7398 }
7399
7400 /* Otherwise, in order to encourage CSE (and thence loop strength
7401 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7402 base = expand_binop (Pmode, add_optab, op0, op1,
7403 NULL_RTX, true, OPTAB_DIRECT);
7404 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7405 }
7406
7407 HOST_WIDE_INT size;
7408 if (GET_MODE_SIZE (mode).is_constant (&size))
7409 {
7410 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7411 mode);
7412 if (base_offset != 0)
7413 {
7414 base = plus_constant (Pmode, base, base_offset);
7415 base = force_operand (base, NULL_RTX);
7416 return plus_constant (Pmode, base, offset - base_offset);
7417 }
7418 }
7419 }
7420
7421 return x;
7422 }
7423
7424 /* Return the reload icode required for a constant pool entry in mode MODE. */
7425 static enum insn_code
7426 aarch64_constant_pool_reload_icode (machine_mode mode)
7427 {
7428 switch (mode)
7429 {
7430 case E_SFmode:
7431 return CODE_FOR_aarch64_reload_movcpsfdi;
7432
7433 case E_DFmode:
7434 return CODE_FOR_aarch64_reload_movcpdfdi;
7435
7436 case E_TFmode:
7437 return CODE_FOR_aarch64_reload_movcptfdi;
7438
7439 case E_V8QImode:
7440 return CODE_FOR_aarch64_reload_movcpv8qidi;
7441
7442 case E_V16QImode:
7443 return CODE_FOR_aarch64_reload_movcpv16qidi;
7444
7445 case E_V4HImode:
7446 return CODE_FOR_aarch64_reload_movcpv4hidi;
7447
7448 case E_V8HImode:
7449 return CODE_FOR_aarch64_reload_movcpv8hidi;
7450
7451 case E_V2SImode:
7452 return CODE_FOR_aarch64_reload_movcpv2sidi;
7453
7454 case E_V4SImode:
7455 return CODE_FOR_aarch64_reload_movcpv4sidi;
7456
7457 case E_V2DImode:
7458 return CODE_FOR_aarch64_reload_movcpv2didi;
7459
7460 case E_V2DFmode:
7461 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7462
7463 default:
7464 gcc_unreachable ();
7465 }
7466
7467 gcc_unreachable ();
7468 }
7469 static reg_class_t
7470 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7471 reg_class_t rclass,
7472 machine_mode mode,
7473 secondary_reload_info *sri)
7474 {
7475 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7476 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7477 comment at the head of aarch64-sve.md for more details about the
7478 big-endian handling. */
7479 if (BYTES_BIG_ENDIAN
7480 && reg_class_subset_p (rclass, FP_REGS)
7481 && !((REG_P (x) && HARD_REGISTER_P (x))
7482 || aarch64_simd_valid_immediate (x, NULL))
7483 && aarch64_sve_data_mode_p (mode))
7484 {
7485 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7486 return NO_REGS;
7487 }
7488
7489 /* If we have to disable direct literal pool loads and stores because the
7490 function is too big, then we need a scratch register. */
7491 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7492 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7493 || targetm.vector_mode_supported_p (GET_MODE (x)))
7494 && !aarch64_pcrelative_literal_loads)
7495 {
7496 sri->icode = aarch64_constant_pool_reload_icode (mode);
7497 return NO_REGS;
7498 }
7499
7500 /* Without the TARGET_SIMD instructions we cannot move a Q register
7501 to a Q register directly. We need a scratch. */
7502 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7503 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7504 && reg_class_subset_p (rclass, FP_REGS))
7505 {
7506 if (mode == TFmode)
7507 sri->icode = CODE_FOR_aarch64_reload_movtf;
7508 else if (mode == TImode)
7509 sri->icode = CODE_FOR_aarch64_reload_movti;
7510 return NO_REGS;
7511 }
7512
7513 /* A TFmode or TImode memory access should be handled via FP_REGS
7514 because AArch64 has richer addressing modes for LDR/STR instructions
7515 than LDP/STP instructions. */
7516 if (TARGET_FLOAT && rclass == GENERAL_REGS
7517 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7518 return FP_REGS;
7519
7520 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7521 return GENERAL_REGS;
7522
7523 return NO_REGS;
7524 }
7525
7526 static bool
7527 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7528 {
7529 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7530
7531 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7532 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7533 if (frame_pointer_needed)
7534 return to == HARD_FRAME_POINTER_REGNUM;
7535 return true;
7536 }
7537
7538 poly_int64
7539 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7540 {
7541 aarch64_layout_frame ();
7542
7543 if (to == HARD_FRAME_POINTER_REGNUM)
7544 {
7545 if (from == ARG_POINTER_REGNUM)
7546 return cfun->machine->frame.hard_fp_offset;
7547
7548 if (from == FRAME_POINTER_REGNUM)
7549 return cfun->machine->frame.hard_fp_offset
7550 - cfun->machine->frame.locals_offset;
7551 }
7552
7553 if (to == STACK_POINTER_REGNUM)
7554 {
7555 if (from == FRAME_POINTER_REGNUM)
7556 return cfun->machine->frame.frame_size
7557 - cfun->machine->frame.locals_offset;
7558 }
7559
7560 return cfun->machine->frame.frame_size;
7561 }
7562
7563 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7564 previous frame. */
7565
7566 rtx
7567 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7568 {
7569 if (count != 0)
7570 return const0_rtx;
7571 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7572 }
7573
7574
7575 static void
7576 aarch64_asm_trampoline_template (FILE *f)
7577 {
7578 if (TARGET_ILP32)
7579 {
7580 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7581 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7582 }
7583 else
7584 {
7585 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7586 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7587 }
7588 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7589 assemble_aligned_integer (4, const0_rtx);
7590 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7591 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7592 }
7593
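/* Initialise the trampoline described by the template above: 16 bytes of
   code (including padding) followed by two pointer-sized data slots.  Copy
   the code part and then fill in the target function address and the
   static chain value.  */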
7594 static void
7595 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7596 {
7597 rtx fnaddr, mem, a_tramp;
7598 const int tramp_code_sz = 16;
7599
7600 /* Don't need to copy the trailing D-words; we fill those in below. */
7601 emit_block_move (m_tramp, assemble_trampoline_template (),
7602 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7603 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7604 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7605 if (GET_MODE (fnaddr) != ptr_mode)
7606 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7607 emit_move_insn (mem, fnaddr);
7608
7609 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7610 emit_move_insn (mem, chain_value);
7611
7612 /* XXX We should really define a "clear_cache" pattern and use
7613 gen_clear_cache(). */
7614 a_tramp = XEXP (m_tramp, 0);
7615 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7616 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7617 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7618 ptr_mode);
7619 }
7620
7621 static unsigned char
7622 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7623 {
7624 /* ??? Logically we should only need to provide a value when
7625 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7626 can hold MODE, but at the moment we need to handle all modes.
7627 Just ignore any runtime parts for registers that can't store them. */
7628 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7629 unsigned int nregs;
7630 switch (regclass)
7631 {
7632 case TAILCALL_ADDR_REGS:
7633 case POINTER_REGS:
7634 case GENERAL_REGS:
7635 case ALL_REGS:
7636 case POINTER_AND_FP_REGS:
7637 case FP_REGS:
7638 case FP_LO_REGS:
7639 if (aarch64_sve_data_mode_p (mode)
7640 && constant_multiple_p (GET_MODE_SIZE (mode),
7641 BYTES_PER_SVE_VECTOR, &nregs))
7642 return nregs;
7643 return (aarch64_vector_data_mode_p (mode)
7644 ? CEIL (lowest_size, UNITS_PER_VREG)
7645 : CEIL (lowest_size, UNITS_PER_WORD));
7646 case STACK_REG:
7647 case PR_REGS:
7648 case PR_LO_REGS:
7649 case PR_HI_REGS:
7650 return 1;
7651
7652 case NO_REGS:
7653 return 0;
7654
7655 default:
7656 break;
7657 }
7658 gcc_unreachable ();
7659 }
7660
7661 static reg_class_t
7662 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7663 {
7664 if (regclass == POINTER_REGS)
7665 return GENERAL_REGS;
7666
7667 if (regclass == STACK_REG)
7668 {
7669 if (REG_P(x)
7670 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7671 return regclass;
7672
7673 return NO_REGS;
7674 }
7675
7676 /* Register elimination can result in a request for
7677 SP+constant->FP_REGS. We cannot support such operations, which
7678 use SP as source and an FP_REG as destination, so reject them
7679 right now. */
7680 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7681 {
7682 rtx lhs = XEXP (x, 0);
7683
7684 /* Look through a possible SUBREG introduced by ILP32. */
7685 if (GET_CODE (lhs) == SUBREG)
7686 lhs = SUBREG_REG (lhs);
7687
7688 gcc_assert (REG_P (lhs));
7689 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7690 POINTER_REGS));
7691 return NO_REGS;
7692 }
7693
7694 return regclass;
7695 }
7696
7697 void
7698 aarch64_asm_output_labelref (FILE* f, const char *name)
7699 {
7700 asm_fprintf (f, "%U%s", name);
7701 }
7702
7703 static void
7704 aarch64_elf_asm_constructor (rtx symbol, int priority)
7705 {
7706 if (priority == DEFAULT_INIT_PRIORITY)
7707 default_ctor_section_asm_out_constructor (symbol, priority);
7708 else
7709 {
7710 section *s;
7711 /* Priority is known to be in the range [0, 65535], so 18 bytes
7712 would be enough, but the compiler might not know that. To avoid
7713 a -Wformat-truncation false positive, use a larger size. */
7714 char buf[23];
7715 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7716 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7717 switch_to_section (s);
7718 assemble_align (POINTER_SIZE);
7719 assemble_aligned_integer (POINTER_BYTES, symbol);
7720 }
7721 }
7722
7723 static void
7724 aarch64_elf_asm_destructor (rtx symbol, int priority)
7725 {
7726 if (priority == DEFAULT_INIT_PRIORITY)
7727 default_dtor_section_asm_out_destructor (symbol, priority);
7728 else
7729 {
7730 section *s;
7731 /* Priority is known to be in the range [0, 65535], so 18 bytes
7732 would be enough, but the compiler might not know that. To avoid
7733 a -Wformat-truncation false positive, use a larger size. */
7734 char buf[23];
7735 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7736 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7737 switch_to_section (s);
7738 assemble_align (POINTER_SIZE);
7739 assemble_aligned_integer (POINTER_BYTES, symbol);
7740 }
7741 }
7742
7743 const char*
7744 aarch64_output_casesi (rtx *operands)
7745 {
7746 char buf[100];
7747 char label[100];
7748 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7749 int index;
7750 static const char *const patterns[4][2] =
7751 {
7752 {
7753 "ldrb\t%w3, [%0,%w1,uxtw]",
7754 "add\t%3, %4, %w3, sxtb #2"
7755 },
7756 {
7757 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7758 "add\t%3, %4, %w3, sxth #2"
7759 },
7760 {
7761 "ldr\t%w3, [%0,%w1,uxtw #2]",
7762 "add\t%3, %4, %w3, sxtw #2"
7763 },
7764 /* We assume that DImode is only generated when not optimizing and
7765 that we don't really need 64-bit address offsets. That would
7766 imply an object file with 8GB of code in a single function! */
7767 {
7768 "ldr\t%w3, [%0,%w1,uxtw #2]",
7769 "add\t%3, %4, %w3, sxtw #2"
7770 }
7771 };
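/* For a half-word table, for example, the code emitted below is:
	ldrh	%w3, [%0,%w1,uxtw #1]
	adr	%4, .Lrtx<N>
	add	%3, %4, %w3, sxth #2
	br	%3
   followed by the internal label that the ADR references.  */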
7772
7773 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7774
7775 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7776 index = exact_log2 (GET_MODE_SIZE (mode));
7777
7778 gcc_assert (index >= 0 && index <= 3);
7779
7780 /* Need to implement table size reduction by changing the code below. */
7781 output_asm_insn (patterns[index][0], operands);
7782 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7783 snprintf (buf, sizeof (buf),
7784 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7785 output_asm_insn (buf, operands);
7786 output_asm_insn (patterns[index][1], operands);
7787 output_asm_insn ("br\t%3", operands);
7788 assemble_label (asm_out_file, label);
7789 return "";
7790 }
7791
7792
7793 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7794 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7795 operator. */
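/* For example (illustrative): aarch64_uxt_size (2, 0x3fc) returns 8,
   since 0x3fc == 0xff << 2 and so matches a UXTB combined with LSL #2;
   a mask that is not a shifted 0xff/0xffff/0xffffffff yields 0.  */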
7796
7797 int
7798 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7799 {
7800 if (shift >= 0 && shift <= 3)
7801 {
7802 int size;
7803 for (size = 8; size <= 32; size *= 2)
7804 {
7805 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7806 if (mask == bits << shift)
7807 return size;
7808 }
7809 }
7810 return 0;
7811 }
7812
7813 /* Constant pools are per-function only when PC-relative
7814 literal loads are enabled or we are in the large memory
7815 model. */
7816
7817 static inline bool
7818 aarch64_can_use_per_function_literal_pools_p (void)
7819 {
7820 return (aarch64_pcrelative_literal_loads
7821 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7822 }
7823
7824 static bool
7825 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7826 {
7827 /* We can't use blocks for constants when we're using a per-function
7828 constant pool. */
7829 return !aarch64_can_use_per_function_literal_pools_p ();
7830 }
7831
7832 /* Select appropriate section for constants depending
7833 on where we place literal pools. */
7834
7835 static section *
7836 aarch64_select_rtx_section (machine_mode mode,
7837 rtx x,
7838 unsigned HOST_WIDE_INT align)
7839 {
7840 if (aarch64_can_use_per_function_literal_pools_p ())
7841 return function_section (current_function_decl);
7842
7843 return default_elf_select_rtx_section (mode, x, align);
7844 }
7845
7846 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7847 void
7848 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7849 HOST_WIDE_INT offset)
7850 {
7851 /* When using per-function literal pools, we must ensure that any code
7852 section is aligned to the minimal instruction length, lest we get
7853 errors from the assembler re "unaligned instructions". */
7854 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7855 ASM_OUTPUT_ALIGN (f, 2);
7856 }
7857
7858 /* Costs. */
7859
7860 /* Helper function for rtx cost calculation. Strip a shift expression
7861 from X. Returns the inner operand if successful, or the original
7862 expression on failure. */
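/* For example, (ashift (reg R) (const_int 3)) and (mult (reg R) (const_int 8))
   both strip down to (reg R); a MULT only qualifies when its constant is a
   power of two, since it is then equivalent to a shift.  */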
7863 static rtx
7864 aarch64_strip_shift (rtx x)
7865 {
7866 rtx op = x;
7867
7868 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7869 we can convert both to ROR during final output. */
7870 if ((GET_CODE (op) == ASHIFT
7871 || GET_CODE (op) == ASHIFTRT
7872 || GET_CODE (op) == LSHIFTRT
7873 || GET_CODE (op) == ROTATERT
7874 || GET_CODE (op) == ROTATE)
7875 && CONST_INT_P (XEXP (op, 1)))
7876 return XEXP (op, 0);
7877
7878 if (GET_CODE (op) == MULT
7879 && CONST_INT_P (XEXP (op, 1))
7880 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7881 return XEXP (op, 0);
7882
7883 return x;
7884 }
7885
7886 /* Helper function for rtx cost calculation. Strip an extend
7887 expression from X. Returns the inner operand if successful, or the
7888 original expression on failure. We deal with a number of possible
7889 canonicalization variations here. If STRIP_SHIFT is true, then
7890 we can strip off a shift also. */
7891 static rtx
7892 aarch64_strip_extend (rtx x, bool strip_shift)
7893 {
7894 scalar_int_mode mode;
7895 rtx op = x;
7896
7897 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7898 return op;
7899
7900 /* Zero and sign extraction of a widened value. */
7901 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7902 && XEXP (op, 2) == const0_rtx
7903 && GET_CODE (XEXP (op, 0)) == MULT
7904 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7905 XEXP (op, 1)))
7906 return XEXP (XEXP (op, 0), 0);
7907
7908 /* It can also be represented (for zero-extend) as an AND with an
7909 immediate. */
7910 if (GET_CODE (op) == AND
7911 && GET_CODE (XEXP (op, 0)) == MULT
7912 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7913 && CONST_INT_P (XEXP (op, 1))
7914 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7915 INTVAL (XEXP (op, 1))) != 0)
7916 return XEXP (XEXP (op, 0), 0);
7917
7918 /* Now handle extended register, as this may also have an optional
7919 left shift by 1..4. */
7920 if (strip_shift
7921 && GET_CODE (op) == ASHIFT
7922 && CONST_INT_P (XEXP (op, 1))
7923 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7924 op = XEXP (op, 0);
7925
7926 if (GET_CODE (op) == ZERO_EXTEND
7927 || GET_CODE (op) == SIGN_EXTEND)
7928 op = XEXP (op, 0);
7929
7930 if (op != x)
7931 return op;
7932
7933 return x;
7934 }
7935
7936 /* Return true iff CODE is a shift supported in combination
7937 with arithmetic instructions. */
7938
7939 static bool
7940 aarch64_shift_p (enum rtx_code code)
7941 {
7942 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7943 }
7944
7945
7946 /* Return true iff X is a cheap shift without a sign extend. */
7947
7948 static bool
7949 aarch64_cheap_mult_shift_p (rtx x)
7950 {
7951 rtx op0, op1;
7952
7953 op0 = XEXP (x, 0);
7954 op1 = XEXP (x, 1);
7955
7956 if (!(aarch64_tune_params.extra_tuning_flags
7957 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7958 return false;
7959
7960 if (GET_CODE (op0) == SIGN_EXTEND)
7961 return false;
7962
7963 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7964 && UINTVAL (op1) <= 4)
7965 return true;
7966
7967 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7968 return false;
7969
7970 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7971
7972 if (l2 > 0 && l2 <= 4)
7973 return true;
7974
7975 return false;
7976 }
7977
7978 /* Helper function for rtx cost calculation. Calculate the cost of
7979 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7980 Return the calculated cost of the expression, recursing manually in to
7981 operands where needed. */
7982
7983 static int
7984 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7985 {
7986 rtx op0, op1;
7987 const struct cpu_cost_table *extra_cost
7988 = aarch64_tune_params.insn_extra_cost;
7989 int cost = 0;
7990 bool compound_p = (outer == PLUS || outer == MINUS);
7991 machine_mode mode = GET_MODE (x);
7992
7993 gcc_checking_assert (code == MULT);
7994
7995 op0 = XEXP (x, 0);
7996 op1 = XEXP (x, 1);
7997
7998 if (VECTOR_MODE_P (mode))
7999 mode = GET_MODE_INNER (mode);
8000
8001 /* Integer multiply/fma. */
8002 if (GET_MODE_CLASS (mode) == MODE_INT)
8003 {
8004 /* The multiply will be canonicalized as a shift, cost it as such. */
8005 if (aarch64_shift_p (GET_CODE (x))
8006 || (CONST_INT_P (op1)
8007 && exact_log2 (INTVAL (op1)) > 0))
8008 {
8009 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8010 || GET_CODE (op0) == SIGN_EXTEND;
8011 if (speed)
8012 {
8013 if (compound_p)
8014 {
8015 /* If the shift is considered cheap,
8016 then don't add any cost. */
8017 if (aarch64_cheap_mult_shift_p (x))
8018 ;
8019 else if (REG_P (op1))
8020 /* ARITH + shift-by-register. */
8021 cost += extra_cost->alu.arith_shift_reg;
8022 else if (is_extend)
8023 /* ARITH + extended register. We don't have a cost field
8024 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8025 cost += extra_cost->alu.extend_arith;
8026 else
8027 /* ARITH + shift-by-immediate. */
8028 cost += extra_cost->alu.arith_shift;
8029 }
8030 else
8031 /* LSL (immediate). */
8032 cost += extra_cost->alu.shift;
8033
8034 }
8035 /* Strip extends as we will have costed them in the case above. */
8036 if (is_extend)
8037 op0 = aarch64_strip_extend (op0, true);
8038
8039 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8040
8041 return cost;
8042 }
8043
8044 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8045 compound and let the below cases handle it. After all, MNEG is a
8046 special-case alias of MSUB. */
8047 if (GET_CODE (op0) == NEG)
8048 {
8049 op0 = XEXP (op0, 0);
8050 compound_p = true;
8051 }
8052
8053 /* Integer multiplies or FMAs have zero/sign extending variants. */
8054 if ((GET_CODE (op0) == ZERO_EXTEND
8055 && GET_CODE (op1) == ZERO_EXTEND)
8056 || (GET_CODE (op0) == SIGN_EXTEND
8057 && GET_CODE (op1) == SIGN_EXTEND))
8058 {
8059 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8060 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8061
8062 if (speed)
8063 {
8064 if (compound_p)
8065 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8066 cost += extra_cost->mult[0].extend_add;
8067 else
8068 /* MUL/SMULL/UMULL. */
8069 cost += extra_cost->mult[0].extend;
8070 }
8071
8072 return cost;
8073 }
8074
8075 /* This is either an integer multiply or a MADD. In both cases
8076 we want to recurse and cost the operands. */
8077 cost += rtx_cost (op0, mode, MULT, 0, speed);
8078 cost += rtx_cost (op1, mode, MULT, 1, speed);
8079
8080 if (speed)
8081 {
8082 if (compound_p)
8083 /* MADD/MSUB. */
8084 cost += extra_cost->mult[mode == DImode].add;
8085 else
8086 /* MUL. */
8087 cost += extra_cost->mult[mode == DImode].simple;
8088 }
8089
8090 return cost;
8091 }
8092 else
8093 {
8094 if (speed)
8095 {
8096 /* Floating-point FMA/FMUL can also support negations of the
8097 operands, unless the rounding mode is upward or downward, in
8098 which case FNMUL is different from FMUL with operand negation. */
8099 bool neg0 = GET_CODE (op0) == NEG;
8100 bool neg1 = GET_CODE (op1) == NEG;
8101 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8102 {
8103 if (neg0)
8104 op0 = XEXP (op0, 0);
8105 if (neg1)
8106 op1 = XEXP (op1, 0);
8107 }
8108
8109 if (compound_p)
8110 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8111 cost += extra_cost->fp[mode == DFmode].fma;
8112 else
8113 /* FMUL/FNMUL. */
8114 cost += extra_cost->fp[mode == DFmode].mult;
8115 }
8116
8117 cost += rtx_cost (op0, mode, MULT, 0, speed);
8118 cost += rtx_cost (op1, mode, MULT, 1, speed);
8119 return cost;
8120 }
8121 }
8122
8123 static int
8124 aarch64_address_cost (rtx x,
8125 machine_mode mode,
8126 addr_space_t as ATTRIBUTE_UNUSED,
8127 bool speed)
8128 {
8129 enum rtx_code c = GET_CODE (x);
8130 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8131 struct aarch64_address_info info;
8132 int cost = 0;
8133 info.shift = 0;
8134
8135 if (!aarch64_classify_address (&info, x, mode, false))
8136 {
8137 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8138 {
8139 /* This is a CONST or SYMBOL ref which will be split
8140 in a different way depending on the code model in use.
8141 Cost it through the generic infrastructure. */
8142 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8143 /* Divide through by the cost of one instruction to
8144 bring it to the same units as the address costs. */
8145 cost_symbol_ref /= COSTS_N_INSNS (1);
8146 /* The cost is then the cost of preparing the address,
8147 followed by an immediate (possibly 0) offset. */
8148 return cost_symbol_ref + addr_cost->imm_offset;
8149 }
8150 else
8151 {
8152 /* This is most likely a jump table from a case
8153 statement. */
8154 return addr_cost->register_offset;
8155 }
8156 }
8157
8158 switch (info.type)
8159 {
8160 case ADDRESS_LO_SUM:
8161 case ADDRESS_SYMBOLIC:
8162 case ADDRESS_REG_IMM:
8163 cost += addr_cost->imm_offset;
8164 break;
8165
8166 case ADDRESS_REG_WB:
8167 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8168 cost += addr_cost->pre_modify;
8169 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8170 cost += addr_cost->post_modify;
8171 else
8172 gcc_unreachable ();
8173
8174 break;
8175
8176 case ADDRESS_REG_REG:
8177 cost += addr_cost->register_offset;
8178 break;
8179
8180 case ADDRESS_REG_SXTW:
8181 cost += addr_cost->register_sextend;
8182 break;
8183
8184 case ADDRESS_REG_UXTW:
8185 cost += addr_cost->register_zextend;
8186 break;
8187
8188 default:
8189 gcc_unreachable ();
8190 }
8191
8192
8193 if (info.shift > 0)
8194 {
8195 /* For the sake of calculating the cost of the shifted register
8196 component, we can treat same sized modes in the same way. */
8197 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8198 cost += addr_cost->addr_scale_costs.hi;
8199 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8200 cost += addr_cost->addr_scale_costs.si;
8201 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8202 cost += addr_cost->addr_scale_costs.di;
8203 else
8204 /* We can't tell, or this is a 128-bit vector. */
8205 cost += addr_cost->addr_scale_costs.ti;
8206 }
8207
8208 return cost;
8209 }
8210
8211 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8212 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
8213 to be well predicted. */
8214
8215 int
8216 aarch64_branch_cost (bool speed_p, bool predictable_p)
8217 {
8218 /* When optimizing for speed, use the cost of unpredictable branches. */
8219 const struct cpu_branch_cost *branch_costs =
8220 aarch64_tune_params.branch_costs;
8221
8222 if (!speed_p || predictable_p)
8223 return branch_costs->predictable;
8224 else
8225 return branch_costs->unpredictable;
8226 }
8227
8228 /* Return true if the RTX X in mode MODE is a zero or sign extract
8229 usable in an ADD or SUB (extended register) instruction. */
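/* The simplest accepted form (for illustration) is (sign_extend (reg R)),
   which corresponds to the SXTB/SXTH/SXTW variants of ADD/SUB (extended
   register); the SIGN_EXTRACT/ZERO_EXTRACT case below handles the
   scaled-by-shift forms.  */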
8230 static bool
8231 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8232 {
8233 /* Catch add with a sign extract.
8234 This is add_<optab><mode>_multp2. */
8235 if (GET_CODE (x) == SIGN_EXTRACT
8236 || GET_CODE (x) == ZERO_EXTRACT)
8237 {
8238 rtx op0 = XEXP (x, 0);
8239 rtx op1 = XEXP (x, 1);
8240 rtx op2 = XEXP (x, 2);
8241
8242 if (GET_CODE (op0) == MULT
8243 && CONST_INT_P (op1)
8244 && op2 == const0_rtx
8245 && CONST_INT_P (XEXP (op0, 1))
8246 && aarch64_is_extend_from_extract (mode,
8247 XEXP (op0, 1),
8248 op1))
8249 {
8250 return true;
8251 }
8252 }
8253 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8254 No shift. */
8255 else if (GET_CODE (x) == SIGN_EXTEND
8256 || GET_CODE (x) == ZERO_EXTEND)
8257 return REG_P (XEXP (x, 0));
8258
8259 return false;
8260 }
8261
8262 static bool
8263 aarch64_frint_unspec_p (unsigned int u)
8264 {
8265 switch (u)
8266 {
8267 case UNSPEC_FRINTZ:
8268 case UNSPEC_FRINTP:
8269 case UNSPEC_FRINTM:
8270 case UNSPEC_FRINTA:
8271 case UNSPEC_FRINTN:
8272 case UNSPEC_FRINTX:
8273 case UNSPEC_FRINTI:
8274 return true;
8275
8276 default:
8277 return false;
8278 }
8279 }
8280
8281 /* Return true iff X is an rtx that will match an extr instruction
8282 i.e. as described in the *extr<mode>5_insn family of patterns.
8283 OP0 and OP1 will be set to the operands of the shifts involved
8284 on success and will be NULL_RTX otherwise. */
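/* For example, in DImode (ior (ashift a (const_int 16))
   (lshiftrt b (const_int 48))) passes the checks below, with *RES_OP0 set
   to a and *RES_OP1 set to b, because the two shift amounts sum to the
   mode bitsize.  */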
8285
8286 static bool
8287 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8288 {
8289 rtx op0, op1;
8290 scalar_int_mode mode;
8291 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8292 return false;
8293
8294 *res_op0 = NULL_RTX;
8295 *res_op1 = NULL_RTX;
8296
8297 if (GET_CODE (x) != IOR)
8298 return false;
8299
8300 op0 = XEXP (x, 0);
8301 op1 = XEXP (x, 1);
8302
8303 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8304 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8305 {
8306 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8307 if (GET_CODE (op1) == ASHIFT)
8308 std::swap (op0, op1);
8309
8310 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8311 return false;
8312
8313 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8314 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8315
8316 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8317 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8318 {
8319 *res_op0 = XEXP (op0, 0);
8320 *res_op1 = XEXP (op1, 0);
8321 return true;
8322 }
8323 }
8324
8325 return false;
8326 }
8327
8328 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8329 storing it in *COST. Result is true if the total cost of the operation
8330 has now been calculated. */
8331 static bool
8332 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8333 {
8334 rtx inner;
8335 rtx comparator;
8336 enum rtx_code cmpcode;
8337
8338 if (COMPARISON_P (op0))
8339 {
8340 inner = XEXP (op0, 0);
8341 comparator = XEXP (op0, 1);
8342 cmpcode = GET_CODE (op0);
8343 }
8344 else
8345 {
8346 inner = op0;
8347 comparator = const0_rtx;
8348 cmpcode = NE;
8349 }
8350
8351 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8352 {
8353 /* Conditional branch. */
8354 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8355 return true;
8356 else
8357 {
8358 if (cmpcode == NE || cmpcode == EQ)
8359 {
8360 if (comparator == const0_rtx)
8361 {
8362 /* TBZ/TBNZ/CBZ/CBNZ. */
8363 if (GET_CODE (inner) == ZERO_EXTRACT)
8364 /* TBZ/TBNZ. */
8365 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8366 ZERO_EXTRACT, 0, speed);
8367 else
8368 /* CBZ/CBNZ. */
8369 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8370
8371 return true;
8372 }
8373 }
8374 else if (cmpcode == LT || cmpcode == GE)
8375 {
8376 /* TBZ/TBNZ. */
8377 if (comparator == const0_rtx)
8378 return true;
8379 }
8380 }
8381 }
8382 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8383 {
8384 /* CCMP. */
8385 if (GET_CODE (op1) == COMPARE)
8386 {
8387 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8388 if (XEXP (op1, 1) == const0_rtx)
8389 *cost += 1;
8390 if (speed)
8391 {
8392 machine_mode mode = GET_MODE (XEXP (op1, 0));
8393 const struct cpu_cost_table *extra_cost
8394 = aarch64_tune_params.insn_extra_cost;
8395
8396 if (GET_MODE_CLASS (mode) == MODE_INT)
8397 *cost += extra_cost->alu.arith;
8398 else
8399 *cost += extra_cost->fp[mode == DFmode].compare;
8400 }
8401 return true;
8402 }
8403
8404 /* It's a conditional operation based on the status flags,
8405 so it must be some flavor of CSEL. */
8406
8407 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8408 if (GET_CODE (op1) == NEG
8409 || GET_CODE (op1) == NOT
8410 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8411 op1 = XEXP (op1, 0);
8412 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8413 {
8414 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8415 op1 = XEXP (op1, 0);
8416 op2 = XEXP (op2, 0);
8417 }
8418
8419 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8420 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8421 return true;
8422 }
8423
8424 /* We don't know what this is, cost all operands. */
8425 return false;
8426 }
8427
8428 /* Check whether X is a bitfield operation of the form shift + extend that
8429 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8430 operand to which the bitfield operation is applied. Otherwise return
8431 NULL_RTX. */
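/* For example, (zero_extend:SI (lshiftrt:HI (reg R) (const_int 3))) returns
   (reg R): a zero-extended right shift of a narrow mode is the UBFX-style
   form matched below.  */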
8432
8433 static rtx
8434 aarch64_extend_bitfield_pattern_p (rtx x)
8435 {
8436 rtx_code outer_code = GET_CODE (x);
8437 machine_mode outer_mode = GET_MODE (x);
8438
8439 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8440 && outer_mode != SImode && outer_mode != DImode)
8441 return NULL_RTX;
8442
8443 rtx inner = XEXP (x, 0);
8444 rtx_code inner_code = GET_CODE (inner);
8445 machine_mode inner_mode = GET_MODE (inner);
8446 rtx op = NULL_RTX;
8447
8448 switch (inner_code)
8449 {
8450 case ASHIFT:
8451 if (CONST_INT_P (XEXP (inner, 1))
8452 && (inner_mode == QImode || inner_mode == HImode))
8453 op = XEXP (inner, 0);
8454 break;
8455 case LSHIFTRT:
8456 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8457 && (inner_mode == QImode || inner_mode == HImode))
8458 op = XEXP (inner, 0);
8459 break;
8460 case ASHIFTRT:
8461 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8462 && (inner_mode == QImode || inner_mode == HImode))
8463 op = XEXP (inner, 0);
8464 break;
8465 default:
8466 break;
8467 }
8468
8469 return op;
8470 }
8471
8472 /* Return true if the mask and a shift amount from an RTX of the form
8473 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8474 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
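/* Illustrative example: in SImode, MASK == 0xff0 with SHFT_AMNT == 4
   satisfies the conditions below ((0xff0 >> 4) + 1 is a power of two and no
   mask bits fall below the shift), i.e. a UBFIZ inserting an 8-bit field at
   bit position 4.  */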
8475
8476 bool
8477 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8478 rtx shft_amnt)
8479 {
8480 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8481 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8482 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8483 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8484 }
8485
8486 /* Calculate the cost of calculating X, storing it in *COST. Result
8487 is true if the total cost of the operation has now been calculated. */
8488 static bool
8489 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8490 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8491 {
8492 rtx op0, op1, op2;
8493 const struct cpu_cost_table *extra_cost
8494 = aarch64_tune_params.insn_extra_cost;
8495 int code = GET_CODE (x);
8496 scalar_int_mode int_mode;
8497
8498 /* By default, assume that everything has equivalent cost to the
8499 cheapest instruction. Any additional costs are applied as a delta
8500 above this default. */
8501 *cost = COSTS_N_INSNS (1);
8502
8503 switch (code)
8504 {
8505 case SET:
8506 /* The cost depends entirely on the operands to SET. */
8507 *cost = 0;
8508 op0 = SET_DEST (x);
8509 op1 = SET_SRC (x);
8510
8511 switch (GET_CODE (op0))
8512 {
8513 case MEM:
8514 if (speed)
8515 {
8516 rtx address = XEXP (op0, 0);
8517 if (VECTOR_MODE_P (mode))
8518 *cost += extra_cost->ldst.storev;
8519 else if (GET_MODE_CLASS (mode) == MODE_INT)
8520 *cost += extra_cost->ldst.store;
8521 else if (mode == SFmode)
8522 *cost += extra_cost->ldst.storef;
8523 else if (mode == DFmode)
8524 *cost += extra_cost->ldst.stored;
8525
8526 *cost +=
8527 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8528 0, speed));
8529 }
8530
8531 *cost += rtx_cost (op1, mode, SET, 1, speed);
8532 return true;
8533
8534 case SUBREG:
8535 if (! REG_P (SUBREG_REG (op0)))
8536 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8537
8538 /* Fall through. */
8539 case REG:
8540 /* The cost is one per vector-register copied. */
8541 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8542 {
8543 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8544 *cost = COSTS_N_INSNS (nregs);
8545 }
8546 /* const0_rtx is in general free, but we will use an
8547 instruction to set a register to 0. */
8548 else if (REG_P (op1) || op1 == const0_rtx)
8549 {
8550 /* The cost is 1 per register copied. */
8551 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8552 *cost = COSTS_N_INSNS (nregs);
8553 }
8554 else
8555 /* Cost is just the cost of the RHS of the set. */
8556 *cost += rtx_cost (op1, mode, SET, 1, speed);
8557 return true;
8558
8559 case ZERO_EXTRACT:
8560 case SIGN_EXTRACT:
8561 /* Bit-field insertion. Strip any redundant widening of
8562 the RHS to meet the width of the target. */
8563 if (GET_CODE (op1) == SUBREG)
8564 op1 = SUBREG_REG (op1);
8565 if ((GET_CODE (op1) == ZERO_EXTEND
8566 || GET_CODE (op1) == SIGN_EXTEND)
8567 && CONST_INT_P (XEXP (op0, 1))
8568 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8569 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8570 op1 = XEXP (op1, 0);
8571
8572 if (CONST_INT_P (op1))
8573 {
8574 /* MOV immediate is assumed to always be cheap. */
8575 *cost = COSTS_N_INSNS (1);
8576 }
8577 else
8578 {
8579 /* BFM. */
8580 if (speed)
8581 *cost += extra_cost->alu.bfi;
8582 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8583 }
8584
8585 return true;
8586
8587 default:
8588 /* We can't make sense of this, assume default cost. */
8589 *cost = COSTS_N_INSNS (1);
8590 return false;
8591 }
8592 return false;
8593
8594 case CONST_INT:
8595 /* If an instruction can incorporate a constant within the
8596 instruction, the instruction's expression avoids calling
8597 rtx_cost() on the constant. If rtx_cost() is called on a
8598 constant, then it is usually because the constant must be
8599 moved into a register by one or more instructions.
8600
8601 The exception is constant 0, which can be expressed
8602 as XZR/WZR and is therefore free. The exception to this is
8603 if we have (set (reg) (const0_rtx)) in which case we must cost
8604 the move. However, we can catch that when we cost the SET, so
8605 we don't need to consider that here. */
8606 if (x == const0_rtx)
8607 *cost = 0;
8608 else
8609 {
8610 /* To an approximation, building any other constant is
8611 proportionally expensive to the number of instructions
8612 required to build that constant. This is true whether we
8613 are compiling for SPEED or otherwise. */
8614 if (!is_a <scalar_int_mode> (mode, &int_mode))
8615 int_mode = word_mode;
8616 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8617 (NULL_RTX, x, false, int_mode));
8618 }
8619 return true;
8620
8621 case CONST_DOUBLE:
8622
8623 /* First determine number of instructions to do the move
8624 as an integer constant. */
8625 if (!aarch64_float_const_representable_p (x)
8626 && !aarch64_can_const_movi_rtx_p (x, mode)
8627 && aarch64_float_const_rtx_p (x))
8628 {
8629 unsigned HOST_WIDE_INT ival;
8630 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8631 gcc_assert (succeed);
8632
8633 scalar_int_mode imode = (mode == HFmode
8634 ? SImode
8635 : int_mode_for_mode (mode).require ());
8636 int ncost = aarch64_internal_mov_immediate
8637 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8638 *cost += COSTS_N_INSNS (ncost);
8639 return true;
8640 }
8641
8642 if (speed)
8643 {
8644 /* mov[df,sf]_aarch64. */
8645 if (aarch64_float_const_representable_p (x))
8646 /* FMOV (scalar immediate). */
8647 *cost += extra_cost->fp[mode == DFmode].fpconst;
8648 else if (!aarch64_float_const_zero_rtx_p (x))
8649 {
8650 /* This will be a load from memory. */
8651 if (mode == DFmode)
8652 *cost += extra_cost->ldst.loadd;
8653 else
8654 *cost += extra_cost->ldst.loadf;
8655 }
8656 else
8657 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8658 or MOV v0.s[0], wzr - neither of which is modeled by the
8659 cost tables. Just use the default cost. */
8660 {
8661 }
8662 }
8663
8664 return true;
8665
8666 case MEM:
8667 if (speed)
8668 {
8669 /* For loads we want the base cost of a load, plus an
8670 approximation for the additional cost of the addressing
8671 mode. */
8672 rtx address = XEXP (x, 0);
8673 if (VECTOR_MODE_P (mode))
8674 *cost += extra_cost->ldst.loadv;
8675 else if (GET_MODE_CLASS (mode) == MODE_INT)
8676 *cost += extra_cost->ldst.load;
8677 else if (mode == SFmode)
8678 *cost += extra_cost->ldst.loadf;
8679 else if (mode == DFmode)
8680 *cost += extra_cost->ldst.loadd;
8681
8682 *cost +=
8683 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8684 0, speed));
8685 }
8686
8687 return true;
8688
8689 case NEG:
8690 op0 = XEXP (x, 0);
8691
8692 if (VECTOR_MODE_P (mode))
8693 {
8694 if (speed)
8695 {
8696 /* FNEG. */
8697 *cost += extra_cost->vect.alu;
8698 }
8699 return false;
8700 }
8701
8702 if (GET_MODE_CLASS (mode) == MODE_INT)
8703 {
8704 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8705 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8706 {
8707 /* CSETM. */
8708 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8709 return true;
8710 }
8711
8712 /* Cost this as SUB wzr, X. */
8713 op0 = CONST0_RTX (mode);
8714 op1 = XEXP (x, 0);
8715 goto cost_minus;
8716 }
8717
8718 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8719 {
8720 /* Support (neg(fma...)) as a single instruction only if
8721 sign of zeros is unimportant. This matches the decision
8722 making in aarch64.md. */
8723 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8724 {
8725 /* FNMADD. */
8726 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8727 return true;
8728 }
8729 if (GET_CODE (op0) == MULT)
8730 {
8731 /* FNMUL. */
8732 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8733 return true;
8734 }
8735 if (speed)
8736 /* FNEG. */
8737 *cost += extra_cost->fp[mode == DFmode].neg;
8738 return false;
8739 }
8740
8741 return false;
8742
8743 case CLRSB:
8744 case CLZ:
8745 if (speed)
8746 {
8747 if (VECTOR_MODE_P (mode))
8748 *cost += extra_cost->vect.alu;
8749 else
8750 *cost += extra_cost->alu.clz;
8751 }
8752
8753 return false;
8754
8755 case COMPARE:
8756 op0 = XEXP (x, 0);
8757 op1 = XEXP (x, 1);
8758
8759 if (op1 == const0_rtx
8760 && GET_CODE (op0) == AND)
8761 {
8762 x = op0;
8763 mode = GET_MODE (op0);
8764 goto cost_logic;
8765 }
8766
8767 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8768 {
8769 /* TODO: A write to the CC flags possibly costs extra; this
8770 needs encoding in the cost tables. */
8771
8772 mode = GET_MODE (op0);
8773 /* ANDS. */
8774 if (GET_CODE (op0) == AND)
8775 {
8776 x = op0;
8777 goto cost_logic;
8778 }
8779
8780 if (GET_CODE (op0) == PLUS)
8781 {
8782 /* ADDS (and CMN alias). */
8783 x = op0;
8784 goto cost_plus;
8785 }
8786
8787 if (GET_CODE (op0) == MINUS)
8788 {
8789 /* SUBS. */
8790 x = op0;
8791 goto cost_minus;
8792 }
8793
8794 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8795 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8796 && CONST_INT_P (XEXP (op0, 2)))
8797 {
8798 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8799 Handle it here directly rather than going to cost_logic
8800 since we know the immediate generated for the TST is valid
8801 so we can avoid creating an intermediate rtx for it only
8802 for costing purposes. */
8803 if (speed)
8804 *cost += extra_cost->alu.logical;
8805
8806 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8807 ZERO_EXTRACT, 0, speed);
8808 return true;
8809 }
8810
8811 if (GET_CODE (op1) == NEG)
8812 {
8813 /* CMN. */
8814 if (speed)
8815 *cost += extra_cost->alu.arith;
8816
8817 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8818 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8819 return true;
8820 }
8821
8822 /* CMP.
8823
8824 Compare can freely swap the order of operands, and
8825 canonicalization puts the more complex operation first.
8826 But the integer MINUS logic expects the shift/extend
8827 operation in op1. */
8828 if (! (REG_P (op0)
8829 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8830 {
8831 op0 = XEXP (x, 1);
8832 op1 = XEXP (x, 0);
8833 }
8834 goto cost_minus;
8835 }
8836
8837 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8838 {
8839 /* FCMP. */
8840 if (speed)
8841 *cost += extra_cost->fp[mode == DFmode].compare;
8842
8843 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8844 {
8845 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8846 /* FCMP supports constant 0.0 for no extra cost. */
8847 return true;
8848 }
8849 return false;
8850 }
8851
8852 if (VECTOR_MODE_P (mode))
8853 {
8854 /* Vector compare. */
8855 if (speed)
8856 *cost += extra_cost->vect.alu;
8857
8858 if (aarch64_float_const_zero_rtx_p (op1))
8859 {
8860 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8861 cost. */
8862 return true;
8863 }
8864 return false;
8865 }
8866 return false;
8867
8868 case MINUS:
8869 {
8870 op0 = XEXP (x, 0);
8871 op1 = XEXP (x, 1);
8872
8873 cost_minus:
8874 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8875
8876 /* Detect valid immediates. */
8877 if ((GET_MODE_CLASS (mode) == MODE_INT
8878 || (GET_MODE_CLASS (mode) == MODE_CC
8879 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8880 && CONST_INT_P (op1)
8881 && aarch64_uimm12_shift (INTVAL (op1)))
8882 {
8883 if (speed)
8884 /* SUB(S) (immediate). */
8885 *cost += extra_cost->alu.arith;
8886 return true;
8887 }
8888
8889 /* Look for SUB (extended register). */
8890 if (is_a <scalar_int_mode> (mode, &int_mode)
8891 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8892 {
8893 if (speed)
8894 *cost += extra_cost->alu.extend_arith;
8895
8896 op1 = aarch64_strip_extend (op1, true);
8897 *cost += rtx_cost (op1, VOIDmode,
8898 (enum rtx_code) GET_CODE (op1), 0, speed);
8899 return true;
8900 }
8901
8902 rtx new_op1 = aarch64_strip_extend (op1, false);
8903
8904 /* Cost this as an FMA-alike operation. */
8905 if ((GET_CODE (new_op1) == MULT
8906 || aarch64_shift_p (GET_CODE (new_op1)))
8907 && code != COMPARE)
8908 {
8909 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8910 (enum rtx_code) code,
8911 speed);
8912 return true;
8913 }
8914
8915 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8916
8917 if (speed)
8918 {
8919 if (VECTOR_MODE_P (mode))
8920 {
8921 /* Vector SUB. */
8922 *cost += extra_cost->vect.alu;
8923 }
8924 else if (GET_MODE_CLASS (mode) == MODE_INT)
8925 {
8926 /* SUB(S). */
8927 *cost += extra_cost->alu.arith;
8928 }
8929 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8930 {
8931 /* FSUB. */
8932 *cost += extra_cost->fp[mode == DFmode].addsub;
8933 }
8934 }
8935 return true;
8936 }
8937
8938 case PLUS:
8939 {
8940 rtx new_op0;
8941
8942 op0 = XEXP (x, 0);
8943 op1 = XEXP (x, 1);
8944
8945 cost_plus:
8946 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8947 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8948 {
8949 /* CSINC. */
8950 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8951 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8952 return true;
8953 }
8954
8955 if (GET_MODE_CLASS (mode) == MODE_INT
8956 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8957 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8958 {
8959 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8960
8961 if (speed)
8962 /* ADD (immediate). */
8963 *cost += extra_cost->alu.arith;
8964 return true;
8965 }
8966
8967 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8968
8969 /* Look for ADD (extended register). */
8970 if (is_a <scalar_int_mode> (mode, &int_mode)
8971 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8972 {
8973 if (speed)
8974 *cost += extra_cost->alu.extend_arith;
8975
8976 op0 = aarch64_strip_extend (op0, true);
8977 *cost += rtx_cost (op0, VOIDmode,
8978 (enum rtx_code) GET_CODE (op0), 0, speed);
8979 return true;
8980 }
8981
8982 /* Strip any extend, leave shifts behind as we will
8983 cost them through mult_cost. */
8984 new_op0 = aarch64_strip_extend (op0, false);
8985
8986 if (GET_CODE (new_op0) == MULT
8987 || aarch64_shift_p (GET_CODE (new_op0)))
8988 {
8989 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8990 speed);
8991 return true;
8992 }
8993
8994 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8995
8996 if (speed)
8997 {
8998 if (VECTOR_MODE_P (mode))
8999 {
9000 /* Vector ADD. */
9001 *cost += extra_cost->vect.alu;
9002 }
9003 else if (GET_MODE_CLASS (mode) == MODE_INT)
9004 {
9005 /* ADD. */
9006 *cost += extra_cost->alu.arith;
9007 }
9008 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9009 {
9010 /* FADD. */
9011 *cost += extra_cost->fp[mode == DFmode].addsub;
9012 }
9013 }
9014 return true;
9015 }
9016
9017 case BSWAP:
9018 *cost = COSTS_N_INSNS (1);
9019
9020 if (speed)
9021 {
9022 if (VECTOR_MODE_P (mode))
9023 *cost += extra_cost->vect.alu;
9024 else
9025 *cost += extra_cost->alu.rev;
9026 }
9027 return false;
9028
9029 case IOR:
9030 if (aarch_rev16_p (x))
9031 {
9032 *cost = COSTS_N_INSNS (1);
9033
9034 if (speed)
9035 {
9036 if (VECTOR_MODE_P (mode))
9037 *cost += extra_cost->vect.alu;
9038 else
9039 *cost += extra_cost->alu.rev;
9040 }
9041 return true;
9042 }
9043
9044 if (aarch64_extr_rtx_p (x, &op0, &op1))
9045 {
9046 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9047 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9048 if (speed)
9049 *cost += extra_cost->alu.shift;
9050
9051 return true;
9052 }
9053 /* Fall through. */
9054 case XOR:
9055 case AND:
9056 cost_logic:
9057 op0 = XEXP (x, 0);
9058 op1 = XEXP (x, 1);
9059
9060 if (VECTOR_MODE_P (mode))
9061 {
9062 if (speed)
9063 *cost += extra_cost->vect.alu;
9064 return true;
9065 }
9066
9067 if (code == AND
9068 && GET_CODE (op0) == MULT
9069 && CONST_INT_P (XEXP (op0, 1))
9070 && CONST_INT_P (op1)
9071 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9072 INTVAL (op1)) != 0)
9073 {
9074 /* This is a UBFM/SBFM. */
9075 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9076 if (speed)
9077 *cost += extra_cost->alu.bfx;
9078 return true;
9079 }
9080
9081 if (is_int_mode (mode, &int_mode))
9082 {
9083 if (CONST_INT_P (op1))
9084 {
9085 /* We have a mask + shift version of a UBFIZ
9086 i.e. the *andim_ashift<mode>_bfiz pattern. */
9087 if (GET_CODE (op0) == ASHIFT
9088 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9089 XEXP (op0, 1)))
9090 {
9091 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9092 (enum rtx_code) code, 0, speed);
9093 if (speed)
9094 *cost += extra_cost->alu.bfx;
9095
9096 return true;
9097 }
9098 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9099 {
9100 /* We possibly get the immediate for free; this is not
9101 modelled. */
9102 *cost += rtx_cost (op0, int_mode,
9103 (enum rtx_code) code, 0, speed);
9104 if (speed)
9105 *cost += extra_cost->alu.logical;
9106
9107 return true;
9108 }
9109 }
9110 else
9111 {
9112 rtx new_op0 = op0;
9113
9114 /* Handle ORN, EON, or BIC. */
9115 if (GET_CODE (op0) == NOT)
9116 op0 = XEXP (op0, 0);
9117
9118 new_op0 = aarch64_strip_shift (op0);
9119
9120 /* If we had a shift on op0 then this is a logical-shift-
9121 by-register/immediate operation. Otherwise, this is just
9122 a logical operation. */
9123 if (speed)
9124 {
9125 if (new_op0 != op0)
9126 {
9127 /* Shift by immediate. */
9128 if (CONST_INT_P (XEXP (op0, 1)))
9129 *cost += extra_cost->alu.log_shift;
9130 else
9131 *cost += extra_cost->alu.log_shift_reg;
9132 }
9133 else
9134 *cost += extra_cost->alu.logical;
9135 }
9136
9137 /* In both cases we want to cost both operands. */
9138 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9139 0, speed);
9140 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9141 1, speed);
9142
9143 return true;
9144 }
9145 }
9146 return false;
9147
9148 case NOT:
9149 x = XEXP (x, 0);
9150 op0 = aarch64_strip_shift (x);
9151
9152 if (VECTOR_MODE_P (mode))
9153 {
9154 /* Vector NOT. */
9155 *cost += extra_cost->vect.alu;
9156 return false;
9157 }
9158
9159 /* MVN-shifted-reg. */
9160 if (op0 != x)
9161 {
9162 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9163
9164 if (speed)
9165 *cost += extra_cost->alu.log_shift;
9166
9167 return true;
9168 }
9169 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9170 Handle the second form here taking care that 'a' in the above can
9171 be a shift. */
9172 else if (GET_CODE (op0) == XOR)
9173 {
9174 rtx newop0 = XEXP (op0, 0);
9175 rtx newop1 = XEXP (op0, 1);
9176 rtx op0_stripped = aarch64_strip_shift (newop0);
9177
9178 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9179 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9180
9181 if (speed)
9182 {
9183 if (op0_stripped != newop0)
9184 *cost += extra_cost->alu.log_shift;
9185 else
9186 *cost += extra_cost->alu.logical;
9187 }
9188
9189 return true;
9190 }
9191 /* MVN. */
9192 if (speed)
9193 *cost += extra_cost->alu.logical;
9194
9195 return false;
9196
9197 case ZERO_EXTEND:
9198
9199 op0 = XEXP (x, 0);
9200 /* If a value is written in SI mode, then zero extended to DI
9201 mode, the operation will in general be free as a write to
9202 a 'w' register implicitly zeroes the upper bits of an 'x'
9203 register. However, if this is
9204
9205 (set (reg) (zero_extend (reg)))
9206
9207 we must cost the explicit register move. */
9208 if (mode == DImode
9209 && GET_MODE (op0) == SImode
9210 && outer == SET)
9211 {
9212 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9213
9214 /* If OP_COST is non-zero, then the cost of the zero extend
9215 is effectively the cost of the inner operation. Otherwise
9216 we have a MOV instruction and we take the cost from the MOV
9217 itself. This is true independently of whether we are
9218 optimizing for space or time. */
9219 if (op_cost)
9220 *cost = op_cost;
9221
9222 return true;
9223 }
9224 else if (MEM_P (op0))
9225 {
9226 /* All loads can zero extend to any size for free. */
9227 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9228 return true;
9229 }
9230
9231 op0 = aarch64_extend_bitfield_pattern_p (x);
9232 if (op0)
9233 {
9234 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9235 if (speed)
9236 *cost += extra_cost->alu.bfx;
9237 return true;
9238 }
9239
9240 if (speed)
9241 {
9242 if (VECTOR_MODE_P (mode))
9243 {
9244 /* UMOV. */
9245 *cost += extra_cost->vect.alu;
9246 }
9247 else
9248 {
9249 /* We generate an AND instead of UXTB/UXTH. */
9250 *cost += extra_cost->alu.logical;
9251 }
9252 }
9253 return false;
9254
9255 case SIGN_EXTEND:
9256 if (MEM_P (XEXP (x, 0)))
9257 {
9258 /* LDRSH. */
9259 if (speed)
9260 {
9261 rtx address = XEXP (XEXP (x, 0), 0);
9262 *cost += extra_cost->ldst.load_sign_extend;
9263
9264 *cost +=
9265 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9266 0, speed));
9267 }
9268 return true;
9269 }
9270
9271 op0 = aarch64_extend_bitfield_pattern_p (x);
9272 if (op0)
9273 {
9274 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9275 if (speed)
9276 *cost += extra_cost->alu.bfx;
9277 return true;
9278 }
9279
9280 if (speed)
9281 {
9282 if (VECTOR_MODE_P (mode))
9283 *cost += extra_cost->vect.alu;
9284 else
9285 *cost += extra_cost->alu.extend;
9286 }
9287 return false;
9288
9289 case ASHIFT:
9290 op0 = XEXP (x, 0);
9291 op1 = XEXP (x, 1);
9292
9293 if (CONST_INT_P (op1))
9294 {
9295 if (speed)
9296 {
9297 if (VECTOR_MODE_P (mode))
9298 {
9299 /* Vector shift (immediate). */
9300 *cost += extra_cost->vect.alu;
9301 }
9302 else
9303 {
9304 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9305 aliases. */
9306 *cost += extra_cost->alu.shift;
9307 }
9308 }
9309
9310 /* We can incorporate zero/sign extend for free. */
9311 if (GET_CODE (op0) == ZERO_EXTEND
9312 || GET_CODE (op0) == SIGN_EXTEND)
9313 op0 = XEXP (op0, 0);
9314
9315 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9316 return true;
9317 }
9318 else
9319 {
9320 if (VECTOR_MODE_P (mode))
9321 {
9322 if (speed)
9323 /* Vector shift (register). */
9324 *cost += extra_cost->vect.alu;
9325 }
9326 else
9327 {
9328 if (speed)
9329 /* LSLV. */
9330 *cost += extra_cost->alu.shift_reg;
9331
9332 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9333 && CONST_INT_P (XEXP (op1, 1))
9334 && known_eq (INTVAL (XEXP (op1, 1)),
9335 GET_MODE_BITSIZE (mode) - 1))
9336 {
9337 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9338 /* We already demanded XEXP (op1, 0) to be REG_P, so
9339 don't recurse into it. */
9340 return true;
9341 }
9342 }
9343 return false; /* All arguments need to be in registers. */
9344 }
9345
9346 case ROTATE:
9347 case ROTATERT:
9348 case LSHIFTRT:
9349 case ASHIFTRT:
9350 op0 = XEXP (x, 0);
9351 op1 = XEXP (x, 1);
9352
9353 if (CONST_INT_P (op1))
9354 {
9355 /* ASR (immediate) and friends. */
9356 if (speed)
9357 {
9358 if (VECTOR_MODE_P (mode))
9359 *cost += extra_cost->vect.alu;
9360 else
9361 *cost += extra_cost->alu.shift;
9362 }
9363
9364 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9365 return true;
9366 }
9367 else
9368 {
9369 if (VECTOR_MODE_P (mode))
9370 {
9371 if (speed)
9372 /* Vector shift (register). */
9373 *cost += extra_cost->vect.alu;
9374 }
9375 else
9376 {
9377 if (speed)
9378 /* ASR (register) and friends. */
9379 *cost += extra_cost->alu.shift_reg;
9380
9381 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9382 && CONST_INT_P (XEXP (op1, 1))
9383 && known_eq (INTVAL (XEXP (op1, 1)),
9384 GET_MODE_BITSIZE (mode) - 1))
9385 {
9386 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9387 /* We already demanded XEXP (op1, 0) to be REG_P, so
9388 don't recurse into it. */
9389 return true;
9390 }
9391 }
9392 return false; /* All arguments need to be in registers. */
9393 }
9394
9395 case SYMBOL_REF:
9396
9397 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9398 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9399 {
9400 /* LDR. */
9401 if (speed)
9402 *cost += extra_cost->ldst.load;
9403 }
9404 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9405 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9406 {
9407 /* ADRP, followed by ADD. */
9408 *cost += COSTS_N_INSNS (1);
9409 if (speed)
9410 *cost += 2 * extra_cost->alu.arith;
9411 }
9412 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9413 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9414 {
9415 /* ADR. */
9416 if (speed)
9417 *cost += extra_cost->alu.arith;
9418 }
9419
9420 if (flag_pic)
9421 {
9422 /* One extra load instruction, after accessing the GOT. */
9423 *cost += COSTS_N_INSNS (1);
9424 if (speed)
9425 *cost += extra_cost->ldst.load;
9426 }
9427 return true;
9428
9429 case HIGH:
9430 case LO_SUM:
9431 /* ADRP/ADD (immediate). */
9432 if (speed)
9433 *cost += extra_cost->alu.arith;
9434 return true;
9435
9436 case ZERO_EXTRACT:
9437 case SIGN_EXTRACT:
9438 /* UBFX/SBFX. */
9439 if (speed)
9440 {
9441 if (VECTOR_MODE_P (mode))
9442 *cost += extra_cost->vect.alu;
9443 else
9444 *cost += extra_cost->alu.bfx;
9445 }
9446
9447 /* We can trust that the immediates used will be correct (there
9448 are no by-register forms), so we need only cost op0. */
9449 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9450 return true;
9451
9452 case MULT:
9453 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9454 /* aarch64_rtx_mult_cost always handles recursion to its
9455 operands. */
9456 return true;
9457
9458 case MOD:
9459 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9460 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9461 an unconditional negate. This case should only ever be reached through
9462 the set_smod_pow2_cheap check in expmed.c. */
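/* For SImode x % 16 the expansion is roughly (illustrative):
     negs  w1, w0
     and   w0, w0, 15
     and   w1, w1, 15
     csneg w0, w0, w1, mi  */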
9463 if (CONST_INT_P (XEXP (x, 1))
9464 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9465 && (mode == SImode || mode == DImode))
9466 {
9467 /* We expand to 4 instructions. Reset the baseline. */
9468 *cost = COSTS_N_INSNS (4);
9469
9470 if (speed)
9471 *cost += 2 * extra_cost->alu.logical
9472 + 2 * extra_cost->alu.arith;
9473
9474 return true;
9475 }
9476
9477 /* Fall-through. */
9478 case UMOD:
9479 if (speed)
9480 {
9481 /* Slightly prefer UMOD over SMOD. */
9482 if (VECTOR_MODE_P (mode))
9483 *cost += extra_cost->vect.alu;
9484 else if (GET_MODE_CLASS (mode) == MODE_INT)
9485 *cost += (extra_cost->mult[mode == DImode].add
9486 + extra_cost->mult[mode == DImode].idiv
9487 + (code == MOD ? 1 : 0));
9488 }
9489 return false; /* All arguments need to be in registers. */
9490
9491 case DIV:
9492 case UDIV:
9493 case SQRT:
9494 if (speed)
9495 {
9496 if (VECTOR_MODE_P (mode))
9497 *cost += extra_cost->vect.alu;
9498 else if (GET_MODE_CLASS (mode) == MODE_INT)
9499 /* There is no integer SQRT, so only DIV and UDIV can get
9500 here. */
9501 *cost += (extra_cost->mult[mode == DImode].idiv
9502 /* Slightly prefer UDIV over SDIV. */
9503 + (code == DIV ? 1 : 0));
9504 else
9505 *cost += extra_cost->fp[mode == DFmode].div;
9506 }
9507 return false; /* All arguments need to be in registers. */
9508
9509 case IF_THEN_ELSE:
9510 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9511 XEXP (x, 2), cost, speed);
9512
9513 case EQ:
9514 case NE:
9515 case GT:
9516 case GTU:
9517 case LT:
9518 case LTU:
9519 case GE:
9520 case GEU:
9521 case LE:
9522 case LEU:
9523
9524 return false; /* All arguments must be in registers. */
9525
9526 case FMA:
9527 op0 = XEXP (x, 0);
9528 op1 = XEXP (x, 1);
9529 op2 = XEXP (x, 2);
9530
9531 if (speed)
9532 {
9533 if (VECTOR_MODE_P (mode))
9534 *cost += extra_cost->vect.alu;
9535 else
9536 *cost += extra_cost->fp[mode == DFmode].fma;
9537 }
9538
9539 /* FMSUB, FNMADD, and FNMSUB are free. */
9540 if (GET_CODE (op0) == NEG)
9541 op0 = XEXP (op0, 0);
9542
9543 if (GET_CODE (op2) == NEG)
9544 op2 = XEXP (op2, 0);
9545
9546 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9547 and the by-element operand as operand 0. */
9548 if (GET_CODE (op1) == NEG)
9549 op1 = XEXP (op1, 0);
9550
9551 /* Catch vector-by-element operations. The by-element operand can
9552 either be (vec_duplicate (vec_select (x))) or just
9553 (vec_select (x)), depending on whether we are multiplying by
9554 a vector or a scalar.
9555
9556 Canonicalization is not very good in these cases: FMA4 will put the
9557 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9558 if (GET_CODE (op0) == VEC_DUPLICATE)
9559 op0 = XEXP (op0, 0);
9560 else if (GET_CODE (op1) == VEC_DUPLICATE)
9561 op1 = XEXP (op1, 0);
9562
9563 if (GET_CODE (op0) == VEC_SELECT)
9564 op0 = XEXP (op0, 0);
9565 else if (GET_CODE (op1) == VEC_SELECT)
9566 op1 = XEXP (op1, 0);
9567
9568 /* If the remaining parameters are not registers,
9569 get the cost to put them into registers. */
9570 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9571 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9572 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9573 return true;
9574
9575 case FLOAT:
9576 case UNSIGNED_FLOAT:
9577 if (speed)
9578 *cost += extra_cost->fp[mode == DFmode].fromint;
9579 return false;
9580
9581 case FLOAT_EXTEND:
9582 if (speed)
9583 {
9584 if (VECTOR_MODE_P (mode))
9585 {
9586 /* Vector widening conversion. */
9587 *cost += extra_cost->vect.alu;
9588 }
9589 else
9590 *cost += extra_cost->fp[mode == DFmode].widen;
9591 }
9592 return false;
9593
9594 case FLOAT_TRUNCATE:
9595 if (speed)
9596 {
9597 if (VECTOR_MODE_P (mode))
9598 {
9599 /* Vector narrowing conversion. */
9600 *cost += extra_cost->vect.alu;
9601 }
9602 else
9603 *cost += extra_cost->fp[mode == DFmode].narrow;
9604 }
9605 return false;
9606
9607 case FIX:
9608 case UNSIGNED_FIX:
9609 x = XEXP (x, 0);
9610 /* Strip the rounding part. They will all be implemented
9611 by the fcvt* family of instructions anyway. */
9612 if (GET_CODE (x) == UNSPEC)
9613 {
9614 unsigned int uns_code = XINT (x, 1);
9615
9616 if (uns_code == UNSPEC_FRINTA
9617 || uns_code == UNSPEC_FRINTM
9618 || uns_code == UNSPEC_FRINTN
9619 || uns_code == UNSPEC_FRINTP
9620 || uns_code == UNSPEC_FRINTZ)
9621 x = XVECEXP (x, 0, 0);
9622 }
9623
9624 if (speed)
9625 {
9626 if (VECTOR_MODE_P (mode))
9627 *cost += extra_cost->vect.alu;
9628 else
9629 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9630 }
9631
9632 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9633 fixed-point fcvt. */
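/* For example (illustrative), (fix:SI (mult:SF x (const_double 2^16))) can
   become a single FCVTZS with 16 fractional bits rather than an FMUL
   followed by an FCVTZS.  */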
9634 if (GET_CODE (x) == MULT
9635 && ((VECTOR_MODE_P (mode)
9636 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9637 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9638 {
9639 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9640 0, speed);
9641 return true;
9642 }
9643
9644 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9645 return true;
9646
9647 case ABS:
9648 if (VECTOR_MODE_P (mode))
9649 {
9650 /* ABS (vector). */
9651 if (speed)
9652 *cost += extra_cost->vect.alu;
9653 }
9654 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9655 {
9656 op0 = XEXP (x, 0);
9657
9658 /* FABD, which is analogous to FADD. */
9659 if (GET_CODE (op0) == MINUS)
9660 {
9661 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9662 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9663 if (speed)
9664 *cost += extra_cost->fp[mode == DFmode].addsub;
9665
9666 return true;
9667 }
9668 /* Simple FABS is analogous to FNEG. */
9669 if (speed)
9670 *cost += extra_cost->fp[mode == DFmode].neg;
9671 }
9672 else
9673 {
9674 /* Integer ABS will either be split to
9675 two arithmetic instructions, or will be an ABS
9676 (scalar), which we don't model. */
9677 *cost = COSTS_N_INSNS (2);
9678 if (speed)
9679 *cost += 2 * extra_cost->alu.arith;
9680 }
9681 return false;
9682
9683 case SMAX:
9684 case SMIN:
9685 if (speed)
9686 {
9687 if (VECTOR_MODE_P (mode))
9688 *cost += extra_cost->vect.alu;
9689 else
9690 {
9691 /* FMAXNM/FMINNM/FMAX/FMIN.
9692 TODO: This may not be accurate for all implementations, but
9693 we do not model this in the cost tables. */
9694 *cost += extra_cost->fp[mode == DFmode].addsub;
9695 }
9696 }
9697 return false;
9698
9699 case UNSPEC:
9700 /* The floating point round to integer frint* instructions. */
9701 if (aarch64_frint_unspec_p (XINT (x, 1)))
9702 {
9703 if (speed)
9704 *cost += extra_cost->fp[mode == DFmode].roundint;
9705
9706 return false;
9707 }
9708
9709 if (XINT (x, 1) == UNSPEC_RBIT)
9710 {
9711 if (speed)
9712 *cost += extra_cost->alu.rev;
9713
9714 return false;
9715 }
9716 break;
9717
9718 case TRUNCATE:
9719
9720 /* Decompose <su>muldi3_highpart. */
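/* That is, an rtx of the overall (illustrative) shape
     (truncate:DI
       (lshiftrt:TI
	 (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
	 (const_int 64)))
   as spelled out piecewise in the condition below.  */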
9721 if (/* (truncate:DI */
9722 mode == DImode
9723 /* (lshiftrt:TI */
9724 && GET_MODE (XEXP (x, 0)) == TImode
9725 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9726 /* (mult:TI */
9727 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9728 /* (ANY_EXTEND:TI (reg:DI))
9729 (ANY_EXTEND:TI (reg:DI))) */
9730 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9731 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9732 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9733 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9734 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9735 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9736 /* (const_int 64) */
9737 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9738 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9739 {
9740 /* UMULH/SMULH. */
9741 if (speed)
9742 *cost += extra_cost->mult[mode == DImode].extend;
9743 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9744 mode, MULT, 0, speed);
9745 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9746 mode, MULT, 1, speed);
9747 return true;
9748 }
9749
9750 /* Fall through. */
9751 default:
9752 break;
9753 }
9754
9755 if (dump_file
9756 && flag_aarch64_verbose_cost)
9757 fprintf (dump_file,
9758 "\nFailed to cost RTX. Assuming default cost.\n");
9759
9760 return true;
9761 }
9762
9763 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
9764 calculated for X. This cost is stored in *COST. Returns true
9765 if the total cost of X was calculated. */
9766 static bool
9767 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9768 int param, int *cost, bool speed)
9769 {
9770 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9771
9772 if (dump_file
9773 && flag_aarch64_verbose_cost)
9774 {
9775 print_rtl_single (dump_file, x);
9776 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9777 speed ? "Hot" : "Cold",
9778 *cost, result ? "final" : "partial");
9779 }
9780
9781 return result;
9782 }
9783
9784 static int
9785 aarch64_register_move_cost (machine_mode mode,
9786 reg_class_t from_i, reg_class_t to_i)
9787 {
9788 enum reg_class from = (enum reg_class) from_i;
9789 enum reg_class to = (enum reg_class) to_i;
9790 const struct cpu_regmove_cost *regmove_cost
9791 = aarch64_tune_params.regmove_cost;
9792
9793 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9794 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9795 to = GENERAL_REGS;
9796
9797 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9798 from = GENERAL_REGS;
9799
9800 /* Moving between GPR and stack cost is the same as GP2GP. */
9801 if ((from == GENERAL_REGS && to == STACK_REG)
9802 || (to == GENERAL_REGS && from == STACK_REG))
9803 return regmove_cost->GP2GP;
9804
9805 /* To/From the stack register, we move via the gprs. */
9806 if (to == STACK_REG || from == STACK_REG)
9807 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9808 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9809
9810 if (known_eq (GET_MODE_SIZE (mode), 16))
9811 {
9812 /* 128-bit operations on general registers require 2 instructions. */
9813 if (from == GENERAL_REGS && to == GENERAL_REGS)
9814 return regmove_cost->GP2GP * 2;
9815 else if (from == GENERAL_REGS)
9816 return regmove_cost->GP2FP * 2;
9817 else if (to == GENERAL_REGS)
9818 return regmove_cost->FP2GP * 2;
9819
9820 /* When AdvSIMD instructions are disabled it is not possible to move
9821 a 128-bit value directly between Q registers. This is handled in
9822 secondary reload. A general register is used as a scratch to move
9823 the upper DI value and the lower DI value is moved directly,
9824 hence the cost is the sum of three moves. */
9825 if (! TARGET_SIMD)
9826 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9827
9828 return regmove_cost->FP2FP;
9829 }
9830
9831 if (from == GENERAL_REGS && to == GENERAL_REGS)
9832 return regmove_cost->GP2GP;
9833 else if (from == GENERAL_REGS)
9834 return regmove_cost->GP2FP;
9835 else if (to == GENERAL_REGS)
9836 return regmove_cost->FP2GP;
9837
9838 return regmove_cost->FP2FP;
9839 }
9840
9841 static int
9842 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9843 reg_class_t rclass ATTRIBUTE_UNUSED,
9844 bool in ATTRIBUTE_UNUSED)
9845 {
9846 return aarch64_tune_params.memmov_cost;
9847 }
9848
9849 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9850 to optimize 1.0/sqrt. */
9851
9852 static bool
9853 use_rsqrt_p (machine_mode mode)
9854 {
9855 return (!flag_trapping_math
9856 && flag_unsafe_math_optimizations
9857 && ((aarch64_tune_params.approx_modes->recip_sqrt
9858 & AARCH64_APPROX_MODE (mode))
9859 || flag_mrecip_low_precision_sqrt));
9860 }
9861
9862 /* Function to decide when to use the approximate reciprocal square root
9863 builtin. */
9864
9865 static tree
9866 aarch64_builtin_reciprocal (tree fndecl)
9867 {
9868 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9869
9870 if (!use_rsqrt_p (mode))
9871 return NULL_TREE;
9872 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9873 }
9874
9875 typedef rtx (*rsqrte_type) (rtx, rtx);
9876
9877 /* Select reciprocal square root initial estimate insn depending on machine
9878 mode. */
9879
9880 static rsqrte_type
9881 get_rsqrte_type (machine_mode mode)
9882 {
9883 switch (mode)
9884 {
9885 case E_DFmode: return gen_aarch64_rsqrtedf;
9886 case E_SFmode: return gen_aarch64_rsqrtesf;
9887 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9888 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9889 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9890 default: gcc_unreachable ();
9891 }
9892 }
9893
9894 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9895
9896 /* Select reciprocal square root series step insn depending on machine mode. */
9897
9898 static rsqrts_type
9899 get_rsqrts_type (machine_mode mode)
9900 {
9901 switch (mode)
9902 {
9903 case E_DFmode: return gen_aarch64_rsqrtsdf;
9904 case E_SFmode: return gen_aarch64_rsqrtssf;
9905 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9906 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9907 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9908 default: gcc_unreachable ();
9909 }
9910 }
9911
9912 /* Emit instruction sequence to compute either the approximate square root
9913 or its approximate reciprocal, depending on the flag RECP, and return
9914 whether the sequence was emitted or not. */
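/* A sketch of the math used below, assuming the architectural definitions of
   FRSQRTE/FRSQRTS: FRSQRTE provides an initial estimate x0 of 1/sqrt(SRC),
   and each FRSQRTS step refines it with the Newton-Raphson iteration
     x_{n+1} = x_n * (3 - SRC * x_n * x_n) / 2,
   which is what the X2 and X1 temporaries compute.  */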
9915
9916 bool
9917 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9918 {
9919 machine_mode mode = GET_MODE (dst);
9920
9921 if (GET_MODE_INNER (mode) == HFmode)
9922 {
9923 gcc_assert (!recp);
9924 return false;
9925 }
9926
9927 if (!recp)
9928 {
9929 if (!(flag_mlow_precision_sqrt
9930 || (aarch64_tune_params.approx_modes->sqrt
9931 & AARCH64_APPROX_MODE (mode))))
9932 return false;
9933
9934 if (flag_finite_math_only
9935 || flag_trapping_math
9936 || !flag_unsafe_math_optimizations
9937 || optimize_function_for_size_p (cfun))
9938 return false;
9939 }
9940 else
9941 /* Caller assumes we cannot fail. */
9942 gcc_assert (use_rsqrt_p (mode));
9943
9944 machine_mode mmsk = mode_for_int_vector (mode).require ();
9945 rtx xmsk = gen_reg_rtx (mmsk);
9946 if (!recp)
9947 /* When calculating the approximate square root, compare the
9948 argument with 0.0 and create a mask. */
9949 emit_insn (gen_rtx_SET (xmsk,
9950 gen_rtx_NEG (mmsk,
9951 gen_rtx_EQ (mmsk, src,
9952 CONST0_RTX (mode)))));
9953
9954 /* Estimate the approximate reciprocal square root. */
9955 rtx xdst = gen_reg_rtx (mode);
9956 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9957
9958 /* Iterate over the series twice for SF and thrice for DF. */
9959 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9960
9961 /* Optionally iterate over the series once less for faster performance,
9962 at the cost of some accuracy. */
9963 if ((recp && flag_mrecip_low_precision_sqrt)
9964 || (!recp && flag_mlow_precision_sqrt))
9965 iterations--;
9966
9967 /* Iterate over the series to calculate the approximate reciprocal square
9968 root. */
9969 rtx x1 = gen_reg_rtx (mode);
9970 while (iterations--)
9971 {
9972 rtx x2 = gen_reg_rtx (mode);
9973 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9974
9975 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9976
9977 if (iterations > 0)
9978 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9979 }
9980
9981 if (!recp)
9982 {
9983 /* Qualify the approximate reciprocal square root when the argument is
9984 0.0 by squashing the intermediary result to 0.0. */
9985 rtx xtmp = gen_reg_rtx (mmsk);
9986 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9987 gen_rtx_SUBREG (mmsk, xdst, 0)));
9988 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9989
9990 /* Calculate the approximate square root. */
9991 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9992 }
9993
9994 /* Finalize the approximation. */
9995 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9996
9997 return true;
9998 }
9999
10000 typedef rtx (*recpe_type) (rtx, rtx);
10001
10002 /* Select reciprocal initial estimate insn depending on machine mode. */
10003
10004 static recpe_type
10005 get_recpe_type (machine_mode mode)
10006 {
10007 switch (mode)
10008 {
10009 case E_SFmode: return (gen_aarch64_frecpesf);
10010 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
10011 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
10012 case E_DFmode: return (gen_aarch64_frecpedf);
10013 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10014 default: gcc_unreachable ();
10015 }
10016 }
10017
10018 typedef rtx (*recps_type) (rtx, rtx, rtx);
10019
10020 /* Select reciprocal series step insn depending on machine mode. */
10021
10022 static recps_type
10023 get_recps_type (machine_mode mode)
10024 {
10025 switch (mode)
10026 {
10027 case E_SFmode: return (gen_aarch64_frecpssf);
10028 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10029 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10030 case E_DFmode: return (gen_aarch64_frecpsdf);
10031 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10032 default: gcc_unreachable ();
10033 }
10034 }
10035
10036 /* Emit the instruction sequence to compute the approximation for the division
10037 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
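/* A sketch of the math used below, assuming the architectural definitions of
   FRECPE/FRECPS: FRECPE provides an initial estimate x0 of 1/DEN, and each
   FRECPS step refines it with the Newton-Raphson iteration
     x_{n+1} = x_n * (2 - DEN * x_n);
   the quotient is then formed as NUM * (1/DEN).  */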
10038
10039 bool
10040 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10041 {
10042 machine_mode mode = GET_MODE (quo);
10043
10044 if (GET_MODE_INNER (mode) == HFmode)
10045 return false;
10046
10047 bool use_approx_division_p = (flag_mlow_precision_div
10048 || (aarch64_tune_params.approx_modes->division
10049 & AARCH64_APPROX_MODE (mode)));
10050
10051 if (!flag_finite_math_only
10052 || flag_trapping_math
10053 || !flag_unsafe_math_optimizations
10054 || optimize_function_for_size_p (cfun)
10055 || !use_approx_division_p)
10056 return false;
10057
10058 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10059 return false;
10060
10061 /* Estimate the approximate reciprocal. */
10062 rtx xrcp = gen_reg_rtx (mode);
10063 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10064
10065 /* Iterate over the series twice for SF and thrice for DF. */
10066 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10067
10068 /* Optionally iterate over the series once less for faster performance,
10069 at the cost of some accuracy. */
10070 if (flag_mlow_precision_div)
10071 iterations--;
10072
10073 /* Iterate over the series to calculate the approximate reciprocal. */
10074 rtx xtmp = gen_reg_rtx (mode);
10075 while (iterations--)
10076 {
10077 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10078
10079 if (iterations > 0)
10080 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10081 }
10082
10083 if (num != CONST1_RTX (mode))
10084 {
10085 /* As the approximate reciprocal of DEN is already calculated, only
10086 calculate the approximate division when NUM is not 1.0. */
10087 rtx xnum = force_reg (mode, num);
10088 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10089 }
10090
10091 /* Finalize the approximation. */
10092 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10093 return true;
10094 }
10095
10096 /* Return the number of instructions that can be issued per cycle. */
10097 static int
10098 aarch64_sched_issue_rate (void)
10099 {
10100 return aarch64_tune_params.issue_rate;
10101 }
10102
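/* Return the DFA lookahead depth for multipass scheduling: the issue rate
   when it exceeds one and we are not scheduling for insn fusion, otherwise 0
   (no lookahead).  */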
10103 static int
10104 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10105 {
10106 int issue_rate = aarch64_sched_issue_rate ();
10107
10108 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10109 }
10110
10111
10112 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10113 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10114 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10115
10116 static int
10117 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10118 int ready_index)
10119 {
10120 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10121 }
10122
10123
10124 /* Vectorizer cost model target hooks. */
10125
10126 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10127 static int
10128 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10129 tree vectype,
10130 int misalign ATTRIBUTE_UNUSED)
10131 {
10132 unsigned elements;
10133 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10134 bool fp = false;
10135
10136 if (vectype != NULL)
10137 fp = FLOAT_TYPE_P (vectype);
10138
10139 switch (type_of_cost)
10140 {
10141 case scalar_stmt:
10142 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10143
10144 case scalar_load:
10145 return costs->scalar_load_cost;
10146
10147 case scalar_store:
10148 return costs->scalar_store_cost;
10149
10150 case vector_stmt:
10151 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10152
10153 case vector_load:
10154 return costs->vec_align_load_cost;
10155
10156 case vector_store:
10157 return costs->vec_store_cost;
10158
10159 case vec_to_scalar:
10160 return costs->vec_to_scalar_cost;
10161
10162 case scalar_to_vec:
10163 return costs->scalar_to_vec_cost;
10164
10165 case unaligned_load:
10166 case vector_gather_load:
10167 return costs->vec_unalign_load_cost;
10168
10169 case unaligned_store:
10170 case vector_scatter_store:
10171 return costs->vec_unalign_store_cost;
10172
10173 case cond_branch_taken:
10174 return costs->cond_taken_branch_cost;
10175
10176 case cond_branch_not_taken:
10177 return costs->cond_not_taken_branch_cost;
10178
10179 case vec_perm:
10180 return costs->vec_permute_cost;
10181
10182 case vec_promote_demote:
10183 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10184
10185 case vec_construct:
10186 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10187 return elements / 2 + 1;
10188
10189 default:
10190 gcc_unreachable ();
10191 }
10192 }
10193
10194 /* Implement targetm.vectorize.add_stmt_cost. */
10195 static unsigned
10196 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10197 struct _stmt_vec_info *stmt_info, int misalign,
10198 enum vect_cost_model_location where)
10199 {
10200 unsigned *cost = (unsigned *) data;
10201 unsigned retval = 0;
10202
10203 if (flag_vect_cost_model)
10204 {
10205 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10206 int stmt_cost =
10207 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10208
10209 /* Statements in an inner loop relative to the loop being
10210 vectorized are weighted more heavily. The value here is
10211 arbitrary and could potentially be improved with analysis. */
10212 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10213 count *= 50; /* FIXME */
10214
10215 retval = (unsigned) (count * stmt_cost);
10216 cost[where] += retval;
10217 }
10218
10219 return retval;
10220 }
10221
10222 static void initialize_aarch64_code_model (struct gcc_options *);
10223
10224 /* Parse the TO_PARSE string and put the architecture struct that it
10225 selects into RES and the architectural features into ISA_FLAGS.
10226 Return an aarch64_parse_opt_result describing the parse result.
10227 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10228
10229 static enum aarch64_parse_opt_result
10230 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10231 unsigned long *isa_flags)
10232 {
10233 char *ext;
10234 const struct processor *arch;
10235 char *str = (char *) alloca (strlen (to_parse) + 1);
10236 size_t len;
10237
10238 strcpy (str, to_parse);
10239
10240 ext = strchr (str, '+');
10241
10242 if (ext != NULL)
10243 len = ext - str;
10244 else
10245 len = strlen (str);
10246
10247 if (len == 0)
10248 return AARCH64_PARSE_MISSING_ARG;
10249
10250
10251 /* Loop through the list of supported ARCHes to find a match. */
10252 for (arch = all_architectures; arch->name != NULL; arch++)
10253 {
10254 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10255 {
10256 unsigned long isa_temp = arch->flags;
10257
10258 if (ext != NULL)
10259 {
10260 /* TO_PARSE string contains at least one extension. */
10261 enum aarch64_parse_opt_result ext_res
10262 = aarch64_parse_extension (ext, &isa_temp);
10263
10264 if (ext_res != AARCH64_PARSE_OK)
10265 return ext_res;
10266 }
10267 /* Extension parsing was successful. Confirm the result
10268 arch and ISA flags. */
10269 *res = arch;
10270 *isa_flags = isa_temp;
10271 return AARCH64_PARSE_OK;
10272 }
10273 }
10274
10275 /* ARCH name not found in list. */
10276 return AARCH64_PARSE_INVALID_ARG;
10277 }
10278
10279 /* Parse the TO_PARSE string and put the result tuning in RES and the
10280 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10281 describing the parse result. If there is an error parsing, RES and
10282 ISA_FLAGS are left unchanged. */
10283
10284 static enum aarch64_parse_opt_result
10285 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10286 unsigned long *isa_flags)
10287 {
10288 char *ext;
10289 const struct processor *cpu;
10290 char *str = (char *) alloca (strlen (to_parse) + 1);
10291 size_t len;
10292
10293 strcpy (str, to_parse);
10294
10295 ext = strchr (str, '+');
10296
10297 if (ext != NULL)
10298 len = ext - str;
10299 else
10300 len = strlen (str);
10301
10302 if (len == 0)
10303 return AARCH64_PARSE_MISSING_ARG;
10304
10305
10306 /* Loop through the list of supported CPUs to find a match. */
10307 for (cpu = all_cores; cpu->name != NULL; cpu++)
10308 {
10309 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10310 {
10311 unsigned long isa_temp = cpu->flags;
10312
10313
10314 if (ext != NULL)
10315 {
10316 /* TO_PARSE string contains at least one extension. */
10317 enum aarch64_parse_opt_result ext_res
10318 = aarch64_parse_extension (ext, &isa_temp);
10319
10320 if (ext_res != AARCH64_PARSE_OK)
10321 return ext_res;
10322 }
10323 /* Extension parsing was successful. Confirm the result
10324 cpu and ISA flags. */
10325 *res = cpu;
10326 *isa_flags = isa_temp;
10327 return AARCH64_PARSE_OK;
10328 }
10329 }
10330
10331 /* CPU name not found in list. */
10332 return AARCH64_PARSE_INVALID_ARG;
10333 }
10334
10335 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10336 Return an aarch64_parse_opt_result describing the parse result.
10337 If the parsing fails the RES does not change. */
10338
10339 static enum aarch64_parse_opt_result
10340 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10341 {
10342 const struct processor *cpu;
10343 char *str = (char *) alloca (strlen (to_parse) + 1);
10344
10345 strcpy (str, to_parse);
10346
10347 /* Loop through the list of supported CPUs to find a match. */
10348 for (cpu = all_cores; cpu->name != NULL; cpu++)
10349 {
10350 if (strcmp (cpu->name, str) == 0)
10351 {
10352 *res = cpu;
10353 return AARCH64_PARSE_OK;
10354 }
10355 }
10356
10357 /* CPU name not found in list. */
10358 return AARCH64_PARSE_INVALID_ARG;
10359 }
10360
10361 /* Parse TOKEN, which has length LENGTH to see if it is an option
10362 described in FLAG. If it is, return the index bit for that fusion type.
10363 If not, error (printing OPTION_NAME) and return zero. */
10364
10365 static unsigned int
10366 aarch64_parse_one_option_token (const char *token,
10367 size_t length,
10368 const struct aarch64_flag_desc *flag,
10369 const char *option_name)
10370 {
10371 for (; flag->name != NULL; flag++)
10372 {
10373 if (length == strlen (flag->name)
10374 && !strncmp (flag->name, token, length))
10375 return flag->flag;
10376 }
10377
10378 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10379 return 0;
10380 }
10381
10382 /* Parse OPTION which is a comma-separated list of flags to enable.
10383 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10384 default state we inherit from the CPU tuning structures. OPTION_NAME
10385 gives the top-level option we are parsing in the -moverride string,
10386 for use in error messages. */
10387
10388 static unsigned int
10389 aarch64_parse_boolean_options (const char *option,
10390 const struct aarch64_flag_desc *flags,
10391 unsigned int initial_state,
10392 const char *option_name)
10393 {
10394 const char separator = '.';
10395 const char* specs = option;
10396 const char* ntoken = option;
10397 unsigned int found_flags = initial_state;
10398
10399 while ((ntoken = strchr (specs, separator)))
10400 {
10401 size_t token_length = ntoken - specs;
10402 unsigned token_ops = aarch64_parse_one_option_token (specs,
10403 token_length,
10404 flags,
10405 option_name);
10406 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10407 in the token stream, reset the supported operations. So:
10408
10409 adrp+add.cmp+branch.none.adrp+add
10410
10411 would have the result of turning on only adrp+add fusion. */
10412 if (!token_ops)
10413 found_flags = 0;
10414
10415 found_flags |= token_ops;
10416 specs = ++ntoken;
10417 }
10418
10419 /* If the string ended with a separator it is ill-formed; report it. */
10420 if (!(*specs))
10421 {
10422 error ("%s string ill-formed\n", option_name);
10423 return 0;
10424 }
10425
10426 /* We still have one more token to parse. */
10427 size_t token_length = strlen (specs);
10428 unsigned token_ops = aarch64_parse_one_option_token (specs,
10429 token_length,
10430 flags,
10431 option_name);
10432 if (!token_ops)
10433 found_flags = 0;
10434
10435 found_flags |= token_ops;
10436 return found_flags;
10437 }
10438
10439 /* Support for overriding instruction fusion. */
10440
10441 static void
10442 aarch64_parse_fuse_string (const char *fuse_string,
10443 struct tune_params *tune)
10444 {
10445 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10446 aarch64_fusible_pairs,
10447 tune->fusible_ops,
10448 "fuse=");
10449 }
10450
10451 /* Support for overriding other tuning flags. */
10452
10453 static void
10454 aarch64_parse_tune_string (const char *tune_string,
10455 struct tune_params *tune)
10456 {
10457 tune->extra_tuning_flags
10458 = aarch64_parse_boolean_options (tune_string,
10459 aarch64_tuning_flags,
10460 tune->extra_tuning_flags,
10461 "tune=");
10462 }
10463
10464 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10465 we understand. If it is, extract the option string and handoff to
10466 the appropriate function. */
10467
10468 void
10469 aarch64_parse_one_override_token (const char* token,
10470 size_t length,
10471 struct tune_params *tune)
10472 {
10473 const struct aarch64_tuning_override_function *fn
10474 = aarch64_tuning_override_functions;
10475
10476 const char *option_part = strchr (token, '=');
10477 if (!option_part)
10478 {
10479 error ("tuning string missing in option (%s)", token);
10480 return;
10481 }
10482
10483 /* Get the length of the option name. */
10484 length = option_part - token;
10485 /* Skip the '=' to get to the option string. */
10486 option_part++;
10487
10488 for (; fn->name != NULL; fn++)
10489 {
10490 if (!strncmp (fn->name, token, length))
10491 {
10492 fn->parse_override (option_part, tune);
10493 return;
10494 }
10495 }
10496
10497 error ("unknown tuning option (%s)",token);
10498 return;
10499 }
10500
10501 /* Validate the requested TLS size, clamping it to what the selected code model supports. */
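/* For example (illustrative), an explicit -mtls-size=32 is clamped back to 24
   under -mcmodel=tiny below.  */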
10502
10503 static void
10504 initialize_aarch64_tls_size (struct gcc_options *opts)
10505 {
10506 if (aarch64_tls_size == 0)
10507 aarch64_tls_size = 24;
10508
10509 switch (opts->x_aarch64_cmodel_var)
10510 {
10511 case AARCH64_CMODEL_TINY:
10512 /* Both the default and the maximum TLS size allowed under tiny are 1M,
10513 which needs two instructions to address, so we clamp the size to 24 bits. */
10514 if (aarch64_tls_size > 24)
10515 aarch64_tls_size = 24;
10516 break;
10517 case AARCH64_CMODEL_SMALL:
10518 /* The maximum TLS size allowed under small is 4G. */
10519 if (aarch64_tls_size > 32)
10520 aarch64_tls_size = 32;
10521 break;
10522 case AARCH64_CMODEL_LARGE:
10523 /* The maximum TLS size allowed under large is 16E.
10524 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10525 if (aarch64_tls_size > 48)
10526 aarch64_tls_size = 48;
10527 break;
10528 default:
10529 gcc_unreachable ();
10530 }
10531
10532 return;
10533 }
10534
10535 /* Parse STRING looking for options in the format:
10536 string :: option:string
10537 option :: name=substring
10538 name :: {a-z}
10539 substring :: defined by option. */
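/* For example (illustrative), -moverride=fuse=adrp+add.cmp+branch selects the
   fuse= handler with the substring "adrp+add.cmp+branch".  */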
10540
10541 static void
10542 aarch64_parse_override_string (const char* input_string,
10543 struct tune_params* tune)
10544 {
10545 const char separator = ':';
10546 size_t string_length = strlen (input_string) + 1;
10547 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10548 char *string = string_root;
10549 strncpy (string, input_string, string_length);
10550 string[string_length - 1] = '\0';
10551
10552 char* ntoken = string;
10553
10554 while ((ntoken = strchr (string, separator)))
10555 {
10556 size_t token_length = ntoken - string;
10557 /* Null-terminate this substring so it can be handled as a string in its own right. */
10558 *ntoken = '\0';
10559 aarch64_parse_one_override_token (string, token_length, tune);
10560 string = ++ntoken;
10561 }
10562
10563 /* One last option to parse. */
10564 aarch64_parse_one_override_token (string, strlen (string), tune);
10565 free (string_root);
10566 }
10567
10568
10569 static void
10570 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10571 {
10572 /* PR 70044: We have to be careful about being called multiple times for the
10573 same function. This means all changes should be repeatable. */
10574
10575 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10576 Disable the frame pointer flag so the mid-end will not use a frame
10577 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10578 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10579 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10580 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10581 if (opts->x_flag_omit_frame_pointer == 0)
10582 opts->x_flag_omit_frame_pointer = 2;
10583
10584 /* If not optimizing for size, set the default
10585 alignment to what the target wants. */
10586 if (!opts->x_optimize_size)
10587 {
10588 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10589 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10590 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10591 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10592 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10593 opts->x_str_align_functions = aarch64_tune_params.function_align;
10594 }
10595
10596 /* We default to no pc-relative literal loads. */
10597
10598 aarch64_pcrelative_literal_loads = false;
10599
10600 /* If -mpc-relative-literal-loads is set on the command line, this
10601 implies that the user asked for PC relative literal loads. */
10602 if (opts->x_pcrelative_literal_loads == 1)
10603 aarch64_pcrelative_literal_loads = true;
10604
10605 /* In the tiny memory model it makes no sense to disallow PC relative
10606 literal pool loads. */
10607 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10608 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10609 aarch64_pcrelative_literal_loads = true;
10610
10611 /* When enabling the lower precision Newton series for the square root, also
10612 enable it for the reciprocal square root, since the latter is an
10613 intermediary step for the former. */
10614 if (flag_mlow_precision_sqrt)
10615 flag_mrecip_low_precision_sqrt = true;
10616 }
10617
10618 /* 'Unpack' the internal tuning structs and update the options
10619 in OPTS. The caller must have set up selected_tune and selected_arch
10620 as all the other target-specific codegen decisions are
10621 derived from them. */
10622
10623 void
10624 aarch64_override_options_internal (struct gcc_options *opts)
10625 {
10626 aarch64_tune_flags = selected_tune->flags;
10627 aarch64_tune = selected_tune->sched_core;
10628 /* Make a copy of the tuning parameters attached to the core, which
10629 we may later overwrite. */
10630 aarch64_tune_params = *(selected_tune->tune);
10631 aarch64_architecture_version = selected_arch->architecture_version;
10632
10633 if (opts->x_aarch64_override_tune_string)
10634 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10635 &aarch64_tune_params);
10636
10637 /* This target defaults to strict volatile bitfields. */
10638 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10639 opts->x_flag_strict_volatile_bitfields = 1;
10640
10641 initialize_aarch64_code_model (opts);
10642 initialize_aarch64_tls_size (opts);
10643
10644 int queue_depth = 0;
10645 switch (aarch64_tune_params.autoprefetcher_model)
10646 {
10647 case tune_params::AUTOPREFETCHER_OFF:
10648 queue_depth = -1;
10649 break;
10650 case tune_params::AUTOPREFETCHER_WEAK:
10651 queue_depth = 0;
10652 break;
10653 case tune_params::AUTOPREFETCHER_STRONG:
10654 queue_depth = max_insn_queue_index + 1;
10655 break;
10656 default:
10657 gcc_unreachable ();
10658 }
10659
10660 /* We don't mind passing in global_options_set here as we don't use
10661 the *options_set structs anyway. */
10662 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10663 queue_depth,
10664 opts->x_param_values,
10665 global_options_set.x_param_values);
10666
10667 /* Set up parameters to be used in prefetching algorithm. Do not
10668 override the defaults unless we are tuning for a core we have
10669 researched values for. */
10670 if (aarch64_tune_params.prefetch->num_slots > 0)
10671 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10672 aarch64_tune_params.prefetch->num_slots,
10673 opts->x_param_values,
10674 global_options_set.x_param_values);
10675 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10676 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10677 aarch64_tune_params.prefetch->l1_cache_size,
10678 opts->x_param_values,
10679 global_options_set.x_param_values);
10680 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10681 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10682 aarch64_tune_params.prefetch->l1_cache_line_size,
10683 opts->x_param_values,
10684 global_options_set.x_param_values);
10685 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10686 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10687 aarch64_tune_params.prefetch->l2_cache_size,
10688 opts->x_param_values,
10689 global_options_set.x_param_values);
10690 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10691 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10692 0,
10693 opts->x_param_values,
10694 global_options_set.x_param_values);
10695 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10696 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10697 aarch64_tune_params.prefetch->minimum_stride,
10698 opts->x_param_values,
10699 global_options_set.x_param_values);
10700
10701 /* Use the alternative scheduling-pressure algorithm by default. */
10702 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10703 opts->x_param_values,
10704 global_options_set.x_param_values);
10705
10706 /* Enable sw prefetching at specified optimization level for
10707 CPUS that have prefetch. Lower optimization level threshold by 1
10708 when profiling is enabled. */
10709 if (opts->x_flag_prefetch_loop_arrays < 0
10710 && !opts->x_optimize_size
10711 && aarch64_tune_params.prefetch->default_opt_level >= 0
10712 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10713 opts->x_flag_prefetch_loop_arrays = 1;
10714
10715 aarch64_override_options_after_change_1 (opts);
10716 }
10717
10718 /* Print a hint with a suggestion for a core or architecture name that
10719 most closely resembles what the user passed in STR. ARCH is true if
10720 the user is asking for an architecture name. ARCH is false if the user
10721 is asking for a core name. */
10722
10723 static void
10724 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10725 {
10726 auto_vec<const char *> candidates;
10727 const struct processor *entry = arch ? all_architectures : all_cores;
10728 for (; entry->name != NULL; entry++)
10729 candidates.safe_push (entry->name);
10730
10731 #ifdef HAVE_LOCAL_CPU_DETECT
10732 /* Add also "native" as possible value. */
10733 if (arch)
10734 candidates.safe_push ("native");
10735 #endif
10736
10737 char *s;
10738 const char *hint = candidates_list_and_hint (str, s, candidates);
10739 if (hint)
10740 inform (input_location, "valid arguments are: %s;"
10741 " did you mean %qs?", s, hint);
10742 else
10743 inform (input_location, "valid arguments are: %s", s);
10744
10745 XDELETEVEC (s);
10746 }
10747
10748 /* Print a hint with a suggestion for a core name that most closely resembles
10749 what the user passed in STR. */
10750
10751 inline static void
10752 aarch64_print_hint_for_core (const char *str)
10753 {
10754 aarch64_print_hint_for_core_or_arch (str, false);
10755 }
10756
10757 /* Print a hint with a suggestion for an architecture name that most closely
10758 resembles what the user passed in STR. */
10759
10760 inline static void
10761 aarch64_print_hint_for_arch (const char *str)
10762 {
10763 aarch64_print_hint_for_core_or_arch (str, true);
10764 }
10765
10766 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10767 specified in STR and throw errors if appropriate. Put the results if
10768 they are valid in RES and ISA_FLAGS. Return whether the option is
10769 valid. */
10770
10771 static bool
10772 aarch64_validate_mcpu (const char *str, const struct processor **res,
10773 unsigned long *isa_flags)
10774 {
10775 enum aarch64_parse_opt_result parse_res
10776 = aarch64_parse_cpu (str, res, isa_flags);
10777
10778 if (parse_res == AARCH64_PARSE_OK)
10779 return true;
10780
10781 switch (parse_res)
10782 {
10783 case AARCH64_PARSE_MISSING_ARG:
10784 error ("missing cpu name in %<-mcpu=%s%>", str);
10785 break;
10786 case AARCH64_PARSE_INVALID_ARG:
10787 error ("unknown value %qs for -mcpu", str);
10788 aarch64_print_hint_for_core (str);
10789 break;
10790 case AARCH64_PARSE_INVALID_FEATURE:
10791 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10792 break;
10793 default:
10794 gcc_unreachable ();
10795 }
10796
10797 return false;
10798 }
10799
10800 /* Validate a command-line -march option. Parse the arch and extensions
10801 (if any) specified in STR and throw errors if appropriate. Put the
10802 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10803 option is valid. */
10804
10805 static bool
10806 aarch64_validate_march (const char *str, const struct processor **res,
10807 unsigned long *isa_flags)
10808 {
10809 enum aarch64_parse_opt_result parse_res
10810 = aarch64_parse_arch (str, res, isa_flags);
10811
10812 if (parse_res == AARCH64_PARSE_OK)
10813 return true;
10814
10815 switch (parse_res)
10816 {
10817 case AARCH64_PARSE_MISSING_ARG:
10818 error ("missing arch name in %<-march=%s%>", str);
10819 break;
10820 case AARCH64_PARSE_INVALID_ARG:
10821 error ("unknown value %qs for -march", str);
10822 aarch64_print_hint_for_arch (str);
10823 break;
10824 case AARCH64_PARSE_INVALID_FEATURE:
10825 error ("invalid feature modifier in %<-march=%s%>", str);
10826 break;
10827 default:
10828 gcc_unreachable ();
10829 }
10830
10831 return false;
10832 }
10833
10834 /* Validate a command-line -mtune option. Parse the cpu
10835 specified in STR and throw errors if appropriate. Put the
10836 result, if it is valid, in RES. Return whether the option is
10837 valid. */
10838
10839 static bool
10840 aarch64_validate_mtune (const char *str, const struct processor **res)
10841 {
10842 enum aarch64_parse_opt_result parse_res
10843 = aarch64_parse_tune (str, res);
10844
10845 if (parse_res == AARCH64_PARSE_OK)
10846 return true;
10847
10848 switch (parse_res)
10849 {
10850 case AARCH64_PARSE_MISSING_ARG:
10851 error ("missing cpu name in %<-mtune=%s%>", str);
10852 break;
10853 case AARCH64_PARSE_INVALID_ARG:
10854 error ("unknown value %qs for -mtune", str);
10855 aarch64_print_hint_for_core (str);
10856 break;
10857 default:
10858 gcc_unreachable ();
10859 }
10860 return false;
10861 }
10862
10863 /* Return the CPU corresponding to the enum CPU.
10864 If it doesn't specify a cpu, return the default. */
10865
10866 static const struct processor *
10867 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10868 {
10869 if (cpu != aarch64_none)
10870 return &all_cores[cpu];
10871
10872 /* The & 0x3f is to extract the bottom 6 bits that encode the
10873 default cpu as selected by the --with-cpu GCC configure option
10874 in config.gcc.
10875 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10876 flags mechanism should be reworked to make it more sane. */
10877 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10878 }
10879
10880 /* Return the architecture corresponding to the enum ARCH.
10881 If it doesn't specify a valid architecture, return the default. */
10882
10883 static const struct processor *
10884 aarch64_get_arch (enum aarch64_arch arch)
10885 {
10886 if (arch != aarch64_no_arch)
10887 return &all_architectures[arch];
10888
10889 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10890
10891 return &all_architectures[cpu->arch];
10892 }
10893
10894 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10895
10896 static poly_uint16
10897 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10898 {
10899 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10900 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10901 deciding which .md file patterns to use and when deciding whether
10902 something is a legitimate address or constant. */
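/* For example, -msve-vector-bits=256 yields VG = 256 / 64 = 4 (four 64-bit
   granules), while SVE_SCALABLE and SVE_128 both produce the (2, 2)
   poly_uint16 used for vector-length-agnostic code.  */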
10903 if (value == SVE_SCALABLE || value == SVE_128)
10904 return poly_uint16 (2, 2);
10905 else
10906 return (int) value / 64;
10907 }
10908
10909 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10910 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10911 tuning structs. In particular it must set selected_tune and
10912 aarch64_isa_flags that define the available ISA features and tuning
10913 decisions. It must also set selected_arch as this will be used to
10914 output the .arch asm tags for each function. */
10915
10916 static void
10917 aarch64_override_options (void)
10918 {
10919 unsigned long cpu_isa = 0;
10920 unsigned long arch_isa = 0;
10921 aarch64_isa_flags = 0;
10922
10923 bool valid_cpu = true;
10924 bool valid_tune = true;
10925 bool valid_arch = true;
10926
10927 selected_cpu = NULL;
10928 selected_arch = NULL;
10929 selected_tune = NULL;
10930
10931 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10932 If either of -march or -mtune is given, they override their
10933 respective component of -mcpu. */
10934 if (aarch64_cpu_string)
10935 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10936 &cpu_isa);
10937
10938 if (aarch64_arch_string)
10939 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10940 &arch_isa);
10941
10942 if (aarch64_tune_string)
10943 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10944
10945 /* If the user did not specify a processor, choose the default
10946 one for them. This will be the CPU set during configuration using
10947 --with-cpu, otherwise it is "generic". */
10948 if (!selected_cpu)
10949 {
10950 if (selected_arch)
10951 {
10952 selected_cpu = &all_cores[selected_arch->ident];
10953 aarch64_isa_flags = arch_isa;
10954 explicit_arch = selected_arch->arch;
10955 }
10956 else
10957 {
10958 /* Get default configure-time CPU. */
10959 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
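/* The bits of TARGET_CPU_DEFAULT above the bottom 6 (which encode the cpu
   itself, see aarch64_get_tune_cpu) hold its default ISA flags.  */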
10960 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10961 }
10962
10963 if (selected_tune)
10964 explicit_tune_core = selected_tune->ident;
10965 }
10966 /* If both -mcpu and -march are specified check that they are architecturally
10967 compatible, warn if they're not and prefer the -march ISA flags. */
10968 else if (selected_arch)
10969 {
10970 if (selected_arch->arch != selected_cpu->arch)
10971 {
10972 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10973 all_architectures[selected_cpu->arch].name,
10974 selected_arch->name);
10975 }
10976 aarch64_isa_flags = arch_isa;
10977 explicit_arch = selected_arch->arch;
10978 explicit_tune_core = selected_tune ? selected_tune->ident
10979 : selected_cpu->ident;
10980 }
10981 else
10982 {
10983 /* -mcpu but no -march. */
10984 aarch64_isa_flags = cpu_isa;
10985 explicit_tune_core = selected_tune ? selected_tune->ident
10986 : selected_cpu->ident;
10987 gcc_assert (selected_cpu);
10988 selected_arch = &all_architectures[selected_cpu->arch];
10989 explicit_arch = selected_arch->arch;
10990 }
10991
10992 /* Set the arch as well, as we will need it when outputting
10993 the .arch directive in assembly. */
10994 if (!selected_arch)
10995 {
10996 gcc_assert (selected_cpu);
10997 selected_arch = &all_architectures[selected_cpu->arch];
10998 }
10999
11000 if (!selected_tune)
11001 selected_tune = selected_cpu;
11002
11003 #ifndef HAVE_AS_MABI_OPTION
11004 /* The compiler may have been configured with 2.23.* binutils, which does
11005 not have support for ILP32. */
11006 if (TARGET_ILP32)
11007 error ("assembler does not support -mabi=ilp32");
11008 #endif
11009
11010 /* Convert -msve-vector-bits to a VG count. */
11011 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11012
11013 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11014 sorry ("return address signing is only supported for -mabi=lp64");
11015
11016 /* Make sure we properly set up the explicit options. */
11017 if ((aarch64_cpu_string && valid_cpu)
11018 || (aarch64_tune_string && valid_tune))
11019 gcc_assert (explicit_tune_core != aarch64_none);
11020
11021 if ((aarch64_cpu_string && valid_cpu)
11022 || (aarch64_arch_string && valid_arch))
11023 gcc_assert (explicit_arch != aarch64_no_arch);
11024
11025 aarch64_override_options_internal (&global_options);
11026
11027 /* Save these options as the default ones in case we push and pop them later
11028 while processing functions with potential target attributes. */
11029 target_option_default_node = target_option_current_node
11030 = build_target_option_node (&global_options);
11031 }
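/* Worked example of the precedence above (command lines are illustrative
   only):
     -mcpu=cortex-a57                     tune and ISA both come from the
                                          CPU and the architecture it
                                          implies.
     -mcpu=cortex-a57 -march=armv8.2-a    the two conflict, so a warning is
                                          issued, the armv8.2-a ISA flags
                                          win and cortex-a57 is kept for
                                          tuning.
     -march=armv8.2-a -mtune=cortex-a57   no -mcpu: the ISA comes from
                                          -march and the tuning from
                                          -mtune.  */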
11032
11033 /* Implement targetm.override_options_after_change. */
11034
11035 static void
11036 aarch64_override_options_after_change (void)
11037 {
11038 aarch64_override_options_after_change_1 (&global_options);
11039 }
11040
11041 static struct machine_function *
11042 aarch64_init_machine_status (void)
11043 {
11044 struct machine_function *machine;
11045 machine = ggc_cleared_alloc<machine_function> ();
11046 return machine;
11047 }
11048
11049 void
11050 aarch64_init_expanders (void)
11051 {
11052 init_machine_status = aarch64_init_machine_status;
11053 }
11054
11055 /* Set aarch64_cmodel from the code model selected with -mcmodel= and the
       PIC setting, checking that the combination is supported. */
11056 static void
11057 initialize_aarch64_code_model (struct gcc_options *opts)
11058 {
11059 if (opts->x_flag_pic)
11060 {
11061 switch (opts->x_aarch64_cmodel_var)
11062 {
11063 case AARCH64_CMODEL_TINY:
11064 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11065 break;
11066 case AARCH64_CMODEL_SMALL:
11067 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11068 aarch64_cmodel = (flag_pic == 2
11069 ? AARCH64_CMODEL_SMALL_PIC
11070 : AARCH64_CMODEL_SMALL_SPIC);
11071 #else
11072 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11073 #endif
11074 break;
11075 case AARCH64_CMODEL_LARGE:
11076 sorry ("code model %qs with -f%s", "large",
11077 opts->x_flag_pic > 1 ? "PIC" : "pic");
11078 break;
11079 default:
11080 gcc_unreachable ();
11081 }
11082 }
11083 else
11084 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11085 }
11086
11087 /* Implement TARGET_OPTION_SAVE. */
11088
11089 static void
11090 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11091 {
11092 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11093 }
11094
11095 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11096 using the information saved in PTR. */
11097
11098 static void
11099 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11100 {
11101 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11102 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11103 opts->x_explicit_arch = ptr->x_explicit_arch;
11104 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11105 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11106
11107 aarch64_override_options_internal (opts);
11108 }
11109
11110 /* Implement TARGET_OPTION_PRINT. */
11111
11112 static void
11113 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11114 {
11115 const struct processor *cpu
11116 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11117 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11118 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11119 std::string extension
11120 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11121
11122 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11123 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11124 arch->name, extension.c_str ());
11125 }
11126
11127 static GTY(()) tree aarch64_previous_fndecl;
11128
11129 void
11130 aarch64_reset_previous_fndecl (void)
11131 {
11132 aarch64_previous_fndecl = NULL;
11133 }
11134
11135 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11136 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11137 make sure optab availability predicates are recomputed when necessary. */
11138
11139 void
11140 aarch64_save_restore_target_globals (tree new_tree)
11141 {
11142 if (TREE_TARGET_GLOBALS (new_tree))
11143 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11144 else if (new_tree == target_option_default_node)
11145 restore_target_globals (&default_target_globals);
11146 else
11147 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11148 }
11149
11150 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11151 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11152 of the function, if such exists. This function may be called multiple
11153 times on a single function so use aarch64_previous_fndecl to avoid
11154 setting up identical state. */
11155
11156 static void
11157 aarch64_set_current_function (tree fndecl)
11158 {
11159 if (!fndecl || fndecl == aarch64_previous_fndecl)
11160 return;
11161
11162 tree old_tree = (aarch64_previous_fndecl
11163 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11164 : NULL_TREE);
11165
11166 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11167
11168 /* If current function has no attributes but the previous one did,
11169 use the default node. */
11170 if (!new_tree && old_tree)
11171 new_tree = target_option_default_node;
11172
11173 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11174 the default have been handled by aarch64_save_restore_target_globals from
11175 aarch64_pragma_target_parse. */
11176 if (old_tree == new_tree)
11177 return;
11178
11179 aarch64_previous_fndecl = fndecl;
11180
11181 /* First set the target options. */
11182 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11183
11184 aarch64_save_restore_target_globals (new_tree);
11185 }
11186
11187 /* Enum describing the various ways we can handle attributes.
11188 In many cases we can reuse the generic option handling machinery. */
11189
11190 enum aarch64_attr_opt_type
11191 {
11192 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11193 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11194 aarch64_attr_enum, /* Attribute sets an enum variable. */
11195 aarch64_attr_custom /* Attribute requires a custom handling function. */
11196 };
11197
11198 /* All the information needed to handle a target attribute.
11199 NAME is the name of the attribute.
11200 ATTR_TYPE specifies the type of behavior of the attribute as described
11201 in the definition of enum aarch64_attr_opt_type.
11202 ALLOW_NEG is true if the attribute supports a "no-" form.
11203 HANDLER is the function that takes the attribute string as an argument.
11204 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11205 OPT_NUM is the enum specifying the option that the attribute modifies.
11206 This is needed for attributes that mirror the behavior of a command-line
11207 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11208 aarch64_attr_enum. */
11209
11210 struct aarch64_attribute_info
11211 {
11212 const char *name;
11213 enum aarch64_attr_opt_type attr_type;
11214 bool allow_neg;
11215 bool (*handler) (const char *);
11216 enum opt_code opt_num;
11217 };
11218
11219 /* Handle the ARCH_STR argument to the arch= target attribute. */
11220
11221 static bool
11222 aarch64_handle_attr_arch (const char *str)
11223 {
11224 const struct processor *tmp_arch = NULL;
11225 enum aarch64_parse_opt_result parse_res
11226 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11227
11228 if (parse_res == AARCH64_PARSE_OK)
11229 {
11230 gcc_assert (tmp_arch);
11231 selected_arch = tmp_arch;
11232 explicit_arch = selected_arch->arch;
11233 return true;
11234 }
11235
11236 switch (parse_res)
11237 {
11238 case AARCH64_PARSE_MISSING_ARG:
11239 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11240 break;
11241 case AARCH64_PARSE_INVALID_ARG:
11242 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11243 aarch64_print_hint_for_arch (str);
11244 break;
11245 case AARCH64_PARSE_INVALID_FEATURE:
11246 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11247 break;
11248 default:
11249 gcc_unreachable ();
11250 }
11251
11252 return false;
11253 }
11254
11255 /* Handle the argument CPU_STR to the cpu= target attribute. */
11256
11257 static bool
11258 aarch64_handle_attr_cpu (const char *str)
11259 {
11260 const struct processor *tmp_cpu = NULL;
11261 enum aarch64_parse_opt_result parse_res
11262 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11263
11264 if (parse_res == AARCH64_PARSE_OK)
11265 {
11266 gcc_assert (tmp_cpu);
11267 selected_tune = tmp_cpu;
11268 explicit_tune_core = selected_tune->ident;
11269
11270 selected_arch = &all_architectures[tmp_cpu->arch];
11271 explicit_arch = selected_arch->arch;
11272 return true;
11273 }
11274
11275 switch (parse_res)
11276 {
11277 case AARCH64_PARSE_MISSING_ARG:
11278 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11279 break;
11280 case AARCH64_PARSE_INVALID_ARG:
11281 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11282 aarch64_print_hint_for_core (str);
11283 break;
11284 case AARCH64_PARSE_INVALID_FEATURE:
11285 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11286 break;
11287 default:
11288 gcc_unreachable ();
11289 }
11290
11291 return false;
11292 }
11293
11294 /* Handle the argument STR to the tune= target attribute. */
11295
11296 static bool
11297 aarch64_handle_attr_tune (const char *str)
11298 {
11299 const struct processor *tmp_tune = NULL;
11300 enum aarch64_parse_opt_result parse_res
11301 = aarch64_parse_tune (str, &tmp_tune);
11302
11303 if (parse_res == AARCH64_PARSE_OK)
11304 {
11305 gcc_assert (tmp_tune);
11306 selected_tune = tmp_tune;
11307 explicit_tune_core = selected_tune->ident;
11308 return true;
11309 }
11310
11311 switch (parse_res)
11312 {
11313 case AARCH64_PARSE_INVALID_ARG:
11314 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11315 aarch64_print_hint_for_core (str);
11316 break;
11317 default:
11318 gcc_unreachable ();
11319 }
11320
11321 return false;
11322 }
11323
11324 /* Parse an architecture extensions target attribute string specified in STR.
11325 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11326 if successful. Update aarch64_isa_flags to reflect the ISA features
11327 modified. */
11328
11329 static bool
11330 aarch64_handle_attr_isa_flags (char *str)
11331 {
11332 enum aarch64_parse_opt_result parse_res;
11333 unsigned long isa_flags = aarch64_isa_flags;
11334
11335 /* We allow "+nothing" in the beginning to clear out all architectural
11336 features if the user wants to handpick specific features. */
11337 if (strncmp ("+nothing", str, 8) == 0)
11338 {
11339 isa_flags = 0;
11340 str += 8;
11341 }
11342
11343 parse_res = aarch64_parse_extension (str, &isa_flags);
11344
11345 if (parse_res == AARCH64_PARSE_OK)
11346 {
11347 aarch64_isa_flags = isa_flags;
11348 return true;
11349 }
11350
11351 switch (parse_res)
11352 {
11353 case AARCH64_PARSE_MISSING_ARG:
11354 error ("missing value in %<target()%> pragma or attribute");
11355 break;
11356
11357 case AARCH64_PARSE_INVALID_FEATURE:
11358 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11359 break;
11360
11361 default:
11362 gcc_unreachable ();
11363 }
11364
11365 return false;
11366 }
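/* For example (illustrative only), "+nothing+fp" first clears every
   architectural feature bit and then enables just the floating-point
   extension, while "+nosimd" starts from the current aarch64_isa_flags
   and removes Advanced SIMD.  */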
11367
11368 /* The target attributes that we support. On top of these we also support just
11369 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11370 handled explicitly in aarch64_process_one_target_attr. */
11371
11372 static const struct aarch64_attribute_info aarch64_attributes[] =
11373 {
11374 { "general-regs-only", aarch64_attr_mask, false, NULL,
11375 OPT_mgeneral_regs_only },
11376 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11377 OPT_mfix_cortex_a53_835769 },
11378 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11379 OPT_mfix_cortex_a53_843419 },
11380 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11381 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11382 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11383 OPT_momit_leaf_frame_pointer },
11384 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11385 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11386 OPT_march_ },
11387 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11388 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11389 OPT_mtune_ },
11390 { "sign-return-address", aarch64_attr_enum, false, NULL,
11391 OPT_msign_return_address_ },
11392 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11393 };
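/* A few example strings (illustrative only) and the table entries that
   handle them:
     "general-regs-only"            aarch64_attr_mask
     "no-omit-leaf-frame-pointer"   aarch64_attr_bool, negated form
     "cmodel=large"                 aarch64_attr_enum
     "arch=armv8-a+crc"             aarch64_attr_custom, dispatched to
                                    aarch64_handle_attr_arch  */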
11394
11395 /* Parse ARG_STR which contains the definition of one target attribute.
11396 Show appropriate errors if any or return true if the attribute is valid. */
11397
11398 static bool
11399 aarch64_process_one_target_attr (char *arg_str)
11400 {
11401 bool invert = false;
11402
11403 size_t len = strlen (arg_str);
11404
11405 if (len == 0)
11406 {
11407 error ("malformed %<target()%> pragma or attribute");
11408 return false;
11409 }
11410
11411 char *str_to_check = (char *) alloca (len + 1);
11412 strcpy (str_to_check, arg_str);
11413
11414 /* Skip leading whitespace. */
11415 while (*str_to_check == ' ' || *str_to_check == '\t')
11416 str_to_check++;
11417
11418 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11419 It is easier to detect and handle it explicitly here rather than going
11420 through the machinery for the rest of the target attributes in this
11421 function. */
11422 if (*str_to_check == '+')
11423 return aarch64_handle_attr_isa_flags (str_to_check);
11424
11425 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11426 {
11427 invert = true;
11428 str_to_check += 3;
11429 }
11430 char *arg = strchr (str_to_check, '=');
11431
11432 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11433 and point ARG to "foo". */
11434 if (arg)
11435 {
11436 *arg = '\0';
11437 arg++;
11438 }
11439 const struct aarch64_attribute_info *p_attr;
11440 bool found = false;
11441 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11442 {
11443 /* If the names don't match up, or the user has given an argument
11444 to an attribute that doesn't accept one, or didn't give an argument
11445 to an attribute that expects one, fail to match. */
11446 if (strcmp (str_to_check, p_attr->name) != 0)
11447 continue;
11448
11449 found = true;
11450 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11451 || p_attr->attr_type == aarch64_attr_enum;
11452
11453 if (attr_need_arg_p ^ (arg != NULL))
11454 {
11455 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11456 return false;
11457 }
11458
11459 /* If the name matches but the attribute does not allow "no-" versions
11460 then we can't match. */
11461 if (invert && !p_attr->allow_neg)
11462 {
11463 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11464 return false;
11465 }
11466
11467 switch (p_attr->attr_type)
11468 {
11469 /* Has a custom handler registered.
11470 For example, cpu=, arch=, tune=. */
11471 case aarch64_attr_custom:
11472 gcc_assert (p_attr->handler);
11473 if (!p_attr->handler (arg))
11474 return false;
11475 break;
11476
11477 /* Either set or unset a boolean option. */
11478 case aarch64_attr_bool:
11479 {
11480 struct cl_decoded_option decoded;
11481
11482 generate_option (p_attr->opt_num, NULL, !invert,
11483 CL_TARGET, &decoded);
11484 aarch64_handle_option (&global_options, &global_options_set,
11485 &decoded, input_location);
11486 break;
11487 }
11488 /* Set or unset a bit in the target_flags. aarch64_handle_option
11489 should know what mask to apply given the option number. */
11490 case aarch64_attr_mask:
11491 {
11492 struct cl_decoded_option decoded;
11493 /* We only need to specify the option number.
11494 aarch64_handle_option will know which mask to apply. */
11495 decoded.opt_index = p_attr->opt_num;
11496 decoded.value = !invert;
11497 aarch64_handle_option (&global_options, &global_options_set,
11498 &decoded, input_location);
11499 break;
11500 }
11501 /* Use the option setting machinery to set an option to an enum. */
11502 case aarch64_attr_enum:
11503 {
11504 gcc_assert (arg);
11505 bool valid;
11506 int value;
11507 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11508 &value, CL_TARGET);
11509 if (valid)
11510 {
11511 set_option (&global_options, NULL, p_attr->opt_num, value,
11512 NULL, DK_UNSPECIFIED, input_location,
11513 global_dc);
11514 }
11515 else
11516 {
11517 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11518 }
11519 break;
11520 }
11521 default:
11522 gcc_unreachable ();
11523 }
11524 }
11525
11526 /* If we reached here we either have found an attribute and validated
11527 it or didn't match any. If we matched an attribute but its arguments
11528 were malformed we will have returned false already. */
11529 return found;
11530 }
11531
11532 /* Count how many times the character C appears in
11533 NULL-terminated string STR. */
11534
11535 static unsigned int
11536 num_occurences_in_str (char c, char *str)
11537 {
11538 unsigned int res = 0;
11539 while (*str != '\0')
11540 {
11541 if (*str == c)
11542 res++;
11543
11544 str++;
11545 }
11546
11547 return res;
11548 }
11549
11550 /* Parse the tree in ARGS that contains the target attribute information
11551 and update the global target options space. */
11552
11553 bool
11554 aarch64_process_target_attr (tree args)
11555 {
11556 if (TREE_CODE (args) == TREE_LIST)
11557 {
11558 do
11559 {
11560 tree head = TREE_VALUE (args);
11561 if (head)
11562 {
11563 if (!aarch64_process_target_attr (head))
11564 return false;
11565 }
11566 args = TREE_CHAIN (args);
11567 } while (args);
11568
11569 return true;
11570 }
11571
11572 if (TREE_CODE (args) != STRING_CST)
11573 {
11574 error ("attribute %<target%> argument not a string");
11575 return false;
11576 }
11577
11578 size_t len = strlen (TREE_STRING_POINTER (args));
11579 char *str_to_check = (char *) alloca (len + 1);
11580 strcpy (str_to_check, TREE_STRING_POINTER (args));
11581
11582 if (len == 0)
11583 {
11584 error ("malformed %<target()%> pragma or attribute");
11585 return false;
11586 }
11587
11588 /* Used to catch empty strings between commas, i.e.
11589 attribute ((target ("attr1,,attr2"))). */
11590 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11591
11592 /* Handle multiple target attributes separated by ','. */
11593 char *token = strtok (str_to_check, ",");
11594
11595 unsigned int num_attrs = 0;
11596 while (token)
11597 {
11598 num_attrs++;
11599 if (!aarch64_process_one_target_attr (token))
11600 {
11601 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11602 return false;
11603 }
11604
11605 token = strtok (NULL, ",");
11606 }
11607
11608 if (num_attrs != num_commas + 1)
11609 {
11610 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11611 return false;
11612 }
11613
11614 return true;
11615 }
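/* Example inputs (illustrative only):
     __attribute__ ((target ("arch=armv8-a+simd,cmodel=small")))
   is split on ',' into two tokens, each handled by
   aarch64_process_one_target_attr, whereas
     __attribute__ ((target ("strict-align,,tune=cortex-a53")))
   is rejected: strtok skips the empty token, so num_attrs (2) differs
   from num_commas + 1 (3).  */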
11616
11617 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11618 process attribute ((target ("..."))). */
11619
11620 static bool
11621 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11622 {
11623 struct cl_target_option cur_target;
11624 bool ret;
11625 tree old_optimize;
11626 tree new_target, new_optimize;
11627 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11628
11629 /* If what we're processing is the current pragma string then the
11630 target option node is already stored in target_option_current_node
11631 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11632 having to re-parse the string. This is especially useful to keep
11633 arm_neon.h compile times down since that header contains a lot
11634 of intrinsics enclosed in pragmas. */
11635 if (!existing_target && args == current_target_pragma)
11636 {
11637 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11638 return true;
11639 }
11640 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11641
11642 old_optimize = build_optimization_node (&global_options);
11643 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11644
11645 /* If the function changed the optimization levels as well as setting
11646 target options, start with the optimizations specified. */
11647 if (func_optimize && func_optimize != old_optimize)
11648 cl_optimization_restore (&global_options,
11649 TREE_OPTIMIZATION (func_optimize));
11650
11651 /* Save the current target options to restore at the end. */
11652 cl_target_option_save (&cur_target, &global_options);
11653
11654 /* If fndecl already has some target attributes applied to it, unpack
11655 them so that we add this attribute on top of them, rather than
11656 overwriting them. */
11657 if (existing_target)
11658 {
11659 struct cl_target_option *existing_options
11660 = TREE_TARGET_OPTION (existing_target);
11661
11662 if (existing_options)
11663 cl_target_option_restore (&global_options, existing_options);
11664 }
11665 else
11666 cl_target_option_restore (&global_options,
11667 TREE_TARGET_OPTION (target_option_current_node));
11668
11669 ret = aarch64_process_target_attr (args);
11670
11671 /* Set up any additional state. */
11672 if (ret)
11673 {
11674 aarch64_override_options_internal (&global_options);
11675 /* Initialize SIMD builtins if we haven't already.
11676 Set current_target_pragma to NULL for the duration so that
11677 the builtin initialization code doesn't try to tag the functions
11678 being built with the attributes specified by any current pragma, thus
11679 going into an infinite recursion. */
11680 if (TARGET_SIMD)
11681 {
11682 tree saved_current_target_pragma = current_target_pragma;
11683 current_target_pragma = NULL;
11684 aarch64_init_simd_builtins ();
11685 current_target_pragma = saved_current_target_pragma;
11686 }
11687 new_target = build_target_option_node (&global_options);
11688 }
11689 else
11690 new_target = NULL;
11691
11692 new_optimize = build_optimization_node (&global_options);
11693
11694 if (fndecl && ret)
11695 {
11696 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11697
11698 if (old_optimize != new_optimize)
11699 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11700 }
11701
11702 cl_target_option_restore (&global_options, &cur_target);
11703
11704 if (old_optimize != new_optimize)
11705 cl_optimization_restore (&global_options,
11706 TREE_OPTIMIZATION (old_optimize));
11707 return ret;
11708 }
11709
11710 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11711 tri-bool options (yes, no, don't care) and the default value is
11712 DEF, determine whether to reject inlining. */
11713
11714 static bool
11715 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11716 int dont_care, int def)
11717 {
11718 /* If the callee doesn't care, always allow inlining. */
11719 if (callee == dont_care)
11720 return true;
11721
11722 /* If the caller doesn't care, always allow inlining. */
11723 if (caller == dont_care)
11724 return true;
11725
11726 /* Otherwise, allow inlining if either the callee and caller values
11727 agree, or if the callee is using the default value. */
11728 return (callee == caller || callee == def);
11729 }
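/* For instance (illustrative only), with DONT_CARE == 2 and DEF == 0:
     caller 1, callee 2 -> inline OK (callee does not care);
     caller 2, callee 1 -> inline OK (caller does not care);
     caller 0, callee 1 -> rejected (explicit, conflicting choices);
     caller 1, callee 0 -> inline OK (callee uses the default).  */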
11730
11731 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11732 to inline CALLEE into CALLER based on target-specific info.
11733 Make sure that the caller and callee have compatible architectural
11734 features. Then go through the other possible target attributes
11735 and see if they can block inlining. Try not to reject always_inline
11736 callees unless they are incompatible architecturally. */
11737
11738 static bool
11739 aarch64_can_inline_p (tree caller, tree callee)
11740 {
11741 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11742 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11743
11744 struct cl_target_option *caller_opts
11745 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11746 : target_option_default_node);
11747
11748 struct cl_target_option *callee_opts
11749 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11750 : target_option_default_node);
11751
11752 /* Callee's ISA flags should be a subset of the caller's. */
11753 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11754 != callee_opts->x_aarch64_isa_flags)
11755 return false;
11756
11757 /* Allow inlining of non-strict-aligned functions into strict-aligned
11758 ones. */
11759 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11760 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11761 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11762 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11763 return false;
11764
11765 bool always_inline = lookup_attribute ("always_inline",
11766 DECL_ATTRIBUTES (callee));
11767
11768 /* If the architectural features match up and the callee is always_inline
11769 then the other attributes don't matter. */
11770 if (always_inline)
11771 return true;
11772
11773 if (caller_opts->x_aarch64_cmodel_var
11774 != callee_opts->x_aarch64_cmodel_var)
11775 return false;
11776
11777 if (caller_opts->x_aarch64_tls_dialect
11778 != callee_opts->x_aarch64_tls_dialect)
11779 return false;
11780
11781 /* Honour explicit requests to work around errata. */
11782 if (!aarch64_tribools_ok_for_inlining_p (
11783 caller_opts->x_aarch64_fix_a53_err835769,
11784 callee_opts->x_aarch64_fix_a53_err835769,
11785 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11786 return false;
11787
11788 if (!aarch64_tribools_ok_for_inlining_p (
11789 caller_opts->x_aarch64_fix_a53_err843419,
11790 callee_opts->x_aarch64_fix_a53_err843419,
11791 2, TARGET_FIX_ERR_A53_843419))
11792 return false;
11793
11794 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11795 caller and callee and they don't match up, reject inlining. */
11796 if (!aarch64_tribools_ok_for_inlining_p (
11797 caller_opts->x_flag_omit_leaf_frame_pointer,
11798 callee_opts->x_flag_omit_leaf_frame_pointer,
11799 2, 1))
11800 return false;
11801
11802 /* If the callee has specific tuning overrides, respect them. */
11803 if (callee_opts->x_aarch64_override_tune_string != NULL
11804 && caller_opts->x_aarch64_override_tune_string == NULL)
11805 return false;
11806
11807 /* If the user specified tuning override strings for the
11808 caller and callee and they don't match up, reject inlining.
11809 We just do a string compare here, we don't analyze the meaning
11810 of the string, as it would be too costly for little gain. */
11811 if (callee_opts->x_aarch64_override_tune_string
11812 && caller_opts->x_aarch64_override_tune_string
11813 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11814 caller_opts->x_aarch64_override_tune_string) != 0))
11815 return false;
11816
11817 return true;
11818 }
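/* Sketch of the ISA subset rule above (function names are hypothetical):

     __attribute__ ((target ("+crc"))) static int sum_crc (int x);
     int caller (int x) { return sum_crc (x); }

   When the translation unit is built without CRC enabled, the callee's
   ISA flags contain a bit the caller's lack, so the subset check fails
   and the call is not inlined; enabling +crc for the caller (or the whole
   unit) makes the flags compatible again.  */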
11819
11820 /* Return true if SYMBOL_REF X binds locally. */
11821
11822 static bool
11823 aarch64_symbol_binds_local_p (const_rtx x)
11824 {
11825 return (SYMBOL_REF_DECL (x)
11826 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11827 : SYMBOL_REF_LOCAL_P (x));
11828 }
11829
11830 /* Return true if SYMBOL_REF X is thread-local. */
11831 static bool
11832 aarch64_tls_symbol_p (rtx x)
11833 {
11834 if (! TARGET_HAVE_TLS)
11835 return false;
11836
11837 if (GET_CODE (x) != SYMBOL_REF)
11838 return false;
11839
11840 return SYMBOL_REF_TLS_MODEL (x) != 0;
11841 }
11842
11843 /* Classify a TLS symbol into one of the TLS kinds. */
11844 enum aarch64_symbol_type
11845 aarch64_classify_tls_symbol (rtx x)
11846 {
11847 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11848
11849 switch (tls_kind)
11850 {
11851 case TLS_MODEL_GLOBAL_DYNAMIC:
11852 case TLS_MODEL_LOCAL_DYNAMIC:
11853 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11854
11855 case TLS_MODEL_INITIAL_EXEC:
11856 switch (aarch64_cmodel)
11857 {
11858 case AARCH64_CMODEL_TINY:
11859 case AARCH64_CMODEL_TINY_PIC:
11860 return SYMBOL_TINY_TLSIE;
11861 default:
11862 return SYMBOL_SMALL_TLSIE;
11863 }
11864
11865 case TLS_MODEL_LOCAL_EXEC:
11866 if (aarch64_tls_size == 12)
11867 return SYMBOL_TLSLE12;
11868 else if (aarch64_tls_size == 24)
11869 return SYMBOL_TLSLE24;
11870 else if (aarch64_tls_size == 32)
11871 return SYMBOL_TLSLE32;
11872 else if (aarch64_tls_size == 48)
11873 return SYMBOL_TLSLE48;
11874 else
11875 gcc_unreachable ();
11876
11877 case TLS_MODEL_EMULATED:
11878 case TLS_MODEL_NONE:
11879 return SYMBOL_FORCE_TO_MEM;
11880
11881 default:
11882 gcc_unreachable ();
11883 }
11884 }
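/* For example (illustrative only), a variable such as

     static __thread int counter;

   accessed under -ftls-model=local-exec with -mtls-size=24 is classified
   as SYMBOL_TLSLE24, while the same variable under the global-dynamic
   model is SYMBOL_SMALL_TLSDESC when TLS descriptors are in use and
   SYMBOL_SMALL_TLSGD otherwise.  */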
11885
11886 /* Return the correct method for accessing X + OFFSET, where X is either
11887 a SYMBOL_REF or LABEL_REF. */
11888
11889 enum aarch64_symbol_type
11890 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11891 {
11892 if (GET_CODE (x) == LABEL_REF)
11893 {
11894 switch (aarch64_cmodel)
11895 {
11896 case AARCH64_CMODEL_LARGE:
11897 return SYMBOL_FORCE_TO_MEM;
11898
11899 case AARCH64_CMODEL_TINY_PIC:
11900 case AARCH64_CMODEL_TINY:
11901 return SYMBOL_TINY_ABSOLUTE;
11902
11903 case AARCH64_CMODEL_SMALL_SPIC:
11904 case AARCH64_CMODEL_SMALL_PIC:
11905 case AARCH64_CMODEL_SMALL:
11906 return SYMBOL_SMALL_ABSOLUTE;
11907
11908 default:
11909 gcc_unreachable ();
11910 }
11911 }
11912
11913 if (GET_CODE (x) == SYMBOL_REF)
11914 {
11915 if (aarch64_tls_symbol_p (x))
11916 return aarch64_classify_tls_symbol (x);
11917
11918 switch (aarch64_cmodel)
11919 {
11920 case AARCH64_CMODEL_TINY:
11921 /* When we retrieve a symbol + offset address, we have to make sure
11922 the offset does not cause overflow of the final address. But we
11923 have no way of knowing the address of the symbol at compile time,
11924 so we can't accurately say whether the distance between the PC and
11925 symbol + offset is outside the addressable range of +/-1M in the
11926 TINY code model. So we rely on images not being larger than 1M,
11927 cap the offset at 1M, and load anything beyond that through an
11928 alternative mechanism. Furthermore, if the symbol is a weak
11929 reference to something that isn't known to resolve to a symbol in
11930 this module, then force it to memory. */
11931 if ((SYMBOL_REF_WEAK (x)
11932 && !aarch64_symbol_binds_local_p (x))
11933 || !IN_RANGE (offset, -1048575, 1048575))
11934 return SYMBOL_FORCE_TO_MEM;
11935 return SYMBOL_TINY_ABSOLUTE;
11936
11937 case AARCH64_CMODEL_SMALL:
11938 /* Same reasoning as the tiny code model, but the offset cap here is
11939 4G. */
11940 if ((SYMBOL_REF_WEAK (x)
11941 && !aarch64_symbol_binds_local_p (x))
11942 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11943 HOST_WIDE_INT_C (4294967264)))
11944 return SYMBOL_FORCE_TO_MEM;
11945 return SYMBOL_SMALL_ABSOLUTE;
11946
11947 case AARCH64_CMODEL_TINY_PIC:
11948 if (!aarch64_symbol_binds_local_p (x))
11949 return SYMBOL_TINY_GOT;
11950 return SYMBOL_TINY_ABSOLUTE;
11951
11952 case AARCH64_CMODEL_SMALL_SPIC:
11953 case AARCH64_CMODEL_SMALL_PIC:
11954 if (!aarch64_symbol_binds_local_p (x))
11955 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11956 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11957 return SYMBOL_SMALL_ABSOLUTE;
11958
11959 case AARCH64_CMODEL_LARGE:
11960 /* This is alright even in PIC code as the constant
11961 pool reference is always PC relative and within
11962 the same translation unit. */
11963 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11964 return SYMBOL_SMALL_ABSOLUTE;
11965 else
11966 return SYMBOL_FORCE_TO_MEM;
11967
11968 default:
11969 gcc_unreachable ();
11970 }
11971 }
11972
11973 /* By default push everything into the constant pool. */
11974 return SYMBOL_FORCE_TO_MEM;
11975 }
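/* Illustrative example of the offset capping above (the array name is
   hypothetical): in the tiny code model, given

     extern char table[];

   the address &table[2 * 1024 * 1024] uses symbol + 0x200000, which is
   outside the +/-1M cap, so it is classified as SYMBOL_FORCE_TO_MEM,
   whereas &table[1024] stays SYMBOL_TINY_ABSOLUTE.  */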
11976
11977 bool
11978 aarch64_constant_address_p (rtx x)
11979 {
11980 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11981 }
11982
11983 bool
11984 aarch64_legitimate_pic_operand_p (rtx x)
11985 {
11986 if (GET_CODE (x) == SYMBOL_REF
11987 || (GET_CODE (x) == CONST
11988 && GET_CODE (XEXP (x, 0)) == PLUS
11989 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11990 return false;
11991
11992 return true;
11993 }
11994
11995 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11996 that should be rematerialized rather than spilled. */
11997
11998 static bool
11999 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12000 {
12001 /* Support CSE and rematerialization of common constants. */
12002 if (CONST_INT_P (x)
12003 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12004 || GET_CODE (x) == CONST_VECTOR)
12005 return true;
12006
12007 /* Do not allow vector struct mode constants for Advanced SIMD.
12008 We could support 0 and -1 easily, but they need support in
12009 aarch64-simd.md. */
12010 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12011 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12012 return false;
12013
12014 /* Only accept variable-length vector constants if they can be
12015 handled directly.
12016
12017 ??? It would be possible to handle rematerialization of other
12018 constants via secondary reloads. */
12019 if (vec_flags & VEC_ANY_SVE)
12020 return aarch64_simd_valid_immediate (x, NULL);
12021
12022 if (GET_CODE (x) == HIGH)
12023 x = XEXP (x, 0);
12024
12025 /* Accept polynomial constants that can be calculated by using the
12026 destination of a move as the sole temporary. Constants that
12027 require a second temporary cannot be rematerialized (they can't be
12028 forced to memory and also aren't legitimate constants). */
12029 poly_int64 offset;
12030 if (poly_int_rtx_p (x, &offset))
12031 return aarch64_offset_temporaries (false, offset) <= 1;
12032
12033 /* If an offset is being added to something else, we need to allow the
12034 base to be moved into the destination register, meaning that there
12035 are no free temporaries for the offset. */
12036 x = strip_offset (x, &offset);
12037 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12038 return false;
12039
12040 /* Do not allow const (plus (anchor_symbol, const_int)). */
12041 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12042 return false;
12043
12044 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12045 so spilling them is better than rematerialization. */
12046 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12047 return true;
12048
12049 /* Label references are always constant. */
12050 if (GET_CODE (x) == LABEL_REF)
12051 return true;
12052
12053 return false;
12054 }
12055
12056 rtx
12057 aarch64_load_tp (rtx target)
12058 {
12059 if (!target
12060 || GET_MODE (target) != Pmode
12061 || !register_operand (target, Pmode))
12062 target = gen_reg_rtx (Pmode);
12063
12064 /* Can return in any reg. */
12065 emit_insn (gen_aarch64_load_tp_hard (target));
12066 return target;
12067 }
12068
12069 /* On AAPCS systems, this is the "struct __va_list". */
12070 static GTY(()) tree va_list_type;
12071
12072 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12073 Return the type to use as __builtin_va_list.
12074
12075 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12076
12077 struct __va_list
12078 {
12079 void *__stack;
12080 void *__gr_top;
12081 void *__vr_top;
12082 int __gr_offs;
12083 int __vr_offs;
12084 }; */
12085
12086 static tree
12087 aarch64_build_builtin_va_list (void)
12088 {
12089 tree va_list_name;
12090 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12091
12092 /* Create the type. */
12093 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12094 /* Give it the required name. */
12095 va_list_name = build_decl (BUILTINS_LOCATION,
12096 TYPE_DECL,
12097 get_identifier ("__va_list"),
12098 va_list_type);
12099 DECL_ARTIFICIAL (va_list_name) = 1;
12100 TYPE_NAME (va_list_type) = va_list_name;
12101 TYPE_STUB_DECL (va_list_type) = va_list_name;
12102
12103 /* Create the fields. */
12104 f_stack = build_decl (BUILTINS_LOCATION,
12105 FIELD_DECL, get_identifier ("__stack"),
12106 ptr_type_node);
12107 f_grtop = build_decl (BUILTINS_LOCATION,
12108 FIELD_DECL, get_identifier ("__gr_top"),
12109 ptr_type_node);
12110 f_vrtop = build_decl (BUILTINS_LOCATION,
12111 FIELD_DECL, get_identifier ("__vr_top"),
12112 ptr_type_node);
12113 f_groff = build_decl (BUILTINS_LOCATION,
12114 FIELD_DECL, get_identifier ("__gr_offs"),
12115 integer_type_node);
12116 f_vroff = build_decl (BUILTINS_LOCATION,
12117 FIELD_DECL, get_identifier ("__vr_offs"),
12118 integer_type_node);
12119
12120 /* Tell the tree-stdarg pass about our internal offset fields.
12121 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12122 purposes, to identify whether the code is updating va_list internal
12123 offset fields in an irregular way. */
12124 va_list_gpr_counter_field = f_groff;
12125 va_list_fpr_counter_field = f_vroff;
12126
12127 DECL_ARTIFICIAL (f_stack) = 1;
12128 DECL_ARTIFICIAL (f_grtop) = 1;
12129 DECL_ARTIFICIAL (f_vrtop) = 1;
12130 DECL_ARTIFICIAL (f_groff) = 1;
12131 DECL_ARTIFICIAL (f_vroff) = 1;
12132
12133 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12134 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12135 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12136 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12137 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12138
12139 TYPE_FIELDS (va_list_type) = f_stack;
12140 DECL_CHAIN (f_stack) = f_grtop;
12141 DECL_CHAIN (f_grtop) = f_vrtop;
12142 DECL_CHAIN (f_vrtop) = f_groff;
12143 DECL_CHAIN (f_groff) = f_vroff;
12144
12145 /* Compute its layout. */
12146 layout_type (va_list_type);
12147
12148 return va_list_type;
12149 }
12150
12151 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12152 static void
12153 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12154 {
12155 const CUMULATIVE_ARGS *cum;
12156 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12157 tree stack, grtop, vrtop, groff, vroff;
12158 tree t;
12159 int gr_save_area_size = cfun->va_list_gpr_size;
12160 int vr_save_area_size = cfun->va_list_fpr_size;
12161 int vr_offset;
12162
12163 cum = &crtl->args.info;
12164 if (cfun->va_list_gpr_size)
12165 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12166 cfun->va_list_gpr_size);
12167 if (cfun->va_list_fpr_size)
12168 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12169 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12170
12171 if (!TARGET_FLOAT)
12172 {
12173 gcc_assert (cum->aapcs_nvrn == 0);
12174 vr_save_area_size = 0;
12175 }
12176
12177 f_stack = TYPE_FIELDS (va_list_type_node);
12178 f_grtop = DECL_CHAIN (f_stack);
12179 f_vrtop = DECL_CHAIN (f_grtop);
12180 f_groff = DECL_CHAIN (f_vrtop);
12181 f_vroff = DECL_CHAIN (f_groff);
12182
12183 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12184 NULL_TREE);
12185 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12186 NULL_TREE);
12187 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12188 NULL_TREE);
12189 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12190 NULL_TREE);
12191 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12192 NULL_TREE);
12193
12194 /* Emit code to initialize STACK, which points to the next varargs stack
12195 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12196 by named arguments. STACK is 8-byte aligned. */
12197 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12198 if (cum->aapcs_stack_size > 0)
12199 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12200 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12201 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12202
12203 /* Emit code to initialize GRTOP, the top of the GR save area.
12204 virtual_incoming_args_rtx should have been 16 byte aligned. */
12205 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12206 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12207 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12208
12209 /* Emit code to initialize VRTOP, the top of the VR save area.
12210 This address is gr_save_area_bytes below GRTOP, rounded
12211 down to the next 16-byte boundary. */
12212 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12213 vr_offset = ROUND_UP (gr_save_area_size,
12214 STACK_BOUNDARY / BITS_PER_UNIT);
12215
12216 if (vr_offset)
12217 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12218 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12219 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12220
12221 /* Emit code to initialize GROFF, the offset from GRTOP of the
12222 next GPR argument. */
12223 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12224 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12225 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12226
12227 /* Likewise emit code to initialize VROFF, the offset from FTOP
12228 of the next VR argument. */
12229 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12230 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12231 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12232 }
12233
12234 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12235
12236 static tree
12237 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12238 gimple_seq *post_p ATTRIBUTE_UNUSED)
12239 {
12240 tree addr;
12241 bool indirect_p;
12242 bool is_ha; /* is HFA or HVA. */
12243 bool dw_align; /* double-word align. */
12244 machine_mode ag_mode = VOIDmode;
12245 int nregs;
12246 machine_mode mode;
12247
12248 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12249 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12250 HOST_WIDE_INT size, rsize, adjust, align;
12251 tree t, u, cond1, cond2;
12252
12253 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12254 if (indirect_p)
12255 type = build_pointer_type (type);
12256
12257 mode = TYPE_MODE (type);
12258
12259 f_stack = TYPE_FIELDS (va_list_type_node);
12260 f_grtop = DECL_CHAIN (f_stack);
12261 f_vrtop = DECL_CHAIN (f_grtop);
12262 f_groff = DECL_CHAIN (f_vrtop);
12263 f_vroff = DECL_CHAIN (f_groff);
12264
12265 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12266 f_stack, NULL_TREE);
12267 size = int_size_in_bytes (type);
12268 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12269
12270 dw_align = false;
12271 adjust = 0;
12272 if (aarch64_vfp_is_call_or_return_candidate (mode,
12273 type,
12274 &ag_mode,
12275 &nregs,
12276 &is_ha))
12277 {
12278 /* No frontends can create types with variable-sized modes, so we
12279 shouldn't be asked to pass or return them. */
12280 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12281
12282 /* TYPE passed in fp/simd registers. */
12283 if (!TARGET_FLOAT)
12284 aarch64_err_no_fpadvsimd (mode);
12285
12286 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12287 unshare_expr (valist), f_vrtop, NULL_TREE);
12288 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12289 unshare_expr (valist), f_vroff, NULL_TREE);
12290
12291 rsize = nregs * UNITS_PER_VREG;
12292
12293 if (is_ha)
12294 {
12295 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12296 adjust = UNITS_PER_VREG - ag_size;
12297 }
12298 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12299 && size < UNITS_PER_VREG)
12300 {
12301 adjust = UNITS_PER_VREG - size;
12302 }
12303 }
12304 else
12305 {
12306 /* TYPE passed in general registers. */
12307 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12308 unshare_expr (valist), f_grtop, NULL_TREE);
12309 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12310 unshare_expr (valist), f_groff, NULL_TREE);
12311 rsize = ROUND_UP (size, UNITS_PER_WORD);
12312 nregs = rsize / UNITS_PER_WORD;
12313
12314 if (align > 8)
12315 dw_align = true;
12316
12317 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12318 && size < UNITS_PER_WORD)
12319 {
12320 adjust = UNITS_PER_WORD - size;
12321 }
12322 }
12323
12324 /* Get a local temporary for the field value. */
12325 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12326
12327 /* Emit code to branch if off >= 0. */
12328 t = build2 (GE_EXPR, boolean_type_node, off,
12329 build_int_cst (TREE_TYPE (off), 0));
12330 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12331
12332 if (dw_align)
12333 {
12334 /* Emit: offs = (offs + 15) & -16. */
12335 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12336 build_int_cst (TREE_TYPE (off), 15));
12337 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12338 build_int_cst (TREE_TYPE (off), -16));
12339 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12340 }
12341 else
12342 roundup = NULL;
12343
12344 /* Update ap.__[g|v]r_offs */
12345 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12346 build_int_cst (TREE_TYPE (off), rsize));
12347 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12348
12349 /* String up. */
12350 if (roundup)
12351 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12352
12353 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12354 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12355 build_int_cst (TREE_TYPE (f_off), 0));
12356 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12357
12358 /* String up: make sure the assignment happens before the use. */
12359 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12360 COND_EXPR_ELSE (cond1) = t;
12361
12362 /* Prepare the trees handling the argument that is passed on the stack;
12363 the top-level node will be stored in ON_STACK. */
12364 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12365 if (align > 8)
12366 {
12367 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12368 t = fold_build_pointer_plus_hwi (arg, 15);
12369 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12370 build_int_cst (TREE_TYPE (t), -16));
12371 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12372 }
12373 else
12374 roundup = NULL;
12375 /* Advance ap.__stack */
12376 t = fold_build_pointer_plus_hwi (arg, size + 7);
12377 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12378 build_int_cst (TREE_TYPE (t), -8));
12379 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12380 /* String up roundup and advance. */
12381 if (roundup)
12382 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12383 /* String up with arg */
12384 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12385 /* Big-endianness related address adjustment. */
12386 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12387 && size < UNITS_PER_WORD)
12388 {
12389 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12390 size_int (UNITS_PER_WORD - size));
12391 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12392 }
12393
12394 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12395 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12396
12397 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12398 t = off;
12399 if (adjust)
12400 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12401 build_int_cst (TREE_TYPE (off), adjust));
12402
12403 t = fold_convert (sizetype, t);
12404 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12405
12406 if (is_ha)
12407 {
12408 /* type ha; // treat as "struct {ftype field[n];}"
12409 ... [computing offs]
12410 for (i = 0; i <nregs; ++i, offs += 16)
12411 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12412 return ha; */
12413 int i;
12414 tree tmp_ha, field_t, field_ptr_t;
12415
12416 /* Declare a local variable. */
12417 tmp_ha = create_tmp_var_raw (type, "ha");
12418 gimple_add_tmp_var (tmp_ha);
12419
12420 /* Establish the base type. */
12421 switch (ag_mode)
12422 {
12423 case E_SFmode:
12424 field_t = float_type_node;
12425 field_ptr_t = float_ptr_type_node;
12426 break;
12427 case E_DFmode:
12428 field_t = double_type_node;
12429 field_ptr_t = double_ptr_type_node;
12430 break;
12431 case E_TFmode:
12432 field_t = long_double_type_node;
12433 field_ptr_t = long_double_ptr_type_node;
12434 break;
12435 case E_HFmode:
12436 field_t = aarch64_fp16_type_node;
12437 field_ptr_t = aarch64_fp16_ptr_type_node;
12438 break;
12439 case E_V2SImode:
12440 case E_V4SImode:
12441 {
12442 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12443 field_t = build_vector_type_for_mode (innertype, ag_mode);
12444 field_ptr_t = build_pointer_type (field_t);
12445 }
12446 break;
12447 default:
12448 gcc_assert (0);
12449 }
12450
12451 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
12452 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12453 addr = t;
12454 t = fold_convert (field_ptr_t, addr);
12455 t = build2 (MODIFY_EXPR, field_t,
12456 build1 (INDIRECT_REF, field_t, tmp_ha),
12457 build1 (INDIRECT_REF, field_t, t));
12458
12459 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12460 for (i = 1; i < nregs; ++i)
12461 {
12462 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12463 u = fold_convert (field_ptr_t, addr);
12464 u = build2 (MODIFY_EXPR, field_t,
12465 build2 (MEM_REF, field_t, tmp_ha,
12466 build_int_cst (field_ptr_t,
12467 (i *
12468 int_size_in_bytes (field_t)))),
12469 build1 (INDIRECT_REF, field_t, u));
12470 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12471 }
12472
12473 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12474 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12475 }
12476
12477 COND_EXPR_ELSE (cond2) = t;
12478 addr = fold_convert (build_pointer_type (type), cond1);
12479 addr = build_va_arg_indirect_ref (addr);
12480
12481 if (indirect_p)
12482 addr = build_va_arg_indirect_ref (addr);
12483
12484 return addr;
12485 }
12486
12487 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12488
12489 static void
12490 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12491 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12492 int no_rtl)
12493 {
12494 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12495 CUMULATIVE_ARGS local_cum;
12496 int gr_saved = cfun->va_list_gpr_size;
12497 int vr_saved = cfun->va_list_fpr_size;
12498
12499 /* The caller has advanced CUM up to, but not beyond, the last named
12500 argument. Advance a local copy of CUM past the last "real" named
12501 argument, to find out how many registers are left over. */
12502 local_cum = *cum;
12503 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12504
12505 /* Find out how many registers we need to save.
12506 Honor the tree-stdarg analysis results. */
12507 if (cfun->va_list_gpr_size)
12508 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12509 cfun->va_list_gpr_size / UNITS_PER_WORD);
12510 if (cfun->va_list_fpr_size)
12511 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12512 cfun->va_list_fpr_size / UNITS_PER_VREG);
12513
12514 if (!TARGET_FLOAT)
12515 {
12516 gcc_assert (local_cum.aapcs_nvrn == 0);
12517 vr_saved = 0;
12518 }
12519
12520 if (!no_rtl)
12521 {
12522 if (gr_saved > 0)
12523 {
12524 rtx ptr, mem;
12525
12526 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12527 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12528 - gr_saved * UNITS_PER_WORD);
12529 mem = gen_frame_mem (BLKmode, ptr);
12530 set_mem_alias_set (mem, get_varargs_alias_set ());
12531
12532 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12533 mem, gr_saved);
12534 }
12535 if (vr_saved > 0)
12536 {
12537 /* We can't use move_block_from_reg, because it will use
12538 the wrong mode, storing D regs only. */
12539 machine_mode mode = TImode;
12540 int off, i, vr_start;
12541
12542 /* Set OFF to the offset from virtual_incoming_args_rtx of
12543 the first vector register. The VR save area lies below
12544 the GR one, and is aligned to 16 bytes. */
12545 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12546 STACK_BOUNDARY / BITS_PER_UNIT);
12547 off -= vr_saved * UNITS_PER_VREG;
12548
12549 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12550 for (i = 0; i < vr_saved; ++i)
12551 {
12552 rtx ptr, mem;
12553
12554 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12555 mem = gen_frame_mem (mode, ptr);
12556 set_mem_alias_set (mem, get_varargs_alias_set ());
12557 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12558 off += UNITS_PER_VREG;
12559 }
12560 }
12561 }
12562
12563 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12564 any complication of having crtl->args.pretend_args_size changed. */
12565 cfun->machine->frame.saved_varargs_size
12566 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12567 STACK_BOUNDARY / BITS_PER_UNIT)
12568 + vr_saved * UNITS_PER_VREG);
12569 }
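/* Worked example (illustrative only): for

     void f (int a, ...);

   one general register is consumed by the named argument, so with the
   default va_list sizes gr_saved is 8 - 1 = 7 and vr_saved is 8, giving
   saved_varargs_size = ROUND_UP (7 * 8, 16) + 8 * 16 = 64 + 128 = 192
   bytes below virtual_incoming_args_rtx.  */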
12570
12571 static void
12572 aarch64_conditional_register_usage (void)
12573 {
12574 int i;
12575 if (!TARGET_FLOAT)
12576 {
12577 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12578 {
12579 fixed_regs[i] = 1;
12580 call_used_regs[i] = 1;
12581 }
12582 }
12583 if (!TARGET_SVE)
12584 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12585 {
12586 fixed_regs[i] = 1;
12587 call_used_regs[i] = 1;
12588 }
12589 }
12590
12591 /* Walk down the type tree of TYPE counting consecutive base elements.
12592 If *MODEP is VOIDmode, then set it to the first valid floating point
12593 type. If a non-floating point type is found, or if a floating point
12594 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12595 otherwise return the count in the sub-tree. */
12596 static int
12597 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12598 {
12599 machine_mode mode;
12600 HOST_WIDE_INT size;
12601
12602 switch (TREE_CODE (type))
12603 {
12604 case REAL_TYPE:
12605 mode = TYPE_MODE (type);
12606 if (mode != DFmode && mode != SFmode
12607 && mode != TFmode && mode != HFmode)
12608 return -1;
12609
12610 if (*modep == VOIDmode)
12611 *modep = mode;
12612
12613 if (*modep == mode)
12614 return 1;
12615
12616 break;
12617
12618 case COMPLEX_TYPE:
12619 mode = TYPE_MODE (TREE_TYPE (type));
12620 if (mode != DFmode && mode != SFmode
12621 && mode != TFmode && mode != HFmode)
12622 return -1;
12623
12624 if (*modep == VOIDmode)
12625 *modep = mode;
12626
12627 if (*modep == mode)
12628 return 2;
12629
12630 break;
12631
12632 case VECTOR_TYPE:
12633 /* Use V2SImode and V4SImode as representatives of all 64-bit
12634 and 128-bit vector types. */
12635 size = int_size_in_bytes (type);
12636 switch (size)
12637 {
12638 case 8:
12639 mode = V2SImode;
12640 break;
12641 case 16:
12642 mode = V4SImode;
12643 break;
12644 default:
12645 return -1;
12646 }
12647
12648 if (*modep == VOIDmode)
12649 *modep = mode;
12650
12651 /* Vector modes are considered to be opaque: two vectors are
12652 equivalent for the purposes of being homogeneous aggregates
12653 if they are the same size. */
12654 if (*modep == mode)
12655 return 1;
12656
12657 break;
12658
12659 case ARRAY_TYPE:
12660 {
12661 int count;
12662 tree index = TYPE_DOMAIN (type);
12663
12664 /* Can't handle incomplete types nor sizes that are not
12665 fixed. */
12666 if (!COMPLETE_TYPE_P (type)
12667 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12668 return -1;
12669
12670 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12671 if (count == -1
12672 || !index
12673 || !TYPE_MAX_VALUE (index)
12674 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12675 || !TYPE_MIN_VALUE (index)
12676 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12677 || count < 0)
12678 return -1;
12679
12680 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12681 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12682
12683 /* There must be no padding. */
12684 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12685 count * GET_MODE_BITSIZE (*modep)))
12686 return -1;
12687
12688 return count;
12689 }
12690
12691 case RECORD_TYPE:
12692 {
12693 int count = 0;
12694 int sub_count;
12695 tree field;
12696
12697 /* Can't handle incomplete types nor sizes that are not
12698 fixed. */
12699 if (!COMPLETE_TYPE_P (type)
12700 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12701 return -1;
12702
12703 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12704 {
12705 if (TREE_CODE (field) != FIELD_DECL)
12706 continue;
12707
12708 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12709 if (sub_count < 0)
12710 return -1;
12711 count += sub_count;
12712 }
12713
12714 /* There must be no padding. */
12715 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12716 count * GET_MODE_BITSIZE (*modep)))
12717 return -1;
12718
12719 return count;
12720 }
12721
12722 case UNION_TYPE:
12723 case QUAL_UNION_TYPE:
12724 {
12725 /* These aren't very interesting except in a degenerate case. */
12726 int count = 0;
12727 int sub_count;
12728 tree field;
12729
12730 /* Can't handle incomplete types nor sizes that are not
12731 fixed. */
12732 if (!COMPLETE_TYPE_P (type)
12733 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12734 return -1;
12735
12736 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12737 {
12738 if (TREE_CODE (field) != FIELD_DECL)
12739 continue;
12740
12741 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12742 if (sub_count < 0)
12743 return -1;
12744 count = count > sub_count ? count : sub_count;
12745 }
12746
12747 /* There must be no padding. */
12748 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12749 count * GET_MODE_BITSIZE (*modep)))
12750 return -1;
12751
12752 return count;
12753 }
12754
12755 default:
12756 break;
12757 }
12758
12759 return -1;
12760 }
12761
12762 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12763 type as described in AAPCS64 \S 4.1.2.
12764
12765 See the comment above aarch64_composite_type_p for the notes on MODE. */
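/* For example (illustrative, not exhaustive): the 8-byte and 16-byte
   Advanced SIMD types such as int32x2_t and float32x4_t are short vectors,
   whereas wider generic GCC vectors and SVE scalable vectors are not,
   since their size is not known to be exactly 8 or 16 bytes.  */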
12766
12767 static bool
12768 aarch64_short_vector_p (const_tree type,
12769 machine_mode mode)
12770 {
12771 poly_int64 size = -1;
12772
12773 if (type && TREE_CODE (type) == VECTOR_TYPE)
12774 size = int_size_in_bytes (type);
12775 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12776 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12777 size = GET_MODE_SIZE (mode);
12778
12779 return known_eq (size, 8) || known_eq (size, 16);
12780 }
12781
12782 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12783 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12784 array types. The C99 floating-point complex types are also considered
12785 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12786 types, which are GCC extensions and out of the scope of AAPCS64, are
12787 treated as composite types here as well.
12788
12789 Note that MODE itself is not sufficient in determining whether a type
12790 is such a composite type or not. This is because
12791 stor-layout.c:compute_record_mode may have already changed the MODE
12792 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12793 structure with only one field may have its MODE set to the mode of the
12794 field. Also an integer mode whose size matches the size of the
12795 RECORD_TYPE type may be used to substitute the original mode
12796 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12797 solely relied on. */
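/* Illustrative examples only: a struct with a single int member may be
   given SImode by stor-layout, yet it is still composite; likewise a
   _Complex double (MODE_COMPLEX_FLOAT) and a _Complex int are composite,
   while a bare double or an int32x4_t short vector is not.  */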
12798
12799 static bool
12800 aarch64_composite_type_p (const_tree type,
12801 machine_mode mode)
12802 {
12803 if (aarch64_short_vector_p (type, mode))
12804 return false;
12805
12806 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12807 return true;
12808
12809 if (mode == BLKmode
12810 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12811 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12812 return true;
12813
12814 return false;
12815 }
12816
12817 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12818 shall be passed or returned in simd/fp register(s) (providing these
12819 parameter passing registers are available).
12820
12821 Upon successful return, *COUNT returns the number of needed registers,
12822 *BASE_MODE returns the mode of the individual register and when IS_HA
12823 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12824 floating-point aggregate or a homogeneous short-vector aggregate. */
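/* A few illustrative outcomes (assuming the usual type layout and
   HA_MAX_NUM_FLDS == 4):

     double                    -> true, *count 1, *base_mode DFmode
     _Complex float            -> true, *count 2, *base_mode SFmode, HA
     struct { double d[3]; }   -> true, *count 3, *base_mode DFmode, HA
     struct { double d[5]; }   -> false (too many members for an HFA)  */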
12825
12826 static bool
12827 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12828 const_tree type,
12829 machine_mode *base_mode,
12830 int *count,
12831 bool *is_ha)
12832 {
12833 machine_mode new_mode = VOIDmode;
12834 bool composite_p = aarch64_composite_type_p (type, mode);
12835
12836 if (is_ha != NULL) *is_ha = false;
12837
12838 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12839 || aarch64_short_vector_p (type, mode))
12840 {
12841 *count = 1;
12842 new_mode = mode;
12843 }
12844 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12845 {
12846 if (is_ha != NULL) *is_ha = true;
12847 *count = 2;
12848 new_mode = GET_MODE_INNER (mode);
12849 }
12850 else if (type && composite_p)
12851 {
12852 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12853
12854 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12855 {
12856 if (is_ha != NULL) *is_ha = true;
12857 *count = ag_count;
12858 }
12859 else
12860 return false;
12861 }
12862 else
12863 return false;
12864
12865 *base_mode = new_mode;
12866 return true;
12867 }
12868
12869 /* Implement TARGET_STRUCT_VALUE_RTX. */
12870
12871 static rtx
12872 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12873 int incoming ATTRIBUTE_UNUSED)
12874 {
12875 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12876 }
12877
12878 /* Implements target hook vector_mode_supported_p. */
12879 static bool
12880 aarch64_vector_mode_supported_p (machine_mode mode)
12881 {
12882 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12883 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12884 }
12885
12886 /* Return appropriate SIMD container
12887 for MODE within a vector of WIDTH bits. */
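/* For example, following the switches below: (SFmode, 128) -> V4SFmode,
   (SFmode, 64) -> V2SFmode, and when SVE is enabled
   (SFmode, BITS_PER_SVE_VECTOR) -> VNx4SFmode.  Unsupported combinations
   fall back to word_mode.  */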
12888 static machine_mode
12889 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12890 {
12891 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12892 switch (mode)
12893 {
12894 case E_DFmode:
12895 return VNx2DFmode;
12896 case E_SFmode:
12897 return VNx4SFmode;
12898 case E_HFmode:
12899 return VNx8HFmode;
12900 case E_DImode:
12901 return VNx2DImode;
12902 case E_SImode:
12903 return VNx4SImode;
12904 case E_HImode:
12905 return VNx8HImode;
12906 case E_QImode:
12907 return VNx16QImode;
12908 default:
12909 return word_mode;
12910 }
12911
12912 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12913 if (TARGET_SIMD)
12914 {
12915 if (known_eq (width, 128))
12916 switch (mode)
12917 {
12918 case E_DFmode:
12919 return V2DFmode;
12920 case E_SFmode:
12921 return V4SFmode;
12922 case E_HFmode:
12923 return V8HFmode;
12924 case E_SImode:
12925 return V4SImode;
12926 case E_HImode:
12927 return V8HImode;
12928 case E_QImode:
12929 return V16QImode;
12930 case E_DImode:
12931 return V2DImode;
12932 default:
12933 break;
12934 }
12935 else
12936 switch (mode)
12937 {
12938 case E_SFmode:
12939 return V2SFmode;
12940 case E_HFmode:
12941 return V4HFmode;
12942 case E_SImode:
12943 return V2SImode;
12944 case E_HImode:
12945 return V4HImode;
12946 case E_QImode:
12947 return V8QImode;
12948 default:
12949 break;
12950 }
12951 }
12952 return word_mode;
12953 }
12954
12955 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12956 static machine_mode
12957 aarch64_preferred_simd_mode (scalar_mode mode)
12958 {
12959 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12960 return aarch64_simd_container_mode (mode, bits);
12961 }
12962
12963 /* Return a list of possible vector sizes for the vectorizer
12964 to iterate over. */
12965 static void
12966 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12967 {
12968 if (TARGET_SVE)
12969 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12970 sizes->safe_push (16);
12971 sizes->safe_push (8);
12972 }
12973
12974 /* Implement TARGET_MANGLE_TYPE. */
12975
12976 static const char *
12977 aarch64_mangle_type (const_tree type)
12978 {
12979 /* The AArch64 ABI documents say that "__va_list" has to be
12980 mangled as if it is in the "std" namespace. */
12981 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12982 return "St9__va_list";
12983
12984 /* Half-precision float. */
12985 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12986 return "Dh";
12987
12988 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12989 builtin types. */
12990 if (TYPE_NAME (type) != NULL)
12991 return aarch64_mangle_builtin_type (type);
12992
12993 /* Use the default mangling. */
12994 return NULL;
12995 }
12996
12997 /* Find the first rtx_insn before insn that will generate an assembly
12998 instruction. */
12999
13000 static rtx_insn *
13001 aarch64_prev_real_insn (rtx_insn *insn)
13002 {
13003 if (!insn)
13004 return NULL;
13005
13006 do
13007 {
13008 insn = prev_real_insn (insn);
13009 }
13010 while (insn && recog_memoized (insn) < 0);
13011
13012 return insn;
13013 }
13014
13015 static bool
13016 is_madd_op (enum attr_type t1)
13017 {
13018 unsigned int i;
13019 /* A number of these may be AArch32 only. */
13020 enum attr_type mlatypes[] = {
13021 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13022 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13023 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13024 };
13025
13026 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13027 {
13028 if (t1 == mlatypes[i])
13029 return true;
13030 }
13031
13032 return false;
13033 }
13034
13035 /* Check if there is a register dependency between a load and the insn
13036 for which we hold recog_data. */
13037
13038 static bool
13039 dep_between_memop_and_curr (rtx memop)
13040 {
13041 rtx load_reg;
13042 int opno;
13043
13044 gcc_assert (GET_CODE (memop) == SET);
13045
13046 if (!REG_P (SET_DEST (memop)))
13047 return false;
13048
13049 load_reg = SET_DEST (memop);
13050 for (opno = 1; opno < recog_data.n_operands; opno++)
13051 {
13052 rtx operand = recog_data.operand[opno];
13053 if (REG_P (operand)
13054 && reg_overlap_mentioned_p (load_reg, operand))
13055 return true;
13056
13057 }
13058 return false;
13059 }
13060
13061
13062 /* When working around the Cortex-A53 erratum 835769,
13063 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13064 instruction and has a preceding memory instruction such that a NOP
13065 should be inserted between them. */
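/* Roughly speaking, for a sequence that would otherwise be emitted as

       ldr  x1, [x2]
       madd x0, x3, x4, x5

   the workaround makes aarch64_final_prescan_insn (below) emit a NOP
   between the memory operation and the multiply-accumulate.  */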
13066
13067 bool
13068 aarch64_madd_needs_nop (rtx_insn* insn)
13069 {
13070 enum attr_type attr_type;
13071 rtx_insn *prev;
13072 rtx body;
13073
13074 if (!TARGET_FIX_ERR_A53_835769)
13075 return false;
13076
13077 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13078 return false;
13079
13080 attr_type = get_attr_type (insn);
13081 if (!is_madd_op (attr_type))
13082 return false;
13083
13084 prev = aarch64_prev_real_insn (insn);
13085 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13086 Restore recog state to INSN to avoid state corruption. */
13087 extract_constrain_insn_cached (insn);
13088
13089 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13090 return false;
13091
13092 body = single_set (prev);
13093
13094 /* If the previous insn is a memory op and there is no dependency between
13095 it and the DImode madd, emit a NOP between them. If body is NULL then we
13096 have a complex memory operation, probably a load/store pair.
13097 Be conservative for now and emit a NOP. */
13098 if (GET_MODE (recog_data.operand[0]) == DImode
13099 && (!body || !dep_between_memop_and_curr (body)))
13100 return true;
13101
13102 return false;
13103
13104 }
13105
13106
13107 /* Implement FINAL_PRESCAN_INSN. */
13108
13109 void
13110 aarch64_final_prescan_insn (rtx_insn *insn)
13111 {
13112 if (aarch64_madd_needs_nop (insn))
13113 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13114 }
13115
13116
13117 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13118 instruction. */
13119
13120 bool
13121 aarch64_sve_index_immediate_p (rtx base_or_step)
13122 {
13123 return (CONST_INT_P (base_or_step)
13124 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13125 }
13126
13127 /* Return true if X is a valid immediate for the SVE ADD and SUB
13128 instructions. Negate X first if NEGATE_P is true. */
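/* Put differently (for illustration): after the optional negation and
   masking to the element width, the value must be either in the range
   [0, 255] or a multiple of 256 in the range [0, 65280], matching the
   "#imm8" and "#imm8, LSL #8" forms of ADD and SUB.  */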
13129
13130 bool
13131 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13132 {
13133 rtx elt;
13134
13135 if (!const_vec_duplicate_p (x, &elt)
13136 || !CONST_INT_P (elt))
13137 return false;
13138
13139 HOST_WIDE_INT val = INTVAL (elt);
13140 if (negate_p)
13141 val = -val;
13142 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13143
13144 if (val & 0xff)
13145 return IN_RANGE (val, 0, 0xff);
13146 return IN_RANGE (val, 0, 0xff00);
13147 }
13148
13149 /* Return true if X is a valid immediate operand for an SVE logical
13150 instruction such as AND. */
13151
13152 bool
13153 aarch64_sve_bitmask_immediate_p (rtx x)
13154 {
13155 rtx elt;
13156
13157 return (const_vec_duplicate_p (x, &elt)
13158 && CONST_INT_P (elt)
13159 && aarch64_bitmask_imm (INTVAL (elt),
13160 GET_MODE_INNER (GET_MODE (x))));
13161 }
13162
13163 /* Return true if X is a valid immediate for the SVE DUP and CPY
13164 instructions. */
13165
13166 bool
13167 aarch64_sve_dup_immediate_p (rtx x)
13168 {
13169 rtx elt;
13170
13171 if (!const_vec_duplicate_p (x, &elt)
13172 || !CONST_INT_P (elt))
13173 return false;
13174
13175 HOST_WIDE_INT val = INTVAL (elt);
13176 if (val & 0xff)
13177 return IN_RANGE (val, -0x80, 0x7f);
13178 return IN_RANGE (val, -0x8000, 0x7f00);
13179 }
13180
13181 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13182 SIGNED_P says whether the operand is signed rather than unsigned. */
13183
13184 bool
13185 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13186 {
13187 rtx elt;
13188
13189 return (const_vec_duplicate_p (x, &elt)
13190 && CONST_INT_P (elt)
13191 && (signed_p
13192 ? IN_RANGE (INTVAL (elt), -16, 15)
13193 : IN_RANGE (INTVAL (elt), 0, 127)));
13194 }
13195
13196 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13197 instruction. Negate X first if NEGATE_P is true. */
13198
13199 bool
13200 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13201 {
13202 rtx elt;
13203 REAL_VALUE_TYPE r;
13204
13205 if (!const_vec_duplicate_p (x, &elt)
13206 || GET_CODE (elt) != CONST_DOUBLE)
13207 return false;
13208
13209 r = *CONST_DOUBLE_REAL_VALUE (elt);
13210
13211 if (negate_p)
13212 r = real_value_negate (&r);
13213
13214 if (real_equal (&r, &dconst1))
13215 return true;
13216 if (real_equal (&r, &dconsthalf))
13217 return true;
13218 return false;
13219 }
13220
13221 /* Return true if X is a valid immediate operand for an SVE FMUL
13222 instruction. */
13223
13224 bool
13225 aarch64_sve_float_mul_immediate_p (rtx x)
13226 {
13227 rtx elt;
13228
13229 /* GCC will never generate a multiply with an immediate of 2, so there is no
13230 point testing for it (even though it is a valid constant). */
13231 return (const_vec_duplicate_p (x, &elt)
13232 && GET_CODE (elt) == CONST_DOUBLE
13233 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13234 }
13235
13236 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13237 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13238 is nonnull, use it to describe valid immediates. */
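/* Illustrative examples of replicated 32-bit patterns accepted below
   (shown here for the MOV form; the MVN form works on the complemented
   value):

     0x000000ab  -> #0xab             (4-byte immediate, LSL #0)
     0x0000ab00  -> #0xab, LSL #8     (4-byte immediate, LSL #8)
     0x00ab00ab  -> #0xab             (2-byte immediate, LSL #0)
     0x0000abff  -> #0xab, MSL #8     (AARCH64_CHECK_MOV only)  */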
13239 static bool
13240 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13241 simd_immediate_info *info,
13242 enum simd_immediate_check which,
13243 simd_immediate_info::insn_type insn)
13244 {
13245 /* Try a 4-byte immediate with LSL. */
13246 for (unsigned int shift = 0; shift < 32; shift += 8)
13247 if ((val32 & (0xff << shift)) == val32)
13248 {
13249 if (info)
13250 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13251 simd_immediate_info::LSL, shift);
13252 return true;
13253 }
13254
13255 /* Try a 2-byte immediate with LSL. */
13256 unsigned int imm16 = val32 & 0xffff;
13257 if (imm16 == (val32 >> 16))
13258 for (unsigned int shift = 0; shift < 16; shift += 8)
13259 if ((imm16 & (0xff << shift)) == imm16)
13260 {
13261 if (info)
13262 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13263 simd_immediate_info::LSL, shift);
13264 return true;
13265 }
13266
13267 /* Try a 4-byte immediate with MSL, except for cases that MVN
13268 can handle. */
13269 if (which == AARCH64_CHECK_MOV)
13270 for (unsigned int shift = 8; shift < 24; shift += 8)
13271 {
13272 unsigned int low = (1 << shift) - 1;
13273 if (((val32 & (0xff << shift)) | low) == val32)
13274 {
13275 if (info)
13276 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13277 simd_immediate_info::MSL, shift);
13278 return true;
13279 }
13280 }
13281
13282 return false;
13283 }
13284
13285 /* Return true if replicating VAL64 is a valid immediate for the
13286 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13287 use it to describe valid immediates. */
13288 static bool
13289 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13290 simd_immediate_info *info,
13291 enum simd_immediate_check which)
13292 {
13293 unsigned int val32 = val64 & 0xffffffff;
13294 unsigned int val16 = val64 & 0xffff;
13295 unsigned int val8 = val64 & 0xff;
13296
13297 if (val32 == (val64 >> 32))
13298 {
13299 if ((which & AARCH64_CHECK_ORR) != 0
13300 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13301 simd_immediate_info::MOV))
13302 return true;
13303
13304 if ((which & AARCH64_CHECK_BIC) != 0
13305 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13306 simd_immediate_info::MVN))
13307 return true;
13308
13309 /* Try using a replicated byte. */
13310 if (which == AARCH64_CHECK_MOV
13311 && val16 == (val32 >> 16)
13312 && val8 == (val16 >> 8))
13313 {
13314 if (info)
13315 *info = simd_immediate_info (QImode, val8);
13316 return true;
13317 }
13318 }
13319
13320 /* Try using a bit-to-bytemask. */
13321 if (which == AARCH64_CHECK_MOV)
13322 {
13323 unsigned int i;
13324 for (i = 0; i < 64; i += 8)
13325 {
13326 unsigned char byte = (val64 >> i) & 0xff;
13327 if (byte != 0 && byte != 0xff)
13328 break;
13329 }
13330 if (i == 64)
13331 {
13332 if (info)
13333 *info = simd_immediate_info (DImode, val64);
13334 return true;
13335 }
13336 }
13337 return false;
13338 }
13339
13340 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13341 instruction. If INFO is nonnull, use it to describe valid immediates. */
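/* For illustration: a VAL64 of 0x2a2a2a2a2a2a2a2a collapses to the QImode
   value 0x2a and is matched as a DUP with no shift; 0x1200120012001200
   collapses to the HImode value 0x1200 and is matched as a DUP with
   LSL #8; other repeating patterns may still be valid DUPM (bitmask)
   immediates.  */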
13342
13343 static bool
13344 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13345 simd_immediate_info *info)
13346 {
13347 scalar_int_mode mode = DImode;
13348 unsigned int val32 = val64 & 0xffffffff;
13349 if (val32 == (val64 >> 32))
13350 {
13351 mode = SImode;
13352 unsigned int val16 = val32 & 0xffff;
13353 if (val16 == (val32 >> 16))
13354 {
13355 mode = HImode;
13356 unsigned int val8 = val16 & 0xff;
13357 if (val8 == (val16 >> 8))
13358 mode = QImode;
13359 }
13360 }
13361 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13362 if (IN_RANGE (val, -0x80, 0x7f))
13363 {
13364 /* DUP with no shift. */
13365 if (info)
13366 *info = simd_immediate_info (mode, val);
13367 return true;
13368 }
13369 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13370 {
13371 /* DUP with LSL #8. */
13372 if (info)
13373 *info = simd_immediate_info (mode, val);
13374 return true;
13375 }
13376 if (aarch64_bitmask_imm (val64, mode))
13377 {
13378 /* DUPM. */
13379 if (info)
13380 *info = simd_immediate_info (mode, val);
13381 return true;
13382 }
13383 return false;
13384 }
13385
13386 /* Return true if OP is a valid SIMD immediate for the operation
13387 described by WHICH. If INFO is nonnull, use it to describe valid
13388 immediates. */
13389 bool
13390 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13391 enum simd_immediate_check which)
13392 {
13393 machine_mode mode = GET_MODE (op);
13394 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13395 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13396 return false;
13397
13398 scalar_mode elt_mode = GET_MODE_INNER (mode);
13399 rtx base, step;
13400 unsigned int n_elts;
13401 if (GET_CODE (op) == CONST_VECTOR
13402 && CONST_VECTOR_DUPLICATE_P (op))
13403 n_elts = CONST_VECTOR_NPATTERNS (op);
13404 else if ((vec_flags & VEC_SVE_DATA)
13405 && const_vec_series_p (op, &base, &step))
13406 {
13407 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13408 if (!aarch64_sve_index_immediate_p (base)
13409 || !aarch64_sve_index_immediate_p (step))
13410 return false;
13411
13412 if (info)
13413 *info = simd_immediate_info (elt_mode, base, step);
13414 return true;
13415 }
13416 else if (GET_CODE (op) == CONST_VECTOR
13417 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13418 /* N_ELTS set above. */;
13419 else
13420 return false;
13421
13422 /* Handle PFALSE and PTRUE. */
13423 if (vec_flags & VEC_SVE_PRED)
13424 return (op == CONST0_RTX (mode)
13425 || op == CONSTM1_RTX (mode));
13426
13427 scalar_float_mode elt_float_mode;
13428 if (n_elts == 1
13429 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13430 {
13431 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13432 if (aarch64_float_const_zero_rtx_p (elt)
13433 || aarch64_float_const_representable_p (elt))
13434 {
13435 if (info)
13436 *info = simd_immediate_info (elt_float_mode, elt);
13437 return true;
13438 }
13439 }
13440
13441 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13442 if (elt_size > 8)
13443 return false;
13444
13445 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13446
13447 /* Expand the vector constant out into a byte vector, with the least
13448 significant byte of the register first. */
13449 auto_vec<unsigned char, 16> bytes;
13450 bytes.reserve (n_elts * elt_size);
13451 for (unsigned int i = 0; i < n_elts; i++)
13452 {
13453 /* The vector is provided in gcc endian-neutral fashion.
13454 For aarch64_be Advanced SIMD, it must be laid out in the vector
13455 register in reverse order. */
13456 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13457 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13458
13459 if (elt_mode != elt_int_mode)
13460 elt = gen_lowpart (elt_int_mode, elt);
13461
13462 if (!CONST_INT_P (elt))
13463 return false;
13464
13465 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13466 for (unsigned int byte = 0; byte < elt_size; byte++)
13467 {
13468 bytes.quick_push (elt_val & 0xff);
13469 elt_val >>= BITS_PER_UNIT;
13470 }
13471 }
13472
13473 /* The immediate must repeat every eight bytes. */
13474 unsigned int nbytes = bytes.length ();
13475 for (unsigned i = 8; i < nbytes; ++i)
13476 if (bytes[i] != bytes[i - 8])
13477 return false;
13478
13479 /* Get the repeating 8-byte value as an integer. No endian correction
13480 is needed here because bytes is already in lsb-first order. */
13481 unsigned HOST_WIDE_INT val64 = 0;
13482 for (unsigned int i = 0; i < 8; i++)
13483 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13484 << (i * BITS_PER_UNIT));
13485
13486 if (vec_flags & VEC_SVE_DATA)
13487 return aarch64_sve_valid_immediate (val64, info);
13488 else
13489 return aarch64_advsimd_valid_immediate (val64, info, which);
13490 }
13491
13492 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13493 has a step in the range of INDEX. Return the index expression if so,
13494 otherwise return null. */
13495 rtx
13496 aarch64_check_zero_based_sve_index_immediate (rtx x)
13497 {
13498 rtx base, step;
13499 if (const_vec_series_p (x, &base, &step)
13500 && base == const0_rtx
13501 && aarch64_sve_index_immediate_p (step))
13502 return step;
13503 return NULL_RTX;
13504 }
13505
13506 /* Check whether immediate shift constants are within range. */
13507 bool
13508 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13509 {
13510 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13511 if (left)
13512 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13513 else
13514 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13515 }
13516
13517 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13518 operation of width WIDTH at bit position POS. */
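/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */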
13519
13520 rtx
13521 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13522 {
13523 gcc_assert (CONST_INT_P (width));
13524 gcc_assert (CONST_INT_P (pos));
13525
13526 unsigned HOST_WIDE_INT mask
13527 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13528 return GEN_INT (mask << UINTVAL (pos));
13529 }
13530
13531 bool
13532 aarch64_mov_operand_p (rtx x, machine_mode mode)
13533 {
13534 if (GET_CODE (x) == HIGH
13535 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13536 return true;
13537
13538 if (CONST_INT_P (x))
13539 return true;
13540
13541 if (VECTOR_MODE_P (GET_MODE (x)))
13542 return aarch64_simd_valid_immediate (x, NULL);
13543
13544 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13545 return true;
13546
13547 if (aarch64_sve_cnt_immediate_p (x))
13548 return true;
13549
13550 return aarch64_classify_symbolic_expression (x)
13551 == SYMBOL_TINY_ABSOLUTE;
13552 }
13553
13554 /* Return a const_int vector of VAL. */
13555 rtx
13556 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13557 {
13558 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13559 return gen_const_vec_duplicate (mode, c);
13560 }
13561
13562 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13563
13564 bool
13565 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13566 {
13567 machine_mode vmode;
13568
13569 vmode = aarch64_simd_container_mode (mode, 64);
13570 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13571 return aarch64_simd_valid_immediate (op_v, NULL);
13572 }
13573
13574 /* Construct and return a PARALLEL RTX vector with elements numbering the
13575 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13576 the vector - from the perspective of the architecture. This does not
13577 line up with GCC's perspective on lane numbers, so we end up with
13578 different masks depending on our target endian-ness. The diagram
13579 below may help. We must draw the distinction when building masks
13580 which select one half of the vector. An instruction selecting
13581 architectural low-lanes for a big-endian target must be described using
13582 a mask selecting GCC high-lanes.
13583
13584 Big-Endian Little-Endian
13585
13586 GCC 0 1 2 3 3 2 1 0
13587 | x | x | x | x | | x | x | x | x |
13588 Architecture 3 2 1 0 3 2 1 0
13589
13590 Low Mask: { 2, 3 } { 0, 1 }
13591 High Mask: { 0, 1 } { 2, 3 }
13592
13593 MODE is the mode of the vector and NUNITS is the number of units in it. */
13594
13595 rtx
13596 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13597 {
13598 rtvec v = rtvec_alloc (nunits / 2);
13599 int high_base = nunits / 2;
13600 int low_base = 0;
13601 int base;
13602 rtx t1;
13603 int i;
13604
13605 if (BYTES_BIG_ENDIAN)
13606 base = high ? low_base : high_base;
13607 else
13608 base = high ? high_base : low_base;
13609
13610 for (i = 0; i < nunits / 2; i++)
13611 RTVEC_ELT (v, i) = GEN_INT (base + i);
13612
13613 t1 = gen_rtx_PARALLEL (mode, v);
13614 return t1;
13615 }
13616
13617 /* Check OP for validity as a PARALLEL RTX vector with elements
13618 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13619 from the perspective of the architecture. See the diagram above
13620 aarch64_simd_vect_par_cnst_half for more details. */
13621
13622 bool
13623 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13624 bool high)
13625 {
13626 int nelts;
13627 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13628 return false;
13629
13630 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13631 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13632 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13633 int i = 0;
13634
13635 if (count_op != count_ideal)
13636 return false;
13637
13638 for (i = 0; i < count_ideal; i++)
13639 {
13640 rtx elt_op = XVECEXP (op, 0, i);
13641 rtx elt_ideal = XVECEXP (ideal, 0, i);
13642
13643 if (!CONST_INT_P (elt_op)
13644 || INTVAL (elt_ideal) != INTVAL (elt_op))
13645 return false;
13646 }
13647 return true;
13648 }
13649
13650 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13651 HIGH (exclusive). */
13652 void
13653 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13654 const_tree exp)
13655 {
13656 HOST_WIDE_INT lane;
13657 gcc_assert (CONST_INT_P (operand));
13658 lane = INTVAL (operand);
13659
13660 if (lane < low || lane >= high)
13661 {
13662 if (exp)
13663 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13664 else
13665 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13666 }
13667 }
13668
13669 /* Perform endian correction on lane number N, which indexes a vector
13670 of mode MODE, and return the result as an SImode rtx. */
13671
13672 rtx
13673 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13674 {
13675 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13676 }
13677
13678 /* Return TRUE if OP is a valid vector addressing mode. */
13679
13680 bool
13681 aarch64_simd_mem_operand_p (rtx op)
13682 {
13683 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13684 || REG_P (XEXP (op, 0)));
13685 }
13686
13687 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13688
13689 bool
13690 aarch64_sve_ld1r_operand_p (rtx op)
13691 {
13692 struct aarch64_address_info addr;
13693 scalar_mode mode;
13694
13695 return (MEM_P (op)
13696 && is_a <scalar_mode> (GET_MODE (op), &mode)
13697 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13698 && addr.type == ADDRESS_REG_IMM
13699 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13700 }
13701
13702 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13703 The conditions for STR are the same. */
13704 bool
13705 aarch64_sve_ldr_operand_p (rtx op)
13706 {
13707 struct aarch64_address_info addr;
13708
13709 return (MEM_P (op)
13710 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13711 false, ADDR_QUERY_ANY)
13712 && addr.type == ADDRESS_REG_IMM);
13713 }
13714
13715 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13716 We need to be able to access the individual pieces, so the range
13717 is different from LD[234] and ST[234]. */
13718 bool
13719 aarch64_sve_struct_memory_operand_p (rtx op)
13720 {
13721 if (!MEM_P (op))
13722 return false;
13723
13724 machine_mode mode = GET_MODE (op);
13725 struct aarch64_address_info addr;
13726 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13727 ADDR_QUERY_ANY)
13728 || addr.type != ADDRESS_REG_IMM)
13729 return false;
13730
13731 poly_int64 first = addr.const_offset;
13732 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13733 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13734 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13735 }
13736
13737 /* Emit a register copy from operand to operand, taking care not to
13738 early-clobber source registers in the process.
13739
13740 COUNT is the number of components into which the copy needs to be
13741 decomposed. */
13742 void
13743 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13744 unsigned int count)
13745 {
13746 unsigned int i;
13747 int rdest = REGNO (operands[0]);
13748 int rsrc = REGNO (operands[1]);
13749
13750 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13751 || rdest < rsrc)
13752 for (i = 0; i < count; i++)
13753 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13754 gen_rtx_REG (mode, rsrc + i));
13755 else
13756 for (i = 0; i < count; i++)
13757 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13758 gen_rtx_REG (mode, rsrc + count - i - 1));
13759 }
13760
13761 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13762 one of VSTRUCT modes: OI, CI, or XI. */
13763 int
13764 aarch64_simd_attr_length_rglist (machine_mode mode)
13765 {
13766 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13767 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13768 }
13769
13770 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13771 alignment of a vector to 128 bits. SVE predicates have an alignment of
13772 16 bits. */
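/* For example, int32x4_t keeps its natural 128-bit alignment, while a
   generic 256-bit GCC vector (__attribute__ ((vector_size (32)))) is
   capped to 128 bits by the MIN below.  */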
13773 static HOST_WIDE_INT
13774 aarch64_simd_vector_alignment (const_tree type)
13775 {
13776 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13777 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13778 be set for non-predicate vectors of booleans. Modes are the most
13779 direct way we have of identifying real SVE predicate types. */
13780 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13781 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13782 return MIN (align, 128);
13783 }
13784
13785 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13786 static HOST_WIDE_INT
13787 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13788 {
13789 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13790 {
13791 /* If the length of the vector is fixed, try to align to that length,
13792 otherwise don't try to align at all. */
13793 HOST_WIDE_INT result;
13794 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13795 result = TYPE_ALIGN (TREE_TYPE (type));
13796 return result;
13797 }
13798 return TYPE_ALIGN (type);
13799 }
13800
13801 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13802 static bool
13803 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13804 {
13805 if (is_packed)
13806 return false;
13807
13808 /* For fixed-length vectors, check that the vectorizer will aim for
13809 full-vector alignment. This isn't true for generic GCC vectors
13810 that are wider than the ABI maximum of 128 bits. */
13811 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13812 && (wi::to_widest (TYPE_SIZE (type))
13813 != aarch64_vectorize_preferred_vector_alignment (type)))
13814 return false;
13815
13816 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13817 return true;
13818 }
13819
13820 /* Return true if the vector misalignment factor is supported by the
13821 target. */
13822 static bool
13823 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13824 const_tree type, int misalignment,
13825 bool is_packed)
13826 {
13827 if (TARGET_SIMD && STRICT_ALIGNMENT)
13828 {
13829 /* Return if movmisalign pattern is not supported for this mode. */
13830 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13831 return false;
13832
13833 /* Misalignment factor is unknown at compile time. */
13834 if (misalignment == -1)
13835 return false;
13836 }
13837 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13838 is_packed);
13839 }
13840
13841 /* If VALS is a vector constant that can be loaded into a register
13842 using DUP, generate instructions to do so and return an RTX to
13843 assign to the register. Otherwise return NULL_RTX. */
13844 static rtx
13845 aarch64_simd_dup_constant (rtx vals)
13846 {
13847 machine_mode mode = GET_MODE (vals);
13848 machine_mode inner_mode = GET_MODE_INNER (mode);
13849 rtx x;
13850
13851 if (!const_vec_duplicate_p (vals, &x))
13852 return NULL_RTX;
13853
13854 /* We can load this constant by using DUP and a constant in a
13855 single ARM register. This will be cheaper than a vector
13856 load. */
13857 x = copy_to_mode_reg (inner_mode, x);
13858 return gen_vec_duplicate (mode, x);
13859 }
13860
13861
13862 /* Generate code to load VALS, which is a PARALLEL containing only
13863 constants (for vec_init) or CONST_VECTOR, efficiently into a
13864 register. Returns an RTX to copy into the register, or NULL_RTX
13865 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13866 static rtx
13867 aarch64_simd_make_constant (rtx vals)
13868 {
13869 machine_mode mode = GET_MODE (vals);
13870 rtx const_dup;
13871 rtx const_vec = NULL_RTX;
13872 int n_const = 0;
13873 int i;
13874
13875 if (GET_CODE (vals) == CONST_VECTOR)
13876 const_vec = vals;
13877 else if (GET_CODE (vals) == PARALLEL)
13878 {
13879 /* A CONST_VECTOR must contain only CONST_INTs and
13880 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13881 Only store valid constants in a CONST_VECTOR. */
13882 int n_elts = XVECLEN (vals, 0);
13883 for (i = 0; i < n_elts; ++i)
13884 {
13885 rtx x = XVECEXP (vals, 0, i);
13886 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13887 n_const++;
13888 }
13889 if (n_const == n_elts)
13890 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13891 }
13892 else
13893 gcc_unreachable ();
13894
13895 if (const_vec != NULL_RTX
13896 && aarch64_simd_valid_immediate (const_vec, NULL))
13897 /* Load using MOVI/MVNI. */
13898 return const_vec;
13899 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13900 /* Loaded using DUP. */
13901 return const_dup;
13902 else if (const_vec != NULL_RTX)
13903 /* Load from constant pool. We can not take advantage of single-cycle
13904 LD1 because we need a PC-relative addressing mode. */
13905 return const_vec;
13906 else
13907 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13908 We can not construct an initializer. */
13909 return NULL_RTX;
13910 }
13911
13912 /* Expand a vector initialisation sequence, such that TARGET is
13913 initialised to contain VALS. */
13914
13915 void
13916 aarch64_expand_vector_init (rtx target, rtx vals)
13917 {
13918 machine_mode mode = GET_MODE (target);
13919 scalar_mode inner_mode = GET_MODE_INNER (mode);
13920 /* The number of vector elements. */
13921 int n_elts = XVECLEN (vals, 0);
13922 /* The number of vector elements which are not constant. */
13923 int n_var = 0;
13924 rtx any_const = NULL_RTX;
13925 /* The first element of vals. */
13926 rtx v0 = XVECEXP (vals, 0, 0);
13927 bool all_same = true;
13928
13929 /* Count the number of variable elements to initialise. */
13930 for (int i = 0; i < n_elts; ++i)
13931 {
13932 rtx x = XVECEXP (vals, 0, i);
13933 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13934 ++n_var;
13935 else
13936 any_const = x;
13937
13938 all_same &= rtx_equal_p (x, v0);
13939 }
13940
13941 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13942 how best to handle this. */
13943 if (n_var == 0)
13944 {
13945 rtx constant = aarch64_simd_make_constant (vals);
13946 if (constant != NULL_RTX)
13947 {
13948 emit_move_insn (target, constant);
13949 return;
13950 }
13951 }
13952
13953 /* Splat a single non-constant element if we can. */
13954 if (all_same)
13955 {
13956 rtx x = copy_to_mode_reg (inner_mode, v0);
13957 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13958 return;
13959 }
13960
13961 enum insn_code icode = optab_handler (vec_set_optab, mode);
13962 gcc_assert (icode != CODE_FOR_nothing);
13963
13964 /* If there are only variable elements, try to optimize
13965 the insertion using dup for the most common element
13966 followed by insertions. */
13967
13968 /* The algorithm will fill matches[*][0] with the earliest matching element,
13969 and matches[X][1] with the count of duplicate elements (if X is the
13970 earliest element which has duplicates). */
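/* As a purely illustrative worked example, for the lanes {a, b, a, a}
   the loop below produces matches[0] = {0, 3}, matches[1] = {1, 1},
   matches[2] = {0, 0} and matches[3] = {0, 0}, so element 0 is the one
   chosen for the initial duplicate.  */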
13971
13972 if (n_var == n_elts && n_elts <= 16)
13973 {
13974 int matches[16][2] = {0};
13975 for (int i = 0; i < n_elts; i++)
13976 {
13977 for (int j = 0; j <= i; j++)
13978 {
13979 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13980 {
13981 matches[i][0] = j;
13982 matches[j][1]++;
13983 break;
13984 }
13985 }
13986 }
13987 int maxelement = 0;
13988 int maxv = 0;
13989 for (int i = 0; i < n_elts; i++)
13990 if (matches[i][1] > maxv)
13991 {
13992 maxelement = i;
13993 maxv = matches[i][1];
13994 }
13995
13996 /* Create a duplicate of the most common element, unless all elements
13997 are equally useless to us, in which case just immediately set the
13998 vector register using the first element. */
13999
14000 if (maxv == 1)
14001 {
14002 /* For vectors of two 64-bit elements, we can do even better. */
14003 if (n_elts == 2
14004 && (inner_mode == E_DImode
14005 || inner_mode == E_DFmode))
14006
14007 {
14008 rtx x0 = XVECEXP (vals, 0, 0);
14009 rtx x1 = XVECEXP (vals, 0, 1);
14010 /* Combine can pick up this case, but handling it directly
14011 here leaves clearer RTL.
14012
14013 This is load_pair_lanes<mode>, and also gives us a clean-up
14014 for store_pair_lanes<mode>. */
14015 if (memory_operand (x0, inner_mode)
14016 && memory_operand (x1, inner_mode)
14017 && !STRICT_ALIGNMENT
14018 && rtx_equal_p (XEXP (x1, 0),
14019 plus_constant (Pmode,
14020 XEXP (x0, 0),
14021 GET_MODE_SIZE (inner_mode))))
14022 {
14023 rtx t;
14024 if (inner_mode == DFmode)
14025 t = gen_load_pair_lanesdf (target, x0, x1);
14026 else
14027 t = gen_load_pair_lanesdi (target, x0, x1);
14028 emit_insn (t);
14029 return;
14030 }
14031 }
14032 /* The subreg-move sequence below will move into lane zero of the
14033 vector register. For big-endian we want that position to hold
14034 the last element of VALS. */
14035 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14036 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14037 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14038 }
14039 else
14040 {
14041 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14042 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14043 }
14044
14045 /* Insert the rest. */
14046 for (int i = 0; i < n_elts; i++)
14047 {
14048 rtx x = XVECEXP (vals, 0, i);
14049 if (matches[i][0] == maxelement)
14050 continue;
14051 x = copy_to_mode_reg (inner_mode, x);
14052 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14053 }
14054 return;
14055 }
14056
14057 /* Initialise a vector which is part-variable. We want to first try
14058 to build those lanes which are constant in the most efficient way we
14059 can. */
14060 if (n_var != n_elts)
14061 {
14062 rtx copy = copy_rtx (vals);
14063
14064 /* Load constant part of vector. We really don't care what goes into the
14065 parts we will overwrite, but we're more likely to be able to load the
14066 constant efficiently if it has fewer, larger, repeating parts
14067 (see aarch64_simd_valid_immediate). */
14068 for (int i = 0; i < n_elts; i++)
14069 {
14070 rtx x = XVECEXP (vals, 0, i);
14071 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14072 continue;
14073 rtx subst = any_const;
14074 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14075 {
14076 /* Look in the copied vector, as more elements are const. */
14077 rtx test = XVECEXP (copy, 0, i ^ bit);
14078 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14079 {
14080 subst = test;
14081 break;
14082 }
14083 }
14084 XVECEXP (copy, 0, i) = subst;
14085 }
14086 aarch64_expand_vector_init (target, copy);
14087 }
14088
14089 /* Insert the variable lanes directly. */
14090 for (int i = 0; i < n_elts; i++)
14091 {
14092 rtx x = XVECEXP (vals, 0, i);
14093 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14094 continue;
14095 x = copy_to_mode_reg (inner_mode, x);
14096 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14097 }
14098 }
14099
14100 static unsigned HOST_WIDE_INT
14101 aarch64_shift_truncation_mask (machine_mode mode)
14102 {
14103 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14104 return 0;
14105 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14106 }
14107
14108 /* Select a format to encode pointers in exception handling data. */
14109 int
14110 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14111 {
14112 int type;
14113 switch (aarch64_cmodel)
14114 {
14115 case AARCH64_CMODEL_TINY:
14116 case AARCH64_CMODEL_TINY_PIC:
14117 case AARCH64_CMODEL_SMALL:
14118 case AARCH64_CMODEL_SMALL_PIC:
14119 case AARCH64_CMODEL_SMALL_SPIC:
14120 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14121 for everything. */
14122 type = DW_EH_PE_sdata4;
14123 break;
14124 default:
14125 /* No assumptions here. 8-byte relocs required. */
14126 type = DW_EH_PE_sdata8;
14127 break;
14128 }
14129 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14130 }
14131
14132 /* The last .arch and .tune assembly strings that we printed. */
14133 static std::string aarch64_last_printed_arch_string;
14134 static std::string aarch64_last_printed_tune_string;
14135
14136 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14137 by the function fndecl. */
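/* As an illustrative example only: for a function compiled with
   __attribute__ ((target ("arch=armv8.1-a"))), the output might contain

       .arch armv8.1-a
       // .tune cortex-a72

   before the function label, with the .tune line emitted as a comment
   and only when flag_debug_asm is set.  */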
14138
14139 void
14140 aarch64_declare_function_name (FILE *stream, const char* name,
14141 tree fndecl)
14142 {
14143 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14144
14145 struct cl_target_option *targ_options;
14146 if (target_parts)
14147 targ_options = TREE_TARGET_OPTION (target_parts);
14148 else
14149 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14150 gcc_assert (targ_options);
14151
14152 const struct processor *this_arch
14153 = aarch64_get_arch (targ_options->x_explicit_arch);
14154
14155 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14156 std::string extension
14157 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14158 this_arch->flags);
14159 /* Only update the assembler .arch string if it is distinct from the last
14160 such string we printed. */
14161 std::string to_print = this_arch->name + extension;
14162 if (to_print != aarch64_last_printed_arch_string)
14163 {
14164 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14165 aarch64_last_printed_arch_string = to_print;
14166 }
14167
14168 /* Print the cpu name we're tuning for in the comments; it might be
14169 useful to readers of the generated asm. Do it only when it changes
14170 from function to function and verbose assembly is requested. */
14171 const struct processor *this_tune
14172 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14173
14174 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14175 {
14176 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14177 this_tune->name);
14178 aarch64_last_printed_tune_string = this_tune->name;
14179 }
14180
14181 /* Don't forget the type directive for ELF. */
14182 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14183 ASM_OUTPUT_LABEL (stream, name);
14184 }
14185
14186 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14187
14188 static void
14189 aarch64_start_file (void)
14190 {
14191 struct cl_target_option *default_options
14192 = TREE_TARGET_OPTION (target_option_default_node);
14193
14194 const struct processor *default_arch
14195 = aarch64_get_arch (default_options->x_explicit_arch);
14196 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14197 std::string extension
14198 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14199 default_arch->flags);
14200
14201 aarch64_last_printed_arch_string = default_arch->name + extension;
14202 aarch64_last_printed_tune_string = "";
14203 asm_fprintf (asm_out_file, "\t.arch %s\n",
14204 aarch64_last_printed_arch_string.c_str ());
14205
14206 default_file_start ();
14207 }
14208
14209 /* Emit load exclusive. */
14210
14211 static void
14212 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14213 rtx mem, rtx model_rtx)
14214 {
14215 rtx (*gen) (rtx, rtx, rtx);
14216
14217 switch (mode)
14218 {
14219 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14220 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14221 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14222 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14223 default:
14224 gcc_unreachable ();
14225 }
14226
14227 emit_insn (gen (rval, mem, model_rtx));
14228 }
14229
14230 /* Emit store exclusive. */
14231
14232 static void
14233 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14234 rtx rval, rtx mem, rtx model_rtx)
14235 {
14236 rtx (*gen) (rtx, rtx, rtx, rtx);
14237
14238 switch (mode)
14239 {
14240 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14241 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14242 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14243 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14244 default:
14245 gcc_unreachable ();
14246 }
14247
14248 emit_insn (gen (bval, rval, mem, model_rtx));
14249 }
14250
14251 /* Emit INSN as a jump instruction and mark it as very unlikely to be taken. */
14252
14253 static void
14254 aarch64_emit_unlikely_jump (rtx insn)
14255 {
14256 rtx_insn *jump = emit_jump_insn (insn);
14257 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14258 }
14259
14260 /* Expand a compare and swap pattern. */
14261
14262 void
14263 aarch64_expand_compare_and_swap (rtx operands[])
14264 {
14265 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14266 machine_mode mode, cmp_mode;
14267 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14268 int idx;
14269 gen_cas_fn gen;
14270 const gen_cas_fn split_cas[] =
14271 {
14272 gen_aarch64_compare_and_swapqi,
14273 gen_aarch64_compare_and_swaphi,
14274 gen_aarch64_compare_and_swapsi,
14275 gen_aarch64_compare_and_swapdi
14276 };
14277 const gen_cas_fn atomic_cas[] =
14278 {
14279 gen_aarch64_compare_and_swapqi_lse,
14280 gen_aarch64_compare_and_swaphi_lse,
14281 gen_aarch64_compare_and_swapsi_lse,
14282 gen_aarch64_compare_and_swapdi_lse
14283 };
14284
14285 bval = operands[0];
14286 rval = operands[1];
14287 mem = operands[2];
14288 oldval = operands[3];
14289 newval = operands[4];
14290 is_weak = operands[5];
14291 mod_s = operands[6];
14292 mod_f = operands[7];
14293 mode = GET_MODE (mem);
14294 cmp_mode = mode;
14295
14296 /* Normally the succ memory model must be stronger than fail, but in the
14297 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14298 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14299
14300 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14301 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14302 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14303
14304 switch (mode)
14305 {
14306 case E_QImode:
14307 case E_HImode:
14308 /* For short modes, we're going to perform the comparison in SImode,
14309 so do the zero-extension now. */
14310 cmp_mode = SImode;
14311 rval = gen_reg_rtx (SImode);
14312 oldval = convert_modes (SImode, mode, oldval, true);
14313 /* Fall through. */
14314
14315 case E_SImode:
14316 case E_DImode:
14317 /* Force the value into a register if needed. */
14318 if (!aarch64_plus_operand (oldval, mode))
14319 oldval = force_reg (cmp_mode, oldval);
14320 break;
14321
14322 default:
14323 gcc_unreachable ();
14324 }
14325
14326 switch (mode)
14327 {
14328 case E_QImode: idx = 0; break;
14329 case E_HImode: idx = 1; break;
14330 case E_SImode: idx = 2; break;
14331 case E_DImode: idx = 3; break;
14332 default:
14333 gcc_unreachable ();
14334 }
14335 if (TARGET_LSE)
14336 gen = atomic_cas[idx];
14337 else
14338 gen = split_cas[idx];
14339
14340 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14341
14342 if (mode == QImode || mode == HImode)
14343 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14344
14345 x = gen_rtx_REG (CCmode, CC_REGNUM);
14346 x = gen_rtx_EQ (SImode, x, const0_rtx);
14347 emit_insn (gen_rtx_SET (bval, x));
14348 }
14349
14350 /* Test whether the target supports using an atomic load-operate instruction
14351 for operation CODE. These instructions are only available when the Large
14352 System Extensions (LSE) are enabled and they return the data that was in
14353 memory before the operation. Returns FALSE if the operation isn't
14354 supported by the architecture. */
14355
14356 bool
14357 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14358 {
14359 if (!TARGET_LSE)
14360 return false;
14361
14362 switch (code)
14363 {
14364 case SET:
14365 case AND:
14366 case IOR:
14367 case XOR:
14368 case MINUS:
14369 case PLUS:
14370 return true;
14371 default:
14372 return false;
14373 }
14374 }
14375
14376 /* Emit a barrier appropriate for memory model MODEL at the end of a
14377 sequence implementing an atomic operation. */
14378
14379 static void
14380 aarch64_emit_post_barrier (enum memmodel model)
14381 {
14382 const enum memmodel base_model = memmodel_base (model);
14383
14384 if (is_mm_sync (model)
14385 && (base_model == MEMMODEL_ACQUIRE
14386 || base_model == MEMMODEL_ACQ_REL
14387 || base_model == MEMMODEL_SEQ_CST))
14388 {
14389 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14390 }
14391 }
14392
14393 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14394 for the data in memory. EXPECTED is the value expected to be in memory.
14395 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14396 is the memory ordering to use. */
14397
14398 void
14399 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14400 rtx expected, rtx desired,
14401 rtx model)
14402 {
14403 rtx (*gen) (rtx, rtx, rtx, rtx);
14404 machine_mode mode;
14405
14406 mode = GET_MODE (mem);
14407
14408 switch (mode)
14409 {
14410 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14411 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14412 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14413 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14414 default:
14415 gcc_unreachable ();
14416 }
14417
14418 /* Move the expected value into the CAS destination register. */
14419 emit_insn (gen_rtx_SET (rval, expected));
14420
14421 /* Emit the CAS. */
14422 emit_insn (gen (rval, mem, desired, model));
14423
14424 /* Compare the expected value with the value loaded by the CAS, to establish
14425 whether the swap was made. */
14426 aarch64_gen_compare_reg (EQ, rval, expected);
14427 }
14428
14429 /* Split a compare and swap pattern. */
14430
14431 void
14432 aarch64_split_compare_and_swap (rtx operands[])
14433 {
14434 rtx rval, mem, oldval, newval, scratch;
14435 machine_mode mode;
14436 bool is_weak;
14437 rtx_code_label *label1, *label2;
14438 rtx x, cond;
14439 enum memmodel model;
14440 rtx model_rtx;
14441
14442 rval = operands[0];
14443 mem = operands[1];
14444 oldval = operands[2];
14445 newval = operands[3];
14446 is_weak = (operands[4] != const0_rtx);
14447 model_rtx = operands[5];
14448 scratch = operands[7];
14449 mode = GET_MODE (mem);
14450 model = memmodel_from_int (INTVAL (model_rtx));
14451
14452 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14453 loop:
14454 .label1:
14455 LD[A]XR rval, [mem]
14456 CBNZ rval, .label2
14457 ST[L]XR scratch, newval, [mem]
14458 CBNZ scratch, .label1
14459 .label2:
14460 CMP rval, 0. */
14461 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14462
14463 label1 = NULL;
14464 if (!is_weak)
14465 {
14466 label1 = gen_label_rtx ();
14467 emit_label (label1);
14468 }
14469 label2 = gen_label_rtx ();
14470
14471 /* The initial load can be relaxed for a __sync operation since a final
14472 barrier will be emitted to stop code hoisting. */
14473 if (is_mm_sync (model))
14474 aarch64_emit_load_exclusive (mode, rval, mem,
14475 GEN_INT (MEMMODEL_RELAXED));
14476 else
14477 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14478
14479 if (strong_zero_p)
14480 {
14481 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14482 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14483 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14484 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14485 }
14486 else
14487 {
14488 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14489 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14490 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14491 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14492 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14493 }
14494
14495 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14496
14497 if (!is_weak)
14498 {
14499 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14500 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14501 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14502 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14503 }
14504 else
14505 {
14506 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14507 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14508 emit_insn (gen_rtx_SET (cond, x));
14509 }
14510
14511 emit_label (label2);
14512 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14513 to set the condition flags. If this is not used it will be removed by
14514 later passes. */
14515 if (strong_zero_p)
14516 {
14517 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14518 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14519 emit_insn (gen_rtx_SET (cond, x));
14520 }
14521 /* Emit any final barrier needed for a __sync operation. */
14522 if (is_mm_sync (model))
14523 aarch64_emit_post_barrier (model);
14524 }
14525
14526 /* Emit a BIC instruction. */
14527
14528 static void
14529 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14530 {
14531 rtx shift_rtx = GEN_INT (shift);
14532 rtx (*gen) (rtx, rtx, rtx, rtx);
14533
14534 switch (mode)
14535 {
14536 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14537 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14538 default:
14539 gcc_unreachable ();
14540 }
14541
14542 emit_insn (gen (dst, s2, shift_rtx, s1));
14543 }
14544
14545 /* Emit an atomic swap. */
14546
14547 static void
14548 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14549 rtx mem, rtx model)
14550 {
14551 rtx (*gen) (rtx, rtx, rtx, rtx);
14552
14553 switch (mode)
14554 {
14555 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14556 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14557 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14558 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14559 default:
14560 gcc_unreachable ();
14561 }
14562
14563 emit_insn (gen (dst, mem, value, model));
14564 }
14565
14566 /* Operations supported by aarch64_emit_atomic_load_op. */
14567
14568 enum aarch64_atomic_load_op_code
14569 {
14570 AARCH64_LDOP_PLUS, /* A + B */
14571 AARCH64_LDOP_XOR, /* A ^ B */
14572 AARCH64_LDOP_OR, /* A | B */
14573 AARCH64_LDOP_BIC /* A & ~B */
14574 };
14575
14576 /* Emit an atomic load-operate. */
14577
14578 static void
14579 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14580 machine_mode mode, rtx dst, rtx src,
14581 rtx mem, rtx model)
14582 {
14583 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14584 const aarch64_atomic_load_op_fn plus[] =
14585 {
14586 gen_aarch64_atomic_loadaddqi,
14587 gen_aarch64_atomic_loadaddhi,
14588 gen_aarch64_atomic_loadaddsi,
14589 gen_aarch64_atomic_loadadddi
14590 };
14591 const aarch64_atomic_load_op_fn eor[] =
14592 {
14593 gen_aarch64_atomic_loadeorqi,
14594 gen_aarch64_atomic_loadeorhi,
14595 gen_aarch64_atomic_loadeorsi,
14596 gen_aarch64_atomic_loadeordi
14597 };
14598 const aarch64_atomic_load_op_fn ior[] =
14599 {
14600 gen_aarch64_atomic_loadsetqi,
14601 gen_aarch64_atomic_loadsethi,
14602 gen_aarch64_atomic_loadsetsi,
14603 gen_aarch64_atomic_loadsetdi
14604 };
14605 const aarch64_atomic_load_op_fn bic[] =
14606 {
14607 gen_aarch64_atomic_loadclrqi,
14608 gen_aarch64_atomic_loadclrhi,
14609 gen_aarch64_atomic_loadclrsi,
14610 gen_aarch64_atomic_loadclrdi
14611 };
14612 aarch64_atomic_load_op_fn gen;
14613 int idx = 0;
14614
14615 switch (mode)
14616 {
14617 case E_QImode: idx = 0; break;
14618 case E_HImode: idx = 1; break;
14619 case E_SImode: idx = 2; break;
14620 case E_DImode: idx = 3; break;
14621 default:
14622 gcc_unreachable ();
14623 }
14624
14625 switch (code)
14626 {
14627 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14628 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14629 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14630 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14631 default:
14632 gcc_unreachable ();
14633 }
14634
14635 emit_insn (gen (dst, mem, src, model));
14636 }
14637
14638 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14639 location to store the data read from memory. OUT_RESULT is the location to
14640 store the result of the operation. MEM is the memory location to read and
14641 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14642 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14643 be NULL. */
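 /* For example (an illustrative mapping, following the switch below): SET
    becomes an atomic swap (SWP), PLUS maps directly to LDADD, MINUS is
    handled by negating VALUE and using LDADD, IOR maps to LDSET, XOR to
    LDEOR, and AND is handled by inverting VALUE and using LDCLR (BIC).  */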
14644
14645 void
14646 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14647 rtx mem, rtx value, rtx model_rtx)
14648 {
14649 machine_mode mode = GET_MODE (mem);
14650 machine_mode wmode = (mode == DImode ? DImode : SImode);
14651 const bool short_mode = (mode < SImode);
14652 aarch64_atomic_load_op_code ldop_code;
14653 rtx src;
14654 rtx x;
14655
14656 if (out_data)
14657 out_data = gen_lowpart (mode, out_data);
14658
14659 if (out_result)
14660 out_result = gen_lowpart (mode, out_result);
14661
14662 /* Make sure the value is in a register, putting it into a destination
14663 register if it needs to be manipulated. */
14664 if (!register_operand (value, mode)
14665 || code == AND || code == MINUS)
14666 {
14667 src = out_result ? out_result : out_data;
14668 emit_move_insn (src, gen_lowpart (mode, value));
14669 }
14670 else
14671 src = value;
14672 gcc_assert (register_operand (src, mode));
14673
14674 /* Preprocess the data for the operation as necessary. If the operation is
14675 a SET then emit a swap instruction and finish. */
14676 switch (code)
14677 {
14678 case SET:
14679 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14680 return;
14681
14682 case MINUS:
14683 /* Negate the value and treat it as a PLUS. */
14684 {
14685 rtx neg_src;
14686
14687 /* Resize the value if necessary. */
14688 if (short_mode)
14689 src = gen_lowpart (wmode, src);
14690
14691 neg_src = gen_rtx_NEG (wmode, src);
14692 emit_insn (gen_rtx_SET (src, neg_src));
14693
14694 if (short_mode)
14695 src = gen_lowpart (mode, src);
14696 }
14697 /* Fall-through. */
14698 case PLUS:
14699 ldop_code = AARCH64_LDOP_PLUS;
14700 break;
14701
14702 case IOR:
14703 ldop_code = AARCH64_LDOP_OR;
14704 break;
14705
14706 case XOR:
14707 ldop_code = AARCH64_LDOP_XOR;
14708 break;
14709
14710 case AND:
14711 {
14712 rtx not_src;
14713
14714 /* Resize the value if necessary. */
14715 if (short_mode)
14716 src = gen_lowpart (wmode, src);
14717
14718 not_src = gen_rtx_NOT (wmode, src);
14719 emit_insn (gen_rtx_SET (src, not_src));
14720
14721 if (short_mode)
14722 src = gen_lowpart (mode, src);
14723 }
14724 ldop_code = AARCH64_LDOP_BIC;
14725 break;
14726
14727 default:
14728 /* The operation can't be done with atomic instructions. */
14729 gcc_unreachable ();
14730 }
14731
14732 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14733
14734 /* If necessary, calculate the data in memory after the update by redoing the
14735 operation from values in registers. */
14736 if (!out_result)
14737 return;
14738
14739 if (short_mode)
14740 {
14741 src = gen_lowpart (wmode, src);
14742 out_data = gen_lowpart (wmode, out_data);
14743 out_result = gen_lowpart (wmode, out_result);
14744 }
14745
14746 x = NULL_RTX;
14747
14748 switch (code)
14749 {
14750 case MINUS:
14751 case PLUS:
14752 x = gen_rtx_PLUS (wmode, out_data, src);
14753 break;
14754 case IOR:
14755 x = gen_rtx_IOR (wmode, out_data, src);
14756 break;
14757 case XOR:
14758 x = gen_rtx_XOR (wmode, out_data, src);
14759 break;
14760 case AND:
14761 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14762 return;
14763 default:
14764 gcc_unreachable ();
14765 }
14766
14767 emit_set_insn (out_result, x);
14768
14769 return;
14770 }
14771
14772 /* Split an atomic operation. */
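 /* Illustrative sketch of the split (assuming no LSE atomics): an atomic
    add, for instance, becomes a load/store-exclusive loop along the lines of
	.label:
	  LD[A]XR old_out, [mem]
	  ADD     new_out, old_out, value
	  ST[L]XR cond, new_out, [mem]
	  CBNZ    cond, .label
    with NOT implemented as AND followed by a bitwise NOT (i.e. NAND).  */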
14773
14774 void
14775 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14776 rtx value, rtx model_rtx, rtx cond)
14777 {
14778 machine_mode mode = GET_MODE (mem);
14779 machine_mode wmode = (mode == DImode ? DImode : SImode);
14780 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14781 const bool is_sync = is_mm_sync (model);
14782 rtx_code_label *label;
14783 rtx x;
14784
14785 /* Split the atomic operation into a sequence. */
14786 label = gen_label_rtx ();
14787 emit_label (label);
14788
14789 if (new_out)
14790 new_out = gen_lowpart (wmode, new_out);
14791 if (old_out)
14792 old_out = gen_lowpart (wmode, old_out);
14793 else
14794 old_out = new_out;
14795 value = simplify_gen_subreg (wmode, value, mode, 0);
14796
14797 /* The initial load can be relaxed for a __sync operation since a final
14798 barrier will be emitted to stop code hoisting. */
14799 if (is_sync)
14800 aarch64_emit_load_exclusive (mode, old_out, mem,
14801 GEN_INT (MEMMODEL_RELAXED));
14802 else
14803 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14804
14805 switch (code)
14806 {
14807 case SET:
14808 new_out = value;
14809 break;
14810
14811 case NOT:
14812 x = gen_rtx_AND (wmode, old_out, value);
14813 emit_insn (gen_rtx_SET (new_out, x));
14814 x = gen_rtx_NOT (wmode, new_out);
14815 emit_insn (gen_rtx_SET (new_out, x));
14816 break;
14817
14818 case MINUS:
14819 if (CONST_INT_P (value))
14820 {
14821 value = GEN_INT (-INTVAL (value));
14822 code = PLUS;
14823 }
14824 /* Fall through. */
14825
14826 default:
14827 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14828 emit_insn (gen_rtx_SET (new_out, x));
14829 break;
14830 }
14831
14832 aarch64_emit_store_exclusive (mode, cond, mem,
14833 gen_lowpart (mode, new_out), model_rtx);
14834
14835 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14836 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14837 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14838 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14839
14840 /* Emit any final barrier needed for a __sync operation. */
14841 if (is_sync)
14842 aarch64_emit_post_barrier (model);
14843 }
14844
14845 static void
14846 aarch64_init_libfuncs (void)
14847 {
14848 /* Half-precision float operations. The compiler handles all operations
14849 with NULL libfuncs by converting to SFmode. */
14850
14851 /* Conversions. */
14852 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14853 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14854
14855 /* Arithmetic. */
14856 set_optab_libfunc (add_optab, HFmode, NULL);
14857 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14858 set_optab_libfunc (smul_optab, HFmode, NULL);
14859 set_optab_libfunc (neg_optab, HFmode, NULL);
14860 set_optab_libfunc (sub_optab, HFmode, NULL);
14861
14862 /* Comparisons. */
14863 set_optab_libfunc (eq_optab, HFmode, NULL);
14864 set_optab_libfunc (ne_optab, HFmode, NULL);
14865 set_optab_libfunc (lt_optab, HFmode, NULL);
14866 set_optab_libfunc (le_optab, HFmode, NULL);
14867 set_optab_libfunc (ge_optab, HFmode, NULL);
14868 set_optab_libfunc (gt_optab, HFmode, NULL);
14869 set_optab_libfunc (unord_optab, HFmode, NULL);
14870 }
14871
14872 /* Target hook for c_mode_for_suffix. */
14873 static machine_mode
14874 aarch64_c_mode_for_suffix (char suffix)
14875 {
14876 if (suffix == 'q')
14877 return TFmode;
14878
14879 return VOIDmode;
14880 }
14881
14882 /* We can only represent floating point constants which will fit in
14883 "quarter-precision" values. These values are characterised by
 14884 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14885 by:
14886
14887 (-1)^s * (n/16) * 2^r
14888
14889 Where:
14890 's' is the sign bit.
14891 'n' is an integer in the range 16 <= n <= 31.
14892 'r' is an integer in the range -3 <= r <= 4. */
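 /* For example (worked instances of the formula above): 0.125 is
    (+1) * (16/16) * 2^-3, 2.5 is (+1) * (20/16) * 2^1, and the largest
    representable magnitude is (31/16) * 2^4 = 31.0.  */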
14893
14894 /* Return true iff X can be represented by a quarter-precision
 14895 floating point immediate operand. Note, we cannot represent 0.0. */
14896 bool
14897 aarch64_float_const_representable_p (rtx x)
14898 {
14899 /* This represents our current view of how many bits
14900 make up the mantissa. */
14901 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14902 int exponent;
14903 unsigned HOST_WIDE_INT mantissa, mask;
14904 REAL_VALUE_TYPE r, m;
14905 bool fail;
14906
14907 if (!CONST_DOUBLE_P (x))
14908 return false;
14909
14910 /* We don't support HFmode constants yet. */
14911 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14912 return false;
14913
14914 r = *CONST_DOUBLE_REAL_VALUE (x);
14915
14916 /* We cannot represent infinities, NaNs or +/-zero. We won't
14917 know if we have +zero until we analyse the mantissa, but we
14918 can reject the other invalid values. */
14919 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14920 || REAL_VALUE_MINUS_ZERO (r))
14921 return false;
14922
14923 /* Extract exponent. */
14924 r = real_value_abs (&r);
14925 exponent = REAL_EXP (&r);
14926
14927 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14928 highest (sign) bit, with a fixed binary point at bit point_pos.
 14929 The low half of W holds the low part of the mantissa, the high half the high part.
14930 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14931 bits for the mantissa, this can fail (low bits will be lost). */
14932 real_ldexp (&m, &r, point_pos - exponent);
14933 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14934
14935 /* If the low part of the mantissa has bits set we cannot represent
14936 the value. */
14937 if (w.ulow () != 0)
14938 return false;
14939 /* We have rejected the lower HOST_WIDE_INT, so update our
14940 understanding of how many bits lie in the mantissa and
14941 look only at the high HOST_WIDE_INT. */
14942 mantissa = w.elt (1);
14943 point_pos -= HOST_BITS_PER_WIDE_INT;
14944
14945 /* We can only represent values with a mantissa of the form 1.xxxx. */
14946 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14947 if ((mantissa & mask) != 0)
14948 return false;
14949
14950 /* Having filtered unrepresentable values, we may now remove all
14951 but the highest 5 bits. */
14952 mantissa >>= point_pos - 5;
14953
14954 /* We cannot represent the value 0.0, so reject it. This is handled
14955 elsewhere. */
14956 if (mantissa == 0)
14957 return false;
14958
14959 /* Then, as bit 4 is always set, we can mask it off, leaving
14960 the mantissa in the range [0, 15]. */
14961 mantissa &= ~(1 << 4);
14962 gcc_assert (mantissa <= 15);
14963
14964 /* GCC internally does not use IEEE754-like encoding (where normalized
14965 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14966 Our mantissa values are shifted 4 places to the left relative to
14967 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14968 by 5 places to correct for GCC's representation. */
14969 exponent = 5 - exponent;
14970
14971 return (exponent >= 0 && exponent <= 7);
14972 }
14973
14974 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14975 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14976 output MOVI/MVNI, ORR or BIC immediate. */
14977 char*
14978 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14979 enum simd_immediate_check which)
14980 {
14981 bool is_valid;
14982 static char templ[40];
14983 const char *mnemonic;
14984 const char *shift_op;
14985 unsigned int lane_count = 0;
14986 char element_char;
14987
14988 struct simd_immediate_info info;
14989
14990 /* This will return true to show const_vector is legal for use as either
 14991 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14992 It will also update INFO to show how the immediate should be generated.
14993 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14994 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14995 gcc_assert (is_valid);
14996
14997 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14998 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14999
15000 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15001 {
15002 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15003 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15004 move immediate path. */
15005 if (aarch64_float_const_zero_rtx_p (info.value))
15006 info.value = GEN_INT (0);
15007 else
15008 {
15009 const unsigned int buf_size = 20;
15010 char float_buf[buf_size] = {'\0'};
15011 real_to_decimal_for_mode (float_buf,
15012 CONST_DOUBLE_REAL_VALUE (info.value),
15013 buf_size, buf_size, 1, info.elt_mode);
15014
15015 if (lane_count == 1)
15016 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15017 else
15018 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15019 lane_count, element_char, float_buf);
15020 return templ;
15021 }
15022 }
15023
15024 gcc_assert (CONST_INT_P (info.value));
15025
15026 if (which == AARCH64_CHECK_MOV)
15027 {
15028 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15029 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15030 if (lane_count == 1)
15031 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15032 mnemonic, UINTVAL (info.value));
15033 else if (info.shift)
15034 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15035 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15036 element_char, UINTVAL (info.value), shift_op, info.shift);
15037 else
15038 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15039 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15040 element_char, UINTVAL (info.value));
15041 }
15042 else
15043 {
15044 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15045 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15046 if (info.shift)
15047 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15048 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15049 element_char, UINTVAL (info.value), "lsl", info.shift);
15050 else
15051 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15052 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15053 element_char, UINTVAL (info.value));
15054 }
15055 return templ;
15056 }
15057
15058 char*
15059 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15060 {
15061
15062 /* If a floating point number was passed and we desire to use it in an
 15063 integer mode, do the conversion to integer. */
15064 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15065 {
15066 unsigned HOST_WIDE_INT ival;
15067 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15068 gcc_unreachable ();
15069 immediate = gen_int_mode (ival, mode);
15070 }
15071
15072 machine_mode vmode;
 15073 /* Use a 64 bit mode for everything except for DI/DF mode, where we use
15074 a 128 bit vector mode. */
15075 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15076
15077 vmode = aarch64_simd_container_mode (mode, width);
15078 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15079 return aarch64_output_simd_mov_immediate (v_op, width);
15080 }
15081
15082 /* Return the output string to use for moving immediate CONST_VECTOR
15083 into an SVE register. */
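 /* Illustrative examples of the templates built below: a stepped constant
    such as { 0, 2, 4, ... } becomes "index\t%0.s, #0, #2", a duplicated
    integer becomes "mov\t%0.<size>, #<imm>", and a non-zero duplicated
    float becomes "fmov\t%0.<size>, #<value>"; FP zero is converted to
    integer 0 and takes the MOV path.  */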
15084
15085 char *
15086 aarch64_output_sve_mov_immediate (rtx const_vector)
15087 {
15088 static char templ[40];
15089 struct simd_immediate_info info;
15090 char element_char;
15091
15092 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15093 gcc_assert (is_valid);
15094
15095 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15096
15097 if (info.step)
15098 {
15099 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15100 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15101 element_char, INTVAL (info.value), INTVAL (info.step));
15102 return templ;
15103 }
15104
15105 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15106 {
15107 if (aarch64_float_const_zero_rtx_p (info.value))
15108 info.value = GEN_INT (0);
15109 else
15110 {
15111 const int buf_size = 20;
15112 char float_buf[buf_size] = {};
15113 real_to_decimal_for_mode (float_buf,
15114 CONST_DOUBLE_REAL_VALUE (info.value),
15115 buf_size, buf_size, 1, info.elt_mode);
15116
15117 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15118 element_char, float_buf);
15119 return templ;
15120 }
15121 }
15122
15123 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15124 element_char, INTVAL (info.value));
15125 return templ;
15126 }
15127
15128 /* Return the asm format for a PTRUE instruction whose destination has
15129 mode MODE. SUFFIX is the element size suffix. */
15130
15131 char *
15132 aarch64_output_ptrue (machine_mode mode, char suffix)
15133 {
15134 unsigned int nunits;
15135 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15136 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15137 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15138 else
15139 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15140 return buf;
15141 }
15142
15143 /* Split operands into moves from op[1] + op[2] into op[0]. */
15144
15145 void
15146 aarch64_split_combinev16qi (rtx operands[3])
15147 {
15148 unsigned int dest = REGNO (operands[0]);
15149 unsigned int src1 = REGNO (operands[1]);
15150 unsigned int src2 = REGNO (operands[2]);
15151 machine_mode halfmode = GET_MODE (operands[1]);
15152 unsigned int halfregs = REG_NREGS (operands[1]);
15153 rtx destlo, desthi;
15154
15155 gcc_assert (halfmode == V16QImode);
15156
15157 if (src1 == dest && src2 == dest + halfregs)
15158 {
15159 /* No-op move. Can't split to nothing; emit something. */
15160 emit_note (NOTE_INSN_DELETED);
15161 return;
15162 }
15163
15164 /* Preserve register attributes for variable tracking. */
15165 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15166 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15167 GET_MODE_SIZE (halfmode));
15168
15169 /* Special case of reversed high/low parts. */
15170 if (reg_overlap_mentioned_p (operands[2], destlo)
15171 && reg_overlap_mentioned_p (operands[1], desthi))
15172 {
15173 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15174 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15175 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15176 }
15177 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15178 {
15179 /* Try to avoid unnecessary moves if part of the result
15180 is in the right place already. */
15181 if (src1 != dest)
15182 emit_move_insn (destlo, operands[1]);
15183 if (src2 != dest + halfregs)
15184 emit_move_insn (desthi, operands[2]);
15185 }
15186 else
15187 {
15188 if (src2 != dest + halfregs)
15189 emit_move_insn (desthi, operands[2]);
15190 if (src1 != dest)
15191 emit_move_insn (destlo, operands[1]);
15192 }
15193 }
15194
15195 /* vec_perm support. */
15196
15197 struct expand_vec_perm_d
15198 {
15199 rtx target, op0, op1;
15200 vec_perm_indices perm;
15201 machine_mode vmode;
15202 unsigned int vec_flags;
15203 bool one_vector_p;
15204 bool testing_p;
15205 };
15206
15207 /* Generate a variable permutation. */
15208
15209 static void
15210 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15211 {
15212 machine_mode vmode = GET_MODE (target);
15213 bool one_vector_p = rtx_equal_p (op0, op1);
15214
15215 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15216 gcc_checking_assert (GET_MODE (op0) == vmode);
15217 gcc_checking_assert (GET_MODE (op1) == vmode);
15218 gcc_checking_assert (GET_MODE (sel) == vmode);
15219 gcc_checking_assert (TARGET_SIMD);
15220
15221 if (one_vector_p)
15222 {
15223 if (vmode == V8QImode)
15224 {
15225 /* Expand the argument to a V16QI mode by duplicating it. */
15226 rtx pair = gen_reg_rtx (V16QImode);
15227 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15228 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15229 }
15230 else
15231 {
15232 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15233 }
15234 }
15235 else
15236 {
15237 rtx pair;
15238
15239 if (vmode == V8QImode)
15240 {
15241 pair = gen_reg_rtx (V16QImode);
15242 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15243 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15244 }
15245 else
15246 {
15247 pair = gen_reg_rtx (OImode);
15248 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15249 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15250 }
15251 }
15252 }
15253
15254 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15255 NELT is the number of elements in the vector. */
15256
15257 void
15258 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15259 unsigned int nelt)
15260 {
15261 machine_mode vmode = GET_MODE (target);
15262 bool one_vector_p = rtx_equal_p (op0, op1);
15263 rtx mask;
15264
15265 /* The TBL instruction does not use a modulo index, so we must take care
15266 of that ourselves. */
15267 mask = aarch64_simd_gen_const_vector_dup (vmode,
15268 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15269 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15270
15271 /* For big-endian, we also need to reverse the index within the vector
15272 (but not which vector). */
15273 if (BYTES_BIG_ENDIAN)
15274 {
15275 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15276 if (!one_vector_p)
15277 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15278 sel = expand_simple_binop (vmode, XOR, sel, mask,
15279 NULL, 0, OPTAB_LIB_WIDEN);
15280 }
15281 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15282 }
15283
15284 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15285
15286 static void
15287 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15288 {
15289 emit_insn (gen_rtx_SET (target,
15290 gen_rtx_UNSPEC (GET_MODE (target),
15291 gen_rtvec (2, op0, op1), code)));
15292 }
15293
15294 /* Expand an SVE vec_perm with the given operands. */
15295
15296 void
15297 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15298 {
15299 machine_mode data_mode = GET_MODE (target);
15300 machine_mode sel_mode = GET_MODE (sel);
15301 /* Enforced by the pattern condition. */
15302 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15303
15304 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15305 size of the two value vectors, i.e. the upper bits of the indices
15306 are effectively ignored. SVE TBL instead produces 0 for any
15307 out-of-range indices, so we need to modulo all the vec_perm indices
15308 to ensure they are all in range. */
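   /* Illustrative example of the general two-vector case handled at the end
      of this function: with 4-element vectors and SEL = { 0, 5, 2, 7 },
      TBL (OP0, SEL) yields OP0 elements for indices 0 and 2 and zero for the
      out-of-range 5 and 7, TBL (OP1, SEL - 4) yields OP1 elements 1 and 3
      and zero elsewhere, and the two results are then ORed together.  */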
15309 rtx sel_reg = force_reg (sel_mode, sel);
15310
15311 /* Check if the sel only references the first values vector. */
15312 if (GET_CODE (sel) == CONST_VECTOR
15313 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15314 {
15315 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15316 return;
15317 }
15318
15319 /* Check if the two values vectors are the same. */
15320 if (rtx_equal_p (op0, op1))
15321 {
15322 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15323 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15324 NULL, 0, OPTAB_DIRECT);
15325 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15326 return;
15327 }
15328
 15329 /* Run TBL on each value vector and combine the results. */
15330
15331 rtx res0 = gen_reg_rtx (data_mode);
15332 rtx res1 = gen_reg_rtx (data_mode);
15333 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15334 if (GET_CODE (sel) != CONST_VECTOR
15335 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15336 {
15337 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15338 2 * nunits - 1);
15339 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15340 NULL, 0, OPTAB_DIRECT);
15341 }
15342 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15343 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15344 NULL, 0, OPTAB_DIRECT);
15345 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15346 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15347 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15348 else
15349 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15350 }
15351
15352 /* Recognize patterns suitable for the TRN instructions. */
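 /* Illustrative example (little-endian): on two 4-element vectors, TRN1
    corresponds to the permutation { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 },
    i.e. interleaving the even (respectively odd) numbered lanes of the
    two inputs.  */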
15353 static bool
15354 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15355 {
15356 HOST_WIDE_INT odd;
15357 poly_uint64 nelt = d->perm.length ();
15358 rtx out, in0, in1, x;
15359 machine_mode vmode = d->vmode;
15360
15361 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15362 return false;
15363
15364 /* Note that these are little-endian tests.
15365 We correct for big-endian later. */
15366 if (!d->perm[0].is_constant (&odd)
15367 || (odd != 0 && odd != 1)
15368 || !d->perm.series_p (0, 2, odd, 2)
15369 || !d->perm.series_p (1, 2, nelt + odd, 2))
15370 return false;
15371
15372 /* Success! */
15373 if (d->testing_p)
15374 return true;
15375
15376 in0 = d->op0;
15377 in1 = d->op1;
15378 /* We don't need a big-endian lane correction for SVE; see the comment
15379 at the head of aarch64-sve.md for details. */
15380 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15381 {
15382 x = in0, in0 = in1, in1 = x;
15383 odd = !odd;
15384 }
15385 out = d->target;
15386
15387 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15388 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15389 return true;
15390 }
15391
15392 /* Recognize patterns suitable for the UZP instructions. */
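 /* Illustrative example (little-endian): on two 4-element vectors, UZP1
    corresponds to the permutation { 0, 2, 4, 6 } (the even-numbered lanes
    of the concatenated inputs) and UZP2 to { 1, 3, 5, 7 }.  */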
15393 static bool
15394 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15395 {
15396 HOST_WIDE_INT odd;
15397 rtx out, in0, in1, x;
15398 machine_mode vmode = d->vmode;
15399
15400 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15401 return false;
15402
15403 /* Note that these are little-endian tests.
15404 We correct for big-endian later. */
15405 if (!d->perm[0].is_constant (&odd)
15406 || (odd != 0 && odd != 1)
15407 || !d->perm.series_p (0, 1, odd, 2))
15408 return false;
15409
15410 /* Success! */
15411 if (d->testing_p)
15412 return true;
15413
15414 in0 = d->op0;
15415 in1 = d->op1;
15416 /* We don't need a big-endian lane correction for SVE; see the comment
15417 at the head of aarch64-sve.md for details. */
15418 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15419 {
15420 x = in0, in0 = in1, in1 = x;
15421 odd = !odd;
15422 }
15423 out = d->target;
15424
15425 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15426 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15427 return true;
15428 }
15429
15430 /* Recognize patterns suitable for the ZIP instructions. */
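 /* Illustrative example (little-endian): on two 4-element vectors, ZIP1
    corresponds to the permutation { 0, 4, 1, 5 } (interleaving the low
    halves of the inputs) and ZIP2 to { 2, 6, 3, 7 } (the high halves).  */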
15431 static bool
15432 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15433 {
15434 unsigned int high;
15435 poly_uint64 nelt = d->perm.length ();
15436 rtx out, in0, in1, x;
15437 machine_mode vmode = d->vmode;
15438
15439 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15440 return false;
15441
15442 /* Note that these are little-endian tests.
15443 We correct for big-endian later. */
15444 poly_uint64 first = d->perm[0];
15445 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15446 || !d->perm.series_p (0, 2, first, 1)
15447 || !d->perm.series_p (1, 2, first + nelt, 1))
15448 return false;
15449 high = maybe_ne (first, 0U);
15450
15451 /* Success! */
15452 if (d->testing_p)
15453 return true;
15454
15455 in0 = d->op0;
15456 in1 = d->op1;
15457 /* We don't need a big-endian lane correction for SVE; see the comment
15458 at the head of aarch64-sve.md for details. */
15459 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15460 {
15461 x = in0, in0 = in1, in1 = x;
15462 high = !high;
15463 }
15464 out = d->target;
15465
15466 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15467 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15468 return true;
15469 }
15470
15471 /* Recognize patterns for the EXT insn. */
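 /* Illustrative example: on two 4-element vectors, the permutation
    { 1, 2, 3, 4 } takes the last three elements of the first vector
    followed by the first element of the second, which EXT implements
    with an extraction offset of one element.  */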
15472
15473 static bool
15474 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15475 {
15476 HOST_WIDE_INT location;
15477 rtx offset;
15478
15479 /* The first element always refers to the first vector.
15480 Check if the extracted indices are increasing by one. */
15481 if (d->vec_flags == VEC_SVE_PRED
15482 || !d->perm[0].is_constant (&location)
15483 || !d->perm.series_p (0, 1, location, 1))
15484 return false;
15485
15486 /* Success! */
15487 if (d->testing_p)
15488 return true;
15489
15490 /* The case where (location == 0) is a no-op for both big- and little-endian,
15491 and is removed by the mid-end at optimization levels -O1 and higher.
15492
15493 We don't need a big-endian lane correction for SVE; see the comment
15494 at the head of aarch64-sve.md for details. */
15495 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15496 {
15497 /* After setup, we want the high elements of the first vector (stored
15498 at the LSB end of the register), and the low elements of the second
15499 vector (stored at the MSB end of the register). So swap. */
15500 std::swap (d->op0, d->op1);
15501 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15502 to_constant () is safe since this is restricted to Advanced SIMD
15503 vectors. */
15504 location = d->perm.length ().to_constant () - location;
15505 }
15506
15507 offset = GEN_INT (location);
15508 emit_set_insn (d->target,
15509 gen_rtx_UNSPEC (d->vmode,
15510 gen_rtvec (3, d->op0, d->op1, offset),
15511 UNSPEC_EXT));
15512 return true;
15513 }
15514
15515 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15516 within each 64-bit, 32-bit or 16-bit granule. */
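 /* Illustrative example: for a vector of bytes, REV32 corresponds to the
    permutation { 3, 2, 1, 0, 7, 6, 5, 4, ... }, i.e. reversing the four
    bytes within each 32-bit granule.  */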
15517
15518 static bool
15519 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15520 {
15521 HOST_WIDE_INT diff;
15522 unsigned int i, size, unspec;
15523 machine_mode pred_mode;
15524
15525 if (d->vec_flags == VEC_SVE_PRED
15526 || !d->one_vector_p
15527 || !d->perm[0].is_constant (&diff))
15528 return false;
15529
15530 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15531 if (size == 8)
15532 {
15533 unspec = UNSPEC_REV64;
15534 pred_mode = VNx2BImode;
15535 }
15536 else if (size == 4)
15537 {
15538 unspec = UNSPEC_REV32;
15539 pred_mode = VNx4BImode;
15540 }
15541 else if (size == 2)
15542 {
15543 unspec = UNSPEC_REV16;
15544 pred_mode = VNx8BImode;
15545 }
15546 else
15547 return false;
15548
15549 unsigned int step = diff + 1;
15550 for (i = 0; i < step; ++i)
15551 if (!d->perm.series_p (i, step, diff - i, step))
15552 return false;
15553
15554 /* Success! */
15555 if (d->testing_p)
15556 return true;
15557
15558 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15559 if (d->vec_flags == VEC_SVE_DATA)
15560 {
15561 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15562 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15563 UNSPEC_MERGE_PTRUE);
15564 }
15565 emit_set_insn (d->target, src);
15566 return true;
15567 }
15568
15569 /* Recognize patterns for the REV insn, which reverses elements within
15570 a full vector. */
15571
15572 static bool
15573 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15574 {
15575 poly_uint64 nelt = d->perm.length ();
15576
15577 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15578 return false;
15579
15580 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15581 return false;
15582
15583 /* Success! */
15584 if (d->testing_p)
15585 return true;
15586
15587 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15588 emit_set_insn (d->target, src);
15589 return true;
15590 }
15591
15592 static bool
15593 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15594 {
15595 rtx out = d->target;
15596 rtx in0;
15597 HOST_WIDE_INT elt;
15598 machine_mode vmode = d->vmode;
15599 rtx lane;
15600
15601 if (d->vec_flags == VEC_SVE_PRED
15602 || d->perm.encoding ().encoded_nelts () != 1
15603 || !d->perm[0].is_constant (&elt))
15604 return false;
15605
15606 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15607 return false;
15608
15609 /* Success! */
15610 if (d->testing_p)
15611 return true;
15612
15613 /* The generic preparation in aarch64_expand_vec_perm_const_1
15614 swaps the operand order and the permute indices if it finds
15615 d->perm[0] to be in the second operand. Thus, we can always
15616 use d->op0 and need not do any extra arithmetic to get the
15617 correct lane number. */
15618 in0 = d->op0;
15619 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15620
15621 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15622 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15623 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15624 return true;
15625 }
15626
15627 static bool
15628 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15629 {
15630 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15631 machine_mode vmode = d->vmode;
15632
15633 /* Make sure that the indices are constant. */
15634 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15635 for (unsigned int i = 0; i < encoded_nelts; ++i)
15636 if (!d->perm[i].is_constant ())
15637 return false;
15638
15639 if (d->testing_p)
15640 return true;
15641
15642 /* Generic code will try constant permutation twice. Once with the
15643 original mode and again with the elements lowered to QImode.
15644 So wait and don't do the selector expansion ourselves. */
15645 if (vmode != V8QImode && vmode != V16QImode)
15646 return false;
15647
15648 /* to_constant is safe since this routine is specific to Advanced SIMD
15649 vectors. */
15650 unsigned int nelt = d->perm.length ().to_constant ();
15651 for (unsigned int i = 0; i < nelt; ++i)
15652 /* If big-endian and two vectors we end up with a weird mixed-endian
15653 mode on NEON. Reverse the index within each word but not the word
15654 itself. to_constant is safe because we checked is_constant above. */
15655 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15656 ? d->perm[i].to_constant () ^ (nelt - 1)
15657 : d->perm[i].to_constant ());
15658
15659 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15660 sel = force_reg (vmode, sel);
15661
15662 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15663 return true;
15664 }
15665
15666 /* Try to implement D using an SVE TBL instruction. */
15667
15668 static bool
15669 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15670 {
15671 unsigned HOST_WIDE_INT nelt;
15672
15673 /* Permuting two variable-length vectors could overflow the
15674 index range. */
15675 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15676 return false;
15677
15678 if (d->testing_p)
15679 return true;
15680
15681 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15682 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15683 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15684 return true;
15685 }
15686
15687 static bool
15688 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15689 {
15690 /* The pattern matching functions above are written to look for a small
15691 number to begin the sequence (0, 1, N/2). If we begin with an index
15692 from the second operand, we can swap the operands. */
15693 poly_int64 nelt = d->perm.length ();
15694 if (known_ge (d->perm[0], nelt))
15695 {
15696 d->perm.rotate_inputs (1);
15697 std::swap (d->op0, d->op1);
15698 }
15699
15700 if ((d->vec_flags == VEC_ADVSIMD
15701 || d->vec_flags == VEC_SVE_DATA
15702 || d->vec_flags == VEC_SVE_PRED)
15703 && known_gt (nelt, 1))
15704 {
15705 if (aarch64_evpc_rev_local (d))
15706 return true;
15707 else if (aarch64_evpc_rev_global (d))
15708 return true;
15709 else if (aarch64_evpc_ext (d))
15710 return true;
15711 else if (aarch64_evpc_dup (d))
15712 return true;
15713 else if (aarch64_evpc_zip (d))
15714 return true;
15715 else if (aarch64_evpc_uzp (d))
15716 return true;
15717 else if (aarch64_evpc_trn (d))
15718 return true;
15719 if (d->vec_flags == VEC_SVE_DATA)
15720 return aarch64_evpc_sve_tbl (d);
 15721 else if (d->vec_flags == VEC_ADVSIMD)
15722 return aarch64_evpc_tbl (d);
15723 }
15724 return false;
15725 }
15726
15727 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15728
15729 static bool
15730 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15731 rtx op1, const vec_perm_indices &sel)
15732 {
15733 struct expand_vec_perm_d d;
15734
15735 /* Check whether the mask can be applied to a single vector. */
15736 if (op0 && rtx_equal_p (op0, op1))
15737 d.one_vector_p = true;
15738 else if (sel.all_from_input_p (0))
15739 {
15740 d.one_vector_p = true;
15741 op1 = op0;
15742 }
15743 else if (sel.all_from_input_p (1))
15744 {
15745 d.one_vector_p = true;
15746 op0 = op1;
15747 }
15748 else
15749 d.one_vector_p = false;
15750
15751 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15752 sel.nelts_per_input ());
15753 d.vmode = vmode;
15754 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15755 d.target = target;
15756 d.op0 = op0;
15757 d.op1 = op1;
15758 d.testing_p = !target;
15759
15760 if (!d.testing_p)
15761 return aarch64_expand_vec_perm_const_1 (&d);
15762
15763 rtx_insn *last = get_last_insn ();
15764 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15765 gcc_assert (last == get_last_insn ());
15766
15767 return ret;
15768 }
15769
15770 /* Generate a byte permute mask for a register of mode MODE,
15771 which has NUNITS units. */
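 /* For example (illustrative): for V8HImode the generated mask is
    { 1, 0, 3, 2, 5, 4, ... }, i.e. the two bytes of each 16-bit unit are
    selected in reverse order.  */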
15772
15773 rtx
15774 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15775 {
 15776 /* We have to reverse each vector because we don't have
15777 a permuted load that can reverse-load according to ABI rules. */
15778 rtx mask;
15779 rtvec v = rtvec_alloc (16);
15780 unsigned int i, j;
15781 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15782
15783 gcc_assert (BYTES_BIG_ENDIAN);
15784 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15785
15786 for (i = 0; i < nunits; i++)
15787 for (j = 0; j < usize; j++)
15788 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15789 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15790 return force_reg (V16QImode, mask);
15791 }
15792
15793 /* Return true if X is a valid second operand for the SVE instruction
15794 that implements integer comparison OP_CODE. */
15795
15796 static bool
15797 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15798 {
15799 if (register_operand (x, VOIDmode))
15800 return true;
15801
15802 switch (op_code)
15803 {
15804 case LTU:
15805 case LEU:
15806 case GEU:
15807 case GTU:
15808 return aarch64_sve_cmp_immediate_p (x, false);
15809 case LT:
15810 case LE:
15811 case GE:
15812 case GT:
15813 case NE:
15814 case EQ:
15815 return aarch64_sve_cmp_immediate_p (x, true);
15816 default:
15817 gcc_unreachable ();
15818 }
15819 }
15820
15821 /* Use predicated SVE instructions to implement the equivalent of:
15822
15823 (set TARGET OP)
15824
15825 given that PTRUE is an all-true predicate of the appropriate mode. */
15826
15827 static void
15828 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15829 {
15830 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15831 gen_rtvec (2, ptrue, op),
15832 UNSPEC_MERGE_PTRUE);
15833 rtx_insn *insn = emit_set_insn (target, unspec);
15834 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15835 }
15836
15837 /* Likewise, but also clobber the condition codes. */
15838
15839 static void
15840 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15841 {
15842 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15843 gen_rtvec (2, ptrue, op),
15844 UNSPEC_MERGE_PTRUE);
15845 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15846 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15847 }
15848
15849 /* Return the UNSPEC_COND_* code for comparison CODE. */
15850
15851 static unsigned int
15852 aarch64_unspec_cond_code (rtx_code code)
15853 {
15854 switch (code)
15855 {
15856 case NE:
15857 return UNSPEC_COND_NE;
15858 case EQ:
15859 return UNSPEC_COND_EQ;
15860 case LT:
15861 return UNSPEC_COND_LT;
15862 case GT:
15863 return UNSPEC_COND_GT;
15864 case LE:
15865 return UNSPEC_COND_LE;
15866 case GE:
15867 return UNSPEC_COND_GE;
15868 default:
15869 gcc_unreachable ();
15870 }
15871 }
15872
15873 /* Emit:
15874
15875 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15876
15877 where <X> is the operation associated with comparison CODE. This form
15878 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15879 semantics, such as when PRED might not be all-true and when comparing
15880 inactive lanes could have side effects. */
15881
15882 static void
15883 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15884 rtx pred, rtx op0, rtx op1)
15885 {
15886 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15887 gen_rtvec (3, pred, op0, op1),
15888 aarch64_unspec_cond_code (code));
15889 emit_set_insn (target, unspec);
15890 }
15891
15892 /* Expand an SVE integer comparison using the SVE equivalent of:
15893
15894 (set TARGET (CODE OP0 OP1)). */
15895
15896 void
15897 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15898 {
15899 machine_mode pred_mode = GET_MODE (target);
15900 machine_mode data_mode = GET_MODE (op0);
15901
15902 if (!aarch64_sve_cmp_operand_p (code, op1))
15903 op1 = force_reg (data_mode, op1);
15904
15905 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15906 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15907 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15908 }
15909
15910 /* Emit the SVE equivalent of:
15911
15912 (set TMP1 (CODE1 OP0 OP1))
15913 (set TMP2 (CODE2 OP0 OP1))
15914 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15915
15916 PTRUE is an all-true predicate with the same mode as TARGET. */
15917
15918 static void
15919 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15920 rtx ptrue, rtx op0, rtx op1)
15921 {
15922 machine_mode pred_mode = GET_MODE (ptrue);
15923 rtx tmp1 = gen_reg_rtx (pred_mode);
15924 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15925 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15926 rtx tmp2 = gen_reg_rtx (pred_mode);
15927 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15928 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15929 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15930 }
15931
15932 /* Emit the SVE equivalent of:
15933
15934 (set TMP (CODE OP0 OP1))
15935 (set TARGET (not TMP))
15936
15937 PTRUE is an all-true predicate with the same mode as TARGET. */
15938
15939 static void
15940 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15941 rtx op0, rtx op1)
15942 {
15943 machine_mode pred_mode = GET_MODE (ptrue);
15944 rtx tmp = gen_reg_rtx (pred_mode);
15945 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15946 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15947 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15948 }
15949
15950 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15951
15952 (set TARGET (CODE OP0 OP1))
15953
15954 If CAN_INVERT_P is true, the caller can also handle inverted results;
15955 return true if the result is in fact inverted. */
15956
15957 bool
15958 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15959 rtx op0, rtx op1, bool can_invert_p)
15960 {
15961 machine_mode pred_mode = GET_MODE (target);
15962 machine_mode data_mode = GET_MODE (op0);
15963
15964 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15965 switch (code)
15966 {
15967 case UNORDERED:
15968 /* UNORDERED has no immediate form. */
15969 op1 = force_reg (data_mode, op1);
15970 /* fall through */
15971 case LT:
15972 case LE:
15973 case GT:
15974 case GE:
15975 case EQ:
15976 case NE:
15977 {
15978 /* There is native support for the comparison. */
15979 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15980 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15981 return false;
15982 }
15983
15984 case LTGT:
15985 /* This is a trapping operation (LT or GT). */
15986 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15987 return false;
15988
15989 case UNEQ:
15990 if (!flag_trapping_math)
15991 {
15992 /* This would trap for signaling NaNs. */
15993 op1 = force_reg (data_mode, op1);
15994 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15995 return false;
15996 }
15997 /* fall through */
15998 case UNLT:
15999 case UNLE:
16000 case UNGT:
16001 case UNGE:
16002 if (flag_trapping_math)
16003 {
16004 /* Work out which elements are ordered. */
16005 rtx ordered = gen_reg_rtx (pred_mode);
16006 op1 = force_reg (data_mode, op1);
16007 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16008
16009 /* Test the opposite condition for the ordered elements,
16010 then invert the result. */
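	  /* For example (illustrative): for UNGE the ordered lanes are
	     tested with LT and the result is inverted, yielding
	     "unordered or >=" as required.  */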
16011 if (code == UNEQ)
16012 code = NE;
16013 else
16014 code = reverse_condition_maybe_unordered (code);
16015 if (can_invert_p)
16016 {
16017 aarch64_emit_sve_predicated_cond (target, code,
16018 ordered, op0, op1);
16019 return true;
16020 }
16021 rtx tmp = gen_reg_rtx (pred_mode);
16022 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16023 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16024 return false;
16025 }
16026 break;
16027
16028 case ORDERED:
16029 /* ORDERED has no immediate form. */
16030 op1 = force_reg (data_mode, op1);
16031 break;
16032
16033 default:
16034 gcc_unreachable ();
16035 }
16036
16037 /* There is native support for the inverse comparison. */
16038 code = reverse_condition_maybe_unordered (code);
16039 if (can_invert_p)
16040 {
16041 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16042 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16043 return true;
16044 }
16045 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16046 return false;
16047 }
16048
16049 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16050 of the data being selected and CMP_MODE is the mode of the values being
16051 compared. */
16052
16053 void
16054 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16055 rtx *ops)
16056 {
16057 machine_mode pred_mode
16058 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16059 GET_MODE_SIZE (cmp_mode)).require ();
16060 rtx pred = gen_reg_rtx (pred_mode);
16061 if (FLOAT_MODE_P (cmp_mode))
16062 {
16063 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16064 ops[4], ops[5], true))
16065 std::swap (ops[1], ops[2]);
16066 }
16067 else
16068 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16069
16070 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16071 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16072 }
16073
16074 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
 16075 true. However, due to issues with register allocation it is preferable
 16076 to avoid tying integer scalar and FP scalar modes. Executing integer
16077 operations in general registers is better than treating them as scalar
16078 vector operations. This reduces latency and avoids redundant int<->FP
16079 moves. So tie modes if they are either the same class, or vector modes
16080 with other vector modes, vector structs or any scalar mode. */
16081
16082 static bool
16083 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16084 {
16085 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16086 return true;
16087
16088 /* We specifically want to allow elements of "structure" modes to
16089 be tieable to the structure. This more general condition allows
16090 other rarer situations too. The reason we don't extend this to
16091 predicate modes is that there are no predicate structure modes
16092 nor any specific instructions for extracting part of a predicate
16093 register. */
16094 if (aarch64_vector_data_mode_p (mode1)
16095 && aarch64_vector_data_mode_p (mode2))
16096 return true;
16097
16098 /* Also allow any scalar modes with vectors. */
16099 if (aarch64_vector_mode_supported_p (mode1)
16100 || aarch64_vector_mode_supported_p (mode2))
16101 return true;
16102
16103 return false;
16104 }
16105
16106 /* Return a new RTX holding the result of moving POINTER forward by
16107 AMOUNT bytes. */
16108
16109 static rtx
16110 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16111 {
16112 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16113
16114 return adjust_automodify_address (pointer, GET_MODE (pointer),
16115 next, amount);
16116 }
16117
16118 /* Return a new RTX holding the result of moving POINTER forward by the
16119 size of the mode it points to. */
16120
16121 static rtx
16122 aarch64_progress_pointer (rtx pointer)
16123 {
16124 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16125 }
16126
16127 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16128 MODE bytes. */
16129
16130 static void
16131 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16132 machine_mode mode)
16133 {
16134 rtx reg = gen_reg_rtx (mode);
16135
16136 /* "Cast" the pointers to the correct mode. */
16137 *src = adjust_address (*src, mode, 0);
16138 *dst = adjust_address (*dst, mode, 0);
16139 /* Emit the memcpy. */
16140 emit_move_insn (reg, *src);
16141 emit_move_insn (*dst, reg);
16142 /* Move the pointers forward. */
16143 *src = aarch64_progress_pointer (*src);
16144 *dst = aarch64_progress_pointer (*dst);
16145 }
16146
16147 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16148 we succeed, otherwise return false. */
16149
16150 bool
16151 aarch64_expand_movmem (rtx *operands)
16152 {
16153 int n, mode_bits;
16154 rtx dst = operands[0];
16155 rtx src = operands[1];
16156 rtx base;
16157 machine_mode cur_mode = BLKmode, next_mode;
16158 bool speed_p = !optimize_function_for_size_p (cfun);
16159
16160 /* When optimizing for size, give a better estimate of the length of a
16161 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16162 will always require an even number of instructions to do now. And each
 16163 operation requires both a load and a store, so divide the max number by 2. */
16164 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16165
16166 /* We can't do anything smart if the amount to copy is not constant. */
16167 if (!CONST_INT_P (operands[2]))
16168 return false;
16169
16170 n = INTVAL (operands[2]);
16171
16172 /* Try to keep the number of instructions low. For all cases we will do at
16173 most two moves for the residual amount, since we'll always overlap the
16174 remainder. */
16175 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16176 return false;
16177
16178 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16179 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16180
16181 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16182 src = adjust_automodify_address (src, VOIDmode, base, 0);
16183
16184 /* Convert n to bits to make the rest of the code simpler. */
16185 n = n * BITS_PER_UNIT;
16186
16187 while (n > 0)
16188 {
 16189 /* Find the largest mode in which to do the copy without over-reading
 16190 or over-writing. */
16191 opt_scalar_int_mode mode_iter;
16192 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16193 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
16194 cur_mode = mode_iter.require ();
16195
16196 gcc_assert (cur_mode != BLKmode);
16197
16198 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16199 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16200
16201 n -= mode_bits;
16202
16203 /* Do certain trailing copies as overlapping if it's going to be
 16204 cheaper, i.e. fewer instructions to do so. For instance, for a 15
 16205 byte copy it is more efficient to do two overlapping 8 byte copies than
 16206 8 + 4 + 2 + 1. */
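       /* Illustrative trace (following the logic above and below): for a
	  15 byte copy, the first iteration copies 8 bytes; the remaining
	  7 bytes round up to the next 8 byte mode, so the pointers are
	  moved back by 1 byte and a second, overlapping 8 byte copy
	  finishes the job.  */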
16207 next_mode = smallest_mode_for_size (n, MODE_INT);
16208 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16209 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
16210 {
16211 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16212 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16213 n = n_bits;
16214 }
16215 }
16216
16217 return true;
16218 }
16219
16220 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16221 SImode stores. Handle the case when the constant has identical
16222 bottom and top halves. This is beneficial when the two stores can be
16223 merged into an STP and we avoid synthesising potentially expensive
16224 immediates twice. Return true if such a split is possible. */
16225
16226 bool
16227 aarch64_split_dimode_const_store (rtx dst, rtx src)
16228 {
16229 rtx lo = gen_lowpart (SImode, src);
16230 rtx hi = gen_highpart_mode (SImode, DImode, src);
16231
16232 bool size_p = optimize_function_for_size_p (cfun);
16233
16234 if (!rtx_equal_p (lo, hi))
16235 return false;
16236
16237 unsigned int orig_cost
16238 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16239 unsigned int lo_cost
16240 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16241
16242 /* We want to transform:
16243 MOV x1, 49370
16244 MOVK x1, 0x140, lsl 16
16245 MOVK x1, 0xc0da, lsl 32
16246 MOVK x1, 0x140, lsl 48
16247 STR x1, [x0]
16248 into:
16249 MOV w1, 49370
16250 MOVK w1, 0x140, lsl 16
16251 STP w1, w1, [x0]
16252 So we want to perform this only when we save two instructions
16253 or more. When optimizing for size, however, accept any code size
16254 savings we can. */
16255 if (size_p && orig_cost <= lo_cost)
16256 return false;
16257
16258 if (!size_p
16259 && (orig_cost <= lo_cost + 1))
16260 return false;
16261
16262 rtx mem_lo = adjust_address (dst, SImode, 0);
16263 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16264 return false;
16265
16266 rtx tmp_reg = gen_reg_rtx (SImode);
16267 aarch64_expand_mov_immediate (tmp_reg, lo);
16268 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16269 /* Don't emit an explicit store pair as this may not always be profitable.
16270 Let the sched-fusion logic decide whether to merge them. */
16271 emit_move_insn (mem_lo, tmp_reg);
16272 emit_move_insn (mem_hi, tmp_reg);
16273
16274 return true;
16275 }
16276
16277 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
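/* As a rough illustration (assuming the usual 1-to-8 shadow mapping used
   by AddressSanitizer): the shadow byte for an address A lives at
   (A >> 3) + (1 << 36), so address 0x1000 maps to shadow byte
   0x200 + (1 << 36).  */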
16278
16279 static unsigned HOST_WIDE_INT
16280 aarch64_asan_shadow_offset (void)
16281 {
16282 return (HOST_WIDE_INT_1 << 36);
16283 }
16284
16285 static rtx
16286 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16287 int code, tree treeop0, tree treeop1)
16288 {
16289 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16290 rtx op0, op1;
16291 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16292 insn_code icode;
16293 struct expand_operand ops[4];
16294
16295 start_sequence ();
16296 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16297
16298 op_mode = GET_MODE (op0);
16299 if (op_mode == VOIDmode)
16300 op_mode = GET_MODE (op1);
16301
16302 switch (op_mode)
16303 {
16304 case E_QImode:
16305 case E_HImode:
16306 case E_SImode:
16307 cmp_mode = SImode;
16308 icode = CODE_FOR_cmpsi;
16309 break;
16310
16311 case E_DImode:
16312 cmp_mode = DImode;
16313 icode = CODE_FOR_cmpdi;
16314 break;
16315
16316 case E_SFmode:
16317 cmp_mode = SFmode;
16318 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16319 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16320 break;
16321
16322 case E_DFmode:
16323 cmp_mode = DFmode;
16324 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16325 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16326 break;
16327
16328 default:
16329 end_sequence ();
16330 return NULL_RTX;
16331 }
16332
16333 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16334 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16335 if (!op0 || !op1)
16336 {
16337 end_sequence ();
16338 return NULL_RTX;
16339 }
16340 *prep_seq = get_insns ();
16341 end_sequence ();
16342
16343 create_fixed_operand (&ops[0], op0);
16344 create_fixed_operand (&ops[1], op1);
16345
16346 start_sequence ();
16347 if (!maybe_expand_insn (icode, 2, ops))
16348 {
16349 end_sequence ();
16350 return NULL_RTX;
16351 }
16352 *gen_seq = get_insns ();
16353 end_sequence ();
16354
16355 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16356 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16357 }
16358
16359 static rtx
16360 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16361 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16362 {
16363 rtx op0, op1, target;
16364 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16365 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16366 insn_code icode;
16367 struct expand_operand ops[6];
16368 int aarch64_cond;
16369
16370 push_to_sequence (*prep_seq);
16371 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16372
16373 op_mode = GET_MODE (op0);
16374 if (op_mode == VOIDmode)
16375 op_mode = GET_MODE (op1);
16376
16377 switch (op_mode)
16378 {
16379 case E_QImode:
16380 case E_HImode:
16381 case E_SImode:
16382 cmp_mode = SImode;
16383 icode = CODE_FOR_ccmpsi;
16384 break;
16385
16386 case E_DImode:
16387 cmp_mode = DImode;
16388 icode = CODE_FOR_ccmpdi;
16389 break;
16390
16391 case E_SFmode:
16392 cmp_mode = SFmode;
16393 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16394 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16395 break;
16396
16397 case E_DFmode:
16398 cmp_mode = DFmode;
16399 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16400 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16401 break;
16402
16403 default:
16404 end_sequence ();
16405 return NULL_RTX;
16406 }
16407
16408 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16409 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16410 if (!op0 || !op1)
16411 {
16412 end_sequence ();
16413 return NULL_RTX;
16414 }
16415 *prep_seq = get_insns ();
16416 end_sequence ();
16417
16418 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16419 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16420
16421 if (bit_code != AND)
16422 {
16423 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16424 GET_MODE (XEXP (prev, 0))),
16425 VOIDmode, XEXP (prev, 0), const0_rtx);
16426 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16427 }
16428
16429 create_fixed_operand (&ops[0], XEXP (prev, 0));
16430 create_fixed_operand (&ops[1], target);
16431 create_fixed_operand (&ops[2], op0);
16432 create_fixed_operand (&ops[3], op1);
16433 create_fixed_operand (&ops[4], prev);
16434 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16435
16436 push_to_sequence (*gen_seq);
16437 if (!maybe_expand_insn (icode, 6, ops))
16438 {
16439 end_sequence ();
16440 return NULL_RTX;
16441 }
16442
16443 *gen_seq = get_insns ();
16444 end_sequence ();
16445
16446 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16447 }
16448
16449 #undef TARGET_GEN_CCMP_FIRST
16450 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16451
16452 #undef TARGET_GEN_CCMP_NEXT
16453 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16454
16455 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16456 instruction fusion of some sort. */
16457
16458 static bool
16459 aarch64_macro_fusion_p (void)
16460 {
16461 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16462 }
16463
16464
16465 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16466 should be kept together during scheduling. */
16467
16468 static bool
16469 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16470 {
16471 rtx set_dest;
16472 rtx prev_set = single_set (prev);
16473 rtx curr_set = single_set (curr);
16474 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16475 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16476
16477 if (!aarch64_macro_fusion_p ())
16478 return false;
16479
16480 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16481 {
16482 /* We are trying to match:
16483 prev (mov) == (set (reg r0) (const_int imm16))
16484 curr (movk) == (set (zero_extract (reg r0)
16485 (const_int 16)
16486 (const_int 16))
16487 (const_int imm16_1)) */
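/* In assembly this corresponds to a pair such as (illustrative only):
     mov  w1, #0xc0da
     movk w1, #0x140, lsl 16  */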
16488
16489 set_dest = SET_DEST (curr_set);
16490
16491 if (GET_CODE (set_dest) == ZERO_EXTRACT
16492 && CONST_INT_P (SET_SRC (curr_set))
16493 && CONST_INT_P (SET_SRC (prev_set))
16494 && CONST_INT_P (XEXP (set_dest, 2))
16495 && INTVAL (XEXP (set_dest, 2)) == 16
16496 && REG_P (XEXP (set_dest, 0))
16497 && REG_P (SET_DEST (prev_set))
16498 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16499 {
16500 return true;
16501 }
16502 }
16503
16504 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16505 {
16506
16507 /* We're trying to match:
16508 prev (adrp) == (set (reg r1)
16509 (high (symbol_ref ("SYM"))))
16510 curr (add) == (set (reg r0)
16511 (lo_sum (reg r1)
16512 (symbol_ref ("SYM"))))
16513 Note that r0 need not necessarily be the same as r1, especially
16514 during pre-regalloc scheduling. */
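/* For example (illustrative only):
     adrp x1, sym
     add  x0, x1, :lo12:sym  */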
16515
16516 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16517 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16518 {
16519 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16520 && REG_P (XEXP (SET_SRC (curr_set), 0))
16521 && REGNO (XEXP (SET_SRC (curr_set), 0))
16522 == REGNO (SET_DEST (prev_set))
16523 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16524 XEXP (SET_SRC (curr_set), 1)))
16525 return true;
16526 }
16527 }
16528
16529 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16530 {
16531
16532 /* We're trying to match:
16533 prev (movk) == (set (zero_extract (reg r0)
16534 (const_int 16)
16535 (const_int 32))
16536 (const_int imm16_1))
16537 curr (movk) == (set (zero_extract (reg r0)
16538 (const_int 16)
16539 (const_int 48))
16540 (const_int imm16_2)) */
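/* I.e. a back-to-back pair such as (illustrative only):
     movk x0, #0x1234, lsl 32
     movk x0, #0x5678, lsl 48  */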
16541
16542 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16543 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16544 && REG_P (XEXP (SET_DEST (prev_set), 0))
16545 && REG_P (XEXP (SET_DEST (curr_set), 0))
16546 && REGNO (XEXP (SET_DEST (prev_set), 0))
16547 == REGNO (XEXP (SET_DEST (curr_set), 0))
16548 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16549 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16550 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16551 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16552 && CONST_INT_P (SET_SRC (prev_set))
16553 && CONST_INT_P (SET_SRC (curr_set)))
16554 return true;
16555
16556 }
16557 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16558 {
16559 /* We're trying to match:
16560 prev (adrp) == (set (reg r0)
16561 (high (symbol_ref ("SYM"))))
16562 curr (ldr) == (set (reg r1)
16563 (mem (lo_sum (reg r0)
16564 (symbol_ref ("SYM")))))
16565 or
16566 curr (ldr) == (set (reg r1)
16567 (zero_extend (mem
16568 (lo_sum (reg r0)
16569 (symbol_ref ("SYM")))))) */
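/* For example (illustrative only):
     adrp x0, sym
     ldr  w1, [x0, #:lo12:sym]  */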
16570 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16571 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16572 {
16573 rtx curr_src = SET_SRC (curr_set);
16574
16575 if (GET_CODE (curr_src) == ZERO_EXTEND)
16576 curr_src = XEXP (curr_src, 0);
16577
16578 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16579 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16580 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16581 == REGNO (SET_DEST (prev_set))
16582 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16583 XEXP (SET_SRC (prev_set), 0)))
16584 return true;
16585 }
16586 }
16587
16588 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16589 && aarch_crypto_can_dual_issue (prev, curr))
16590 return true;
16591
16592 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16593 && any_condjump_p (curr))
16594 {
16595 enum attr_type prev_type = get_attr_type (prev);
16596
16597 unsigned int condreg1, condreg2;
16598 rtx cc_reg_1;
16599 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16600 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16601
16602 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16603 && prev
16604 && modified_in_p (cc_reg_1, prev))
16605 {
16606 /* FIXME: this misses some instructions that ThunderX considers simple
16607 arithmetic; for example, simple shifts are missed here. */
16608 if (prev_type == TYPE_ALUS_SREG
16609 || prev_type == TYPE_ALUS_IMM
16610 || prev_type == TYPE_LOGICS_REG
16611 || prev_type == TYPE_LOGICS_IMM)
16612 return true;
16613 }
16614 }
16615
16616 if (prev_set
16617 && curr_set
16618 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16619 && any_condjump_p (curr))
16620 {
16621 /* We're trying to match:
16622 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16623 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16624 (const_int 0))
16625 (label_ref ("SYM"))
16626 (pc)) */
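/* For example (illustrative only):
     add  x0, x1, #8
     cbnz x0, .Ltarget  */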
16627 if (SET_DEST (curr_set) == (pc_rtx)
16628 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16629 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16630 && REG_P (SET_DEST (prev_set))
16631 && REGNO (SET_DEST (prev_set))
16632 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16633 {
16634 /* Fuse ALU operations followed by a conditional branch instruction. */
16635 switch (get_attr_type (prev))
16636 {
16637 case TYPE_ALU_IMM:
16638 case TYPE_ALU_SREG:
16639 case TYPE_ADC_REG:
16640 case TYPE_ADC_IMM:
16641 case TYPE_ADCS_REG:
16642 case TYPE_ADCS_IMM:
16643 case TYPE_LOGIC_REG:
16644 case TYPE_LOGIC_IMM:
16645 case TYPE_CSEL:
16646 case TYPE_ADR:
16647 case TYPE_MOV_IMM:
16648 case TYPE_SHIFT_REG:
16649 case TYPE_SHIFT_IMM:
16650 case TYPE_BFM:
16651 case TYPE_RBIT:
16652 case TYPE_REV:
16653 case TYPE_EXTEND:
16654 return true;
16655
16656 default:;
16657 }
16658 }
16659 }
16660
16661 return false;
16662 }
16663
16664 /* Return true iff the instruction fusion described by OP is enabled. */
16665
16666 bool
16667 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16668 {
16669 return (aarch64_tune_params.fusible_ops & op) != 0;
16670 }
16671
16672 /* If MEM's address is in the form of [base+offset], extract the two
16673 parts into BASE and OFFSET and return true; otherwise return false
16674 after clearing BASE and OFFSET. */
16675
16676 bool
16677 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16678 {
16679 rtx addr;
16680
16681 gcc_assert (MEM_P (mem));
16682
16683 addr = XEXP (mem, 0);
16684
16685 if (REG_P (addr))
16686 {
16687 *base = addr;
16688 *offset = const0_rtx;
16689 return true;
16690 }
16691
16692 if (GET_CODE (addr) == PLUS
16693 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16694 {
16695 *base = XEXP (addr, 0);
16696 *offset = XEXP (addr, 1);
16697 return true;
16698 }
16699
16700 *base = NULL_RTX;
16701 *offset = NULL_RTX;
16702
16703 return false;
16704 }
16705
16706 /* Types for scheduling fusion. */
16707 enum sched_fusion_type
16708 {
16709 SCHED_FUSION_NONE = 0,
16710 SCHED_FUSION_LD_SIGN_EXTEND,
16711 SCHED_FUSION_LD_ZERO_EXTEND,
16712 SCHED_FUSION_LD,
16713 SCHED_FUSION_ST,
16714 SCHED_FUSION_NUM
16715 };
16716
16717 /* If INSN is a load or store with an address in the form of [base+offset],
16718 extract the two parts into BASE and OFFSET. Return the scheduling
16719 fusion type of this INSN. */
16720
16721 static enum sched_fusion_type
16722 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16723 {
16724 rtx x, dest, src;
16725 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16726
16727 gcc_assert (INSN_P (insn));
16728 x = PATTERN (insn);
16729 if (GET_CODE (x) != SET)
16730 return SCHED_FUSION_NONE;
16731
16732 src = SET_SRC (x);
16733 dest = SET_DEST (x);
16734
16735 machine_mode dest_mode = GET_MODE (dest);
16736
16737 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16738 return SCHED_FUSION_NONE;
16739
16740 if (GET_CODE (src) == SIGN_EXTEND)
16741 {
16742 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16743 src = XEXP (src, 0);
16744 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16745 return SCHED_FUSION_NONE;
16746 }
16747 else if (GET_CODE (src) == ZERO_EXTEND)
16748 {
16749 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16750 src = XEXP (src, 0);
16751 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16752 return SCHED_FUSION_NONE;
16753 }
16754
16755 if (GET_CODE (src) == MEM && REG_P (dest))
16756 extract_base_offset_in_addr (src, base, offset);
16757 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16758 {
16759 fusion = SCHED_FUSION_ST;
16760 extract_base_offset_in_addr (dest, base, offset);
16761 }
16762 else
16763 return SCHED_FUSION_NONE;
16764
16765 if (*base == NULL_RTX || *offset == NULL_RTX)
16766 fusion = SCHED_FUSION_NONE;
16767
16768 return fusion;
16769 }
16770
16771 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16772
16773 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16774 and PRI are only calculated for these instructions. For other instructions,
16775 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16776 types of instruction fusion can be added by returning different priorities.
16777
16778 It's important that irrelevant instructions get the largest FUSION_PRI. */
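/* As a rough example: two loads from [x1, 8] and [x1, 12] get the same
   FUSION_PRI (it depends only on the fusion type and the base register),
   while PRI decreases as the offset grows, so the load with the smaller
   offset is scheduled first and the pair stays adjacent.  */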
16779
16780 static void
16781 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16782 int *fusion_pri, int *pri)
16783 {
16784 int tmp, off_val;
16785 rtx base, offset;
16786 enum sched_fusion_type fusion;
16787
16788 gcc_assert (INSN_P (insn));
16789
16790 tmp = max_pri - 1;
16791 fusion = fusion_load_store (insn, &base, &offset);
16792 if (fusion == SCHED_FUSION_NONE)
16793 {
16794 *pri = tmp;
16795 *fusion_pri = tmp;
16796 return;
16797 }
16798
16799 /* Set FUSION_PRI according to fusion type and base register. */
16800 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16801
16802 /* Calculate PRI. */
16803 tmp /= 2;
16804
16805 /* INSN with smaller offset goes first. */
16806 off_val = (int)(INTVAL (offset));
16807 if (off_val >= 0)
16808 tmp -= (off_val & 0xfffff);
16809 else
16810 tmp += ((- off_val) & 0xfffff);
16811
16812 *pri = tmp;
16813 return;
16814 }
16815
16816 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16817 Adjust priority of sha1h instructions so they are scheduled before
16818 other SHA1 instructions. */
16819
16820 static int
16821 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16822 {
16823 rtx x = PATTERN (insn);
16824
16825 if (GET_CODE (x) == SET)
16826 {
16827 x = SET_SRC (x);
16828
16829 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16830 return priority + 10;
16831 }
16832
16833 return priority;
16834 }
16835
16836 /* Given OPERANDS of consecutive load/store, check if we can merge
16837 them into ldp/stp. LOAD is true if they are load instructions.
16838 MODE is the mode of memory operands. */
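/* For instance (illustrative only), this check is what allows
     ldr w0, [x2]
     ldr w1, [x2, 4]
   to be merged into  ldp w0, w1, [x2]  by the ldp/stp peepholes.  */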
16839
16840 bool
16841 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16842 machine_mode mode)
16843 {
16844 HOST_WIDE_INT offval_1, offval_2, msize;
16845 enum reg_class rclass_1, rclass_2;
16846 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16847
16848 if (load)
16849 {
16850 mem_1 = operands[1];
16851 mem_2 = operands[3];
16852 reg_1 = operands[0];
16853 reg_2 = operands[2];
16854 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16855 if (REGNO (reg_1) == REGNO (reg_2))
16856 return false;
16857 }
16858 else
16859 {
16860 mem_1 = operands[0];
16861 mem_2 = operands[2];
16862 reg_1 = operands[1];
16863 reg_2 = operands[3];
16864 }
16865
16866 /* The mems cannot be volatile. */
16867 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16868 return false;
16869
16870 /* If we have SImode and slow unaligned ldp,
16871 check that the alignment is at least 8 bytes. */
16872 if (mode == SImode
16873 && (aarch64_tune_params.extra_tuning_flags
16874 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16875 && !optimize_size
16876 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16877 return false;
16878
16879 /* Check if the addresses are in the form of [base+offset]. */
16880 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16881 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16882 return false;
16883 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16884 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16885 return false;
16886
16887 /* Check if the bases are the same. */
16888 if (!rtx_equal_p (base_1, base_2))
16889 return false;
16890
16891 /* The operands must be of the same size. */
16892 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16893 GET_MODE_SIZE (GET_MODE (mem_2))));
16894
16895 offval_1 = INTVAL (offset_1);
16896 offval_2 = INTVAL (offset_2);
16897 /* We should only be trying this for fixed-sized modes. There is no
16898 SVE LDP/STP instruction. */
16899 msize = GET_MODE_SIZE (mode).to_constant ();
16900 /* Check if the offsets are consecutive. */
16901 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16902 return false;
16903
16904 /* Check if the addresses are clobbered by load. */
16905 if (load)
16906 {
16907 if (reg_mentioned_p (reg_1, mem_1))
16908 return false;
16909
16910 /* In increasing order, the last load can clobber the address. */
16911 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16912 return false;
16913 }
16914
16915 /* One of the memory accesses must be a mempair operand.
16916 If it is not the first one, they need to be swapped by the
16917 peephole. */
16918 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16919 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16920 return false;
16921
16922 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16923 rclass_1 = FP_REGS;
16924 else
16925 rclass_1 = GENERAL_REGS;
16926
16927 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16928 rclass_2 = FP_REGS;
16929 else
16930 rclass_2 = GENERAL_REGS;
16931
16932 /* Check if the registers are of the same class. */
16933 if (rclass_1 != rclass_2)
16934 return false;
16935
16936 return true;
16937 }
16938
16939 /* Given OPERANDS of consecutive load/store that can be merged,
16940 swap them if they are not in ascending order. */
16941 void
16942 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16943 {
16944 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16945 HOST_WIDE_INT offval_1, offval_2;
16946
16947 if (load)
16948 {
16949 mem_1 = operands[1];
16950 mem_2 = operands[3];
16951 }
16952 else
16953 {
16954 mem_1 = operands[0];
16955 mem_2 = operands[2];
16956 }
16957
16958 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16959 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16960
16961 offval_1 = INTVAL (offset_1);
16962 offval_2 = INTVAL (offset_2);
16963
16964 if (offval_1 > offval_2)
16965 {
16966 /* Irrespective of whether this is a load or a store,
16967 we do the same swap. */
16968 std::swap (operands[0], operands[2]);
16969 std::swap (operands[1], operands[3]);
16970 }
16971 }
16972
16973 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16974 comparison between the two. */
16975 int
16976 aarch64_host_wide_int_compare (const void *x, const void *y)
16977 {
16978 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16979 * ((const HOST_WIDE_INT *) y));
16980 }
16981
16982 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16983 other pointing to a REG rtx containing an offset, compare the offsets
16984 of the two pairs.
16985
16986 Return:
16987
16988 1 iff offset (X) > offset (Y)
16989 0 iff offset (X) == offset (Y)
16990 -1 iff offset (X) < offset (Y) */
16991 int
16992 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16993 {
16994 const rtx * operands_1 = (const rtx *) x;
16995 const rtx * operands_2 = (const rtx *) y;
16996 rtx mem_1, mem_2, base, offset_1, offset_2;
16997
16998 if (MEM_P (operands_1[0]))
16999 mem_1 = operands_1[0];
17000 else
17001 mem_1 = operands_1[1];
17002
17003 if (MEM_P (operands_2[0]))
17004 mem_2 = operands_2[0];
17005 else
17006 mem_2 = operands_2[1];
17007
17008 /* Extract the offsets. */
17009 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17010 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17011
17012 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17013
17014 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17015 }
17016
17017 /* Given OPERANDS of consecutive load/store, check if we can merge
17018 them into ldp/stp by adjusting the offset. LOAD is true if they
17019 are load instructions. MODE is the mode of memory operands.
17020
17021 Given the following consecutive stores:
17022
17023 str w1, [xb, 0x100]
17024 str w1, [xb, 0x104]
17025 str w1, [xb, 0x108]
17026 str w1, [xb, 0x10c]
17027
17028 Though the offsets are out of the range supported by stp, we can
17029 still pair them after adjusting the offset, like:
17030
17031 add scratch, xb, 0x100
17032 stp w1, w1, [scratch]
17033 stp w1, w1, [scratch, 0x8]
17034
17035 The peephole patterns detecting this opportunity should guarantee
17036 the scratch register is available. */
17037
17038 bool
17039 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17040 scalar_mode mode)
17041 {
17042 const int num_insns = 4;
17043 enum reg_class rclass;
17044 HOST_WIDE_INT offvals[num_insns], msize;
17045 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17046
17047 if (load)
17048 {
17049 for (int i = 0; i < num_insns; i++)
17050 {
17051 reg[i] = operands[2 * i];
17052 mem[i] = operands[2 * i + 1];
17053
17054 gcc_assert (REG_P (reg[i]));
17055 }
17056
17057 /* Do not attempt to merge the loads if the loads clobber each other. */
17058 for (int i = 0; i < 8; i += 2)
17059 for (int j = i + 2; j < 8; j += 2)
17060 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17061 return false;
17062 }
17063 else
17064 for (int i = 0; i < num_insns; i++)
17065 {
17066 mem[i] = operands[2 * i];
17067 reg[i] = operands[2 * i + 1];
17068 }
17069
17070 /* Skip if memory operand is by itself valid for ldp/stp. */
17071 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17072 return false;
17073
17074 for (int i = 0; i < num_insns; i++)
17075 {
17076 /* The mems cannot be volatile. */
17077 if (MEM_VOLATILE_P (mem[i]))
17078 return false;
17079
17080 /* Check if the addresses are in the form of [base+offset]. */
17081 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17082 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17083 return false;
17084 }
17085
17086 /* Check if addresses are clobbered by load. */
17087 if (load)
17088 for (int i = 0; i < num_insns; i++)
17089 if (reg_mentioned_p (reg[i], mem[i]))
17090 return false;
17091
17092 /* Check if the bases are the same. */
17093 for (int i = 0; i < num_insns - 1; i++)
17094 if (!rtx_equal_p (base[i], base[i + 1]))
17095 return false;
17096
17097 for (int i = 0; i < num_insns; i++)
17098 offvals[i] = INTVAL (offset[i]);
17099
17100 msize = GET_MODE_SIZE (mode);
17101
17102 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17103 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17104 aarch64_host_wide_int_compare);
17105
17106 if (!(offvals[1] == offvals[0] + msize
17107 && offvals[3] == offvals[2] + msize))
17108 return false;
17109
17110 /* Check that the offsets are within range of each other. The ldp/stp
17111 instructions have 7-bit signed scaled immediate offsets, so use 0x80. */
17112 if (offvals[2] - offvals[0] >= msize * 0x80)
17113 return false;
17114
17115 /* The offsets must be aligned with respect to each other. */
17116 if (offvals[0] % msize != offvals[2] % msize)
17117 return false;
17118
17119 /* If we have SImode and slow unaligned ldp,
17120 check that the alignment is at least 8 bytes. */
17121 if (mode == SImode
17122 && (aarch64_tune_params.extra_tuning_flags
17123 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17124 && !optimize_size
17125 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17126 return false;
17127
17128 /* Check if the registers are of the same class. */
17129 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17130 ? FP_REGS : GENERAL_REGS;
17131
17132 for (int i = 1; i < num_insns; i++)
17133 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17134 {
17135 if (rclass != FP_REGS)
17136 return false;
17137 }
17138 else
17139 {
17140 if (rclass != GENERAL_REGS)
17141 return false;
17142 }
17143
17144 return true;
17145 }
17146
17147 /* Given OPERANDS of consecutive load/store, this function pairs them
17148 into LDP/STP after adjusting the offset. It depends on the fact
17149 that the operands can be sorted so the offsets are correct for STP.
17150 MODE is the mode of the memory operands. CODE is the rtl operator
17151 which should be applied to all memory operands; it is SIGN_EXTEND,
17152 ZERO_EXTEND or UNKNOWN. */
17153
17154 bool
17155 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17156 scalar_mode mode, RTX_CODE code)
17157 {
17158 rtx base, offset_1, offset_3, t1, t2;
17159 rtx mem_1, mem_2, mem_3, mem_4;
17160 rtx temp_operands[8];
17161 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17162 stp_off_upper_limit, stp_off_lower_limit, msize;
17163
17164 /* We make changes on a copy as we may still bail out. */
17165 for (int i = 0; i < 8; i ++)
17166 temp_operands[i] = operands[i];
17167
17168 /* Sort the operands. */
17169 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17170
17171 if (load)
17172 {
17173 mem_1 = temp_operands[1];
17174 mem_2 = temp_operands[3];
17175 mem_3 = temp_operands[5];
17176 mem_4 = temp_operands[7];
17177 }
17178 else
17179 {
17180 mem_1 = temp_operands[0];
17181 mem_2 = temp_operands[2];
17182 mem_3 = temp_operands[4];
17183 mem_4 = temp_operands[6];
17184 gcc_assert (code == UNKNOWN);
17185 }
17186
17187 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17188 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17189 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17190 && offset_3 != NULL_RTX);
17191
17192 /* Adjust offset so it can fit in LDP/STP instruction. */
17193 msize = GET_MODE_SIZE (mode);
17194 stp_off_upper_limit = msize * (0x40 - 1);
17195 stp_off_lower_limit = - msize * 0x40;
17196
17197 off_val_1 = INTVAL (offset_1);
17198 off_val_3 = INTVAL (offset_3);
17199
17200 /* The base offset is optimally half way between the two STP/LDP offsets. */
17201 if (msize <= 4)
17202 base_off = (off_val_1 + off_val_3) / 2;
17203 else
17204 /* However, due to issues with negative LDP/STP offset generation for
17205 larger modes (DF, DI and vector modes), we must not use negative
17206 addresses smaller than 9 signed unadjusted bits can store. This
17207 provides the most range in this case. */
17208 base_off = off_val_1;
17209
17210 /* Adjust the base so that it is aligned with the addresses but still
17211 optimal. */
17212 if (base_off % msize != off_val_1 % msize)
17213 /* Fix the offset, bearing in mind we want to make it bigger not
17214 smaller. */
17215 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17216 else if (msize <= 4)
17217 /* The negative range of LDP/STP is one larger than the positive range. */
17218 base_off += msize;
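/* For illustration: with msize == 4 and LDP/STP offsets of 0x100 and
   0x108, base_off starts at 0x104 and the adjustment above moves it to
   0x108, giving new offsets of -8 and 0, both comfortably in range.  */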
17219
17220 /* Check if base offset is too big or too small. We can attempt to resolve
17221 this issue by setting it to the maximum value and seeing if the offsets
17222 still fit. */
17223 if (base_off >= 0x1000)
17224 {
17225 base_off = 0x1000 - 1;
17226 /* We must still make sure that the base offset is aligned with respect
17227 to the address. But it may not be made any bigger. */
17228 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17229 }
17230
17231 /* Likewise for the case where the base is too small. */
17232 if (base_off <= -0x1000)
17233 {
17234 base_off = -0x1000 + 1;
17235 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17236 }
17237
17238 /* Offset of the first STP/LDP. */
17239 new_off_1 = off_val_1 - base_off;
17240
17241 /* Offset of the second STP/LDP. */
17242 new_off_3 = off_val_3 - base_off;
17243
17244 /* The offsets must be within the range of the LDP/STP instructions. */
17245 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17246 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17247 return false;
17248
17249 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17250 new_off_1), true);
17251 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17252 new_off_1 + msize), true);
17253 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17254 new_off_3), true);
17255 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17256 new_off_3 + msize), true);
17257
17258 if (!aarch64_mem_pair_operand (mem_1, mode)
17259 || !aarch64_mem_pair_operand (mem_3, mode))
17260 return false;
17261
17262 if (code == ZERO_EXTEND)
17263 {
17264 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17265 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17266 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17267 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17268 }
17269 else if (code == SIGN_EXTEND)
17270 {
17271 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17272 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17273 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17274 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17275 }
17276
17277 if (load)
17278 {
17279 operands[0] = temp_operands[0];
17280 operands[1] = mem_1;
17281 operands[2] = temp_operands[2];
17282 operands[3] = mem_2;
17283 operands[4] = temp_operands[4];
17284 operands[5] = mem_3;
17285 operands[6] = temp_operands[6];
17286 operands[7] = mem_4;
17287 }
17288 else
17289 {
17290 operands[0] = mem_1;
17291 operands[1] = temp_operands[1];
17292 operands[2] = mem_2;
17293 operands[3] = temp_operands[3];
17294 operands[4] = mem_3;
17295 operands[5] = temp_operands[5];
17296 operands[6] = mem_4;
17297 operands[7] = temp_operands[7];
17298 }
17299
17300 /* Emit adjusting instruction. */
17301 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17302 /* Emit ldp/stp instructions. */
17303 t1 = gen_rtx_SET (operands[0], operands[1]);
17304 t2 = gen_rtx_SET (operands[2], operands[3]);
17305 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17306 t1 = gen_rtx_SET (operands[4], operands[5]);
17307 t2 = gen_rtx_SET (operands[6], operands[7]);
17308 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17309 return true;
17310 }
17311
17312 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17313 it isn't worth branching around empty masked ops (including masked
17314 stores). */
17315
17316 static bool
17317 aarch64_empty_mask_is_expensive (unsigned)
17318 {
17319 return false;
17320 }
17321
17322 /* Return true if a pseudo register should be created and used to hold
17323 the GOT address for PIC code. */
17324
17325 bool
17326 aarch64_use_pseudo_pic_reg (void)
17327 {
17328 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17329 }
17330
17331 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17332
17333 static int
17334 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17335 {
17336 switch (XINT (x, 1))
17337 {
17338 case UNSPEC_GOTSMALLPIC:
17339 case UNSPEC_GOTSMALLPIC28K:
17340 case UNSPEC_GOTTINYPIC:
17341 return 0;
17342 default:
17343 break;
17344 }
17345
17346 return default_unspec_may_trap_p (x, flags);
17347 }
17348
17349
17350 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17351 return the log2 of that value. Otherwise return -1. */
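/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.5
   (not an integer) and 3.0 (not a power of 2) both yield -1.  */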
17352
17353 int
17354 aarch64_fpconst_pow_of_2 (rtx x)
17355 {
17356 const REAL_VALUE_TYPE *r;
17357
17358 if (!CONST_DOUBLE_P (x))
17359 return -1;
17360
17361 r = CONST_DOUBLE_REAL_VALUE (x);
17362
17363 if (REAL_VALUE_NEGATIVE (*r)
17364 || REAL_VALUE_ISNAN (*r)
17365 || REAL_VALUE_ISINF (*r)
17366 || !real_isinteger (r, DFmode))
17367 return -1;
17368
17369 return exact_log2 (real_to_integer (r));
17370 }
17371
17372 /* If X is a vector of equal CONST_DOUBLE values and that value is
17373 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17374
17375 int
17376 aarch64_vec_fpconst_pow_of_2 (rtx x)
17377 {
17378 int nelts;
17379 if (GET_CODE (x) != CONST_VECTOR
17380 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17381 return -1;
17382
17383 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17384 return -1;
17385
17386 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17387 if (firstval <= 0)
17388 return -1;
17389
17390 for (int i = 1; i < nelts; i++)
17391 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17392 return -1;
17393
17394 return firstval;
17395 }
17396
17397 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17398 to float.
17399
17400 __fp16 always promotes through this hook.
17401 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17402 through the generic excess precision logic rather than here. */
17403
17404 static tree
17405 aarch64_promoted_type (const_tree t)
17406 {
17407 if (SCALAR_FLOAT_TYPE_P (t)
17408 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17409 return float_type_node;
17410
17411 return NULL_TREE;
17412 }
17413
17414 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17415
17416 static bool
17417 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17418 optimization_type opt_type)
17419 {
17420 switch (op)
17421 {
17422 case rsqrt_optab:
17423 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17424
17425 default:
17426 return true;
17427 }
17428 }
17429
17430 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17431
17432 static unsigned int
17433 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17434 int *offset)
17435 {
17436 /* Polynomial invariant 1 == (VG / 2) - 1. */
17437 gcc_assert (i == 1);
17438 *factor = 2;
17439 *offset = 1;
17440 return AARCH64_DWARF_VG;
17441 }
17442
17443 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17444 if MODE is HFmode, and punt to the generic implementation otherwise. */
17445
17446 static bool
17447 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17448 {
17449 return (mode == HFmode
17450 ? true
17451 : default_libgcc_floating_mode_supported_p (mode));
17452 }
17453
17454 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17455 if MODE is HFmode, and punt to the generic implementation otherwise. */
17456
17457 static bool
17458 aarch64_scalar_mode_supported_p (scalar_mode mode)
17459 {
17460 return (mode == HFmode
17461 ? true
17462 : default_scalar_mode_supported_p (mode));
17463 }
17464
17465 /* Set the value of FLT_EVAL_METHOD.
17466 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17467
17468 0: evaluate all operations and constants, whose semantic type has at
17469 most the range and precision of type float, to the range and
17470 precision of float; evaluate all other operations and constants to
17471 the range and precision of the semantic type;
17472
17473 N, where _FloatN is a supported interchange floating type:
17474 evaluate all operations and constants, whose semantic type has at
17475 most the range and precision of _FloatN type, to the range and
17476 precision of the _FloatN type; evaluate all other operations and
17477 constants to the range and precision of the semantic type;
17478
17479 If we have the ARMv8.2-A extensions then we support _Float16 in native
17480 precision, so we should set this to 16. Otherwise, we support the type,
17481 but want to evaluate expressions in float precision, so set this to
17482 0. */
17483
17484 static enum flt_eval_method
17485 aarch64_excess_precision (enum excess_precision_type type)
17486 {
17487 switch (type)
17488 {
17489 case EXCESS_PRECISION_TYPE_FAST:
17490 case EXCESS_PRECISION_TYPE_STANDARD:
17491 /* We can calculate either in 16-bit range and precision or
17492 32-bit range and precision. Make that decision based on whether
17493 we have native support for the ARMv8.2-A 16-bit floating-point
17494 instructions or not. */
17495 return (TARGET_FP_F16INST
17496 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17497 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17498 case EXCESS_PRECISION_TYPE_IMPLICIT:
17499 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17500 default:
17501 gcc_unreachable ();
17502 }
17503 return FLT_EVAL_METHOD_UNPREDICTABLE;
17504 }
17505
17506 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17507 scheduled for speculative execution. Reject the long-running division
17508 and square-root instructions. */
17509
17510 static bool
17511 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17512 {
17513 switch (get_attr_type (insn))
17514 {
17515 case TYPE_SDIV:
17516 case TYPE_UDIV:
17517 case TYPE_FDIVS:
17518 case TYPE_FDIVD:
17519 case TYPE_FSQRTS:
17520 case TYPE_FSQRTD:
17521 case TYPE_NEON_FP_SQRT_S:
17522 case TYPE_NEON_FP_SQRT_D:
17523 case TYPE_NEON_FP_SQRT_S_Q:
17524 case TYPE_NEON_FP_SQRT_D_Q:
17525 case TYPE_NEON_FP_DIV_S:
17526 case TYPE_NEON_FP_DIV_D:
17527 case TYPE_NEON_FP_DIV_S_Q:
17528 case TYPE_NEON_FP_DIV_D_Q:
17529 return false;
17530 default:
17531 return true;
17532 }
17533 }
17534
17535 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17536
17537 static int
17538 aarch64_compute_pressure_classes (reg_class *classes)
17539 {
17540 int i = 0;
17541 classes[i++] = GENERAL_REGS;
17542 classes[i++] = FP_REGS;
17543 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17544 registers need to go in PR_LO_REGS at some point during their
17545 lifetime. Splitting it into two halves has the effect of making
17546 all predicates count against PR_LO_REGS, so that we try whenever
17547 possible to restrict the number of live predicates to 8. This
17548 greatly reduces the amount of spilling in certain loops. */
17549 classes[i++] = PR_LO_REGS;
17550 classes[i++] = PR_HI_REGS;
17551 return i;
17552 }
17553
17554 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17555
17556 static bool
17557 aarch64_can_change_mode_class (machine_mode from,
17558 machine_mode to, reg_class_t)
17559 {
17560 if (BYTES_BIG_ENDIAN)
17561 {
17562 bool from_sve_p = aarch64_sve_data_mode_p (from);
17563 bool to_sve_p = aarch64_sve_data_mode_p (to);
17564
17565 /* Don't allow changes between SVE data modes and non-SVE modes.
17566 See the comment at the head of aarch64-sve.md for details. */
17567 if (from_sve_p != to_sve_p)
17568 return false;
17569
17570 /* Don't allow changes in element size: lane 0 of the new vector
17571 would not then be lane 0 of the old vector. See the comment
17572 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17573 description.
17574
17575 In the worst case, this forces a register to be spilled in
17576 one mode and reloaded in the other, which handles the
17577 endianness correctly. */
17578 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17579 return false;
17580 }
17581 return true;
17582 }
17583
17584 /* Implement TARGET_EARLY_REMAT_MODES. */
17585
17586 static void
17587 aarch64_select_early_remat_modes (sbitmap modes)
17588 {
17589 /* SVE values are not normally live across a call, so it should be
17590 worth doing early rematerialization even in VL-specific mode. */
17591 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17592 {
17593 machine_mode mode = (machine_mode) i;
17594 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17595 if (vec_flags & VEC_ANY_SVE)
17596 bitmap_set_bit (modes, i);
17597 }
17598 }
17599
17600 /* Target-specific selftests. */
17601
17602 #if CHECKING_P
17603
17604 namespace selftest {
17605
17606 /* Selftest for the RTL loader.
17607 Verify that the RTL loader copes with a dump from
17608 print_rtx_function. This is essentially just a test that class
17609 function_reader can handle a real dump, but it also verifies
17610 that lookup_reg_by_dump_name correctly handles hard regs.
17611 The presence of hard reg names in the dump means that the test is
17612 target-specific, hence it is in this file. */
17613
17614 static void
17615 aarch64_test_loading_full_dump ()
17616 {
17617 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17618
17619 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17620
17621 rtx_insn *insn_1 = get_insn_by_uid (1);
17622 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17623
17624 rtx_insn *insn_15 = get_insn_by_uid (15);
17625 ASSERT_EQ (INSN, GET_CODE (insn_15));
17626 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17627
17628 /* Verify crtl->return_rtx. */
17629 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17630 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17631 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17632 }
17633
17634 /* Run all target-specific selftests. */
17635
17636 static void
17637 aarch64_run_selftests (void)
17638 {
17639 aarch64_test_loading_full_dump ();
17640 }
17641
17642 } // namespace selftest
17643
17644 #endif /* #if CHECKING_P */
17645
17646 #undef TARGET_ADDRESS_COST
17647 #define TARGET_ADDRESS_COST aarch64_address_cost
17648
17649 /* This hook determines whether unnamed bitfields affect the alignment
17650 of the containing structure. The hook returns true if the structure
17651 should inherit the alignment requirements of an unnamed bitfield's
17652 type. */
17653 #undef TARGET_ALIGN_ANON_BITFIELD
17654 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17655
17656 #undef TARGET_ASM_ALIGNED_DI_OP
17657 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17658
17659 #undef TARGET_ASM_ALIGNED_HI_OP
17660 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17661
17662 #undef TARGET_ASM_ALIGNED_SI_OP
17663 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17664
17665 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17666 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17667 hook_bool_const_tree_hwi_hwi_const_tree_true
17668
17669 #undef TARGET_ASM_FILE_START
17670 #define TARGET_ASM_FILE_START aarch64_start_file
17671
17672 #undef TARGET_ASM_OUTPUT_MI_THUNK
17673 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17674
17675 #undef TARGET_ASM_SELECT_RTX_SECTION
17676 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17677
17678 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17679 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17680
17681 #undef TARGET_BUILD_BUILTIN_VA_LIST
17682 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17683
17684 #undef TARGET_CALLEE_COPIES
17685 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17686
17687 #undef TARGET_CAN_ELIMINATE
17688 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17689
17690 #undef TARGET_CAN_INLINE_P
17691 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17692
17693 #undef TARGET_CANNOT_FORCE_CONST_MEM
17694 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17695
17696 #undef TARGET_CASE_VALUES_THRESHOLD
17697 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17698
17699 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17700 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17701
17702 /* Only the least significant bit is used for initialization guard
17703 variables. */
17704 #undef TARGET_CXX_GUARD_MASK_BIT
17705 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17706
17707 #undef TARGET_C_MODE_FOR_SUFFIX
17708 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17709
17710 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17711 #undef TARGET_DEFAULT_TARGET_FLAGS
17712 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17713 #endif
17714
17715 #undef TARGET_CLASS_MAX_NREGS
17716 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17717
17718 #undef TARGET_BUILTIN_DECL
17719 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17720
17721 #undef TARGET_BUILTIN_RECIPROCAL
17722 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17723
17724 #undef TARGET_C_EXCESS_PRECISION
17725 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17726
17727 #undef TARGET_EXPAND_BUILTIN
17728 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17729
17730 #undef TARGET_EXPAND_BUILTIN_VA_START
17731 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17732
17733 #undef TARGET_FOLD_BUILTIN
17734 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17735
17736 #undef TARGET_FUNCTION_ARG
17737 #define TARGET_FUNCTION_ARG aarch64_function_arg
17738
17739 #undef TARGET_FUNCTION_ARG_ADVANCE
17740 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17741
17742 #undef TARGET_FUNCTION_ARG_BOUNDARY
17743 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17744
17745 #undef TARGET_FUNCTION_ARG_PADDING
17746 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17747
17748 #undef TARGET_GET_RAW_RESULT_MODE
17749 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17750 #undef TARGET_GET_RAW_ARG_MODE
17751 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17752
17753 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17754 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17755
17756 #undef TARGET_FUNCTION_VALUE
17757 #define TARGET_FUNCTION_VALUE aarch64_function_value
17758
17759 #undef TARGET_FUNCTION_VALUE_REGNO_P
17760 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17761
17762 #undef TARGET_GIMPLE_FOLD_BUILTIN
17763 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17764
17765 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17766 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17767
17768 #undef TARGET_INIT_BUILTINS
17769 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17770
17771 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17772 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17773 aarch64_ira_change_pseudo_allocno_class
17774
17775 #undef TARGET_LEGITIMATE_ADDRESS_P
17776 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17777
17778 #undef TARGET_LEGITIMATE_CONSTANT_P
17779 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17780
17781 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17782 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17783 aarch64_legitimize_address_displacement
17784
17785 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17786 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17787
17788 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17789 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17790 aarch64_libgcc_floating_mode_supported_p
17791
17792 #undef TARGET_MANGLE_TYPE
17793 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17794
17795 #undef TARGET_MEMORY_MOVE_COST
17796 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17797
17798 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17799 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17800
17801 #undef TARGET_MUST_PASS_IN_STACK
17802 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17803
17804 /* This target hook should return true if accesses to volatile bitfields
17805 should use the narrowest mode possible. It should return false if these
17806 accesses should use the bitfield container type. */
17807 #undef TARGET_NARROW_VOLATILE_BITFIELD
17808 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17809
17810 #undef TARGET_OPTION_OVERRIDE
17811 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17812
17813 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17814 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17815 aarch64_override_options_after_change
17816
17817 #undef TARGET_OPTION_SAVE
17818 #define TARGET_OPTION_SAVE aarch64_option_save
17819
17820 #undef TARGET_OPTION_RESTORE
17821 #define TARGET_OPTION_RESTORE aarch64_option_restore
17822
17823 #undef TARGET_OPTION_PRINT
17824 #define TARGET_OPTION_PRINT aarch64_option_print
17825
17826 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17827 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17828
17829 #undef TARGET_SET_CURRENT_FUNCTION
17830 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17831
17832 #undef TARGET_PASS_BY_REFERENCE
17833 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17834
17835 #undef TARGET_PREFERRED_RELOAD_CLASS
17836 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17837
17838 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17839 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17840
17841 #undef TARGET_PROMOTED_TYPE
17842 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17843
17844 #undef TARGET_SECONDARY_RELOAD
17845 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17846
17847 #undef TARGET_SHIFT_TRUNCATION_MASK
17848 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17849
17850 #undef TARGET_SETUP_INCOMING_VARARGS
17851 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17852
17853 #undef TARGET_STRUCT_VALUE_RTX
17854 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17855
17856 #undef TARGET_REGISTER_MOVE_COST
17857 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17858
17859 #undef TARGET_RETURN_IN_MEMORY
17860 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17861
17862 #undef TARGET_RETURN_IN_MSB
17863 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17864
17865 #undef TARGET_RTX_COSTS
17866 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17867
17868 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17869 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17870
17871 #undef TARGET_SCHED_ISSUE_RATE
17872 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17873
17874 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17875 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17876 aarch64_sched_first_cycle_multipass_dfa_lookahead
17877
17878 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17879 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17880 aarch64_first_cycle_multipass_dfa_lookahead_guard
17881
17882 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17883 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17884 aarch64_get_separate_components
17885
17886 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17887 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17888 aarch64_components_for_bb
17889
17890 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17891 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17892 aarch64_disqualify_components
17893
17894 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17895 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17896 aarch64_emit_prologue_components
17897
17898 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17899 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17900 aarch64_emit_epilogue_components
17901
17902 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17903 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17904 aarch64_set_handled_components
17905
17906 #undef TARGET_TRAMPOLINE_INIT
17907 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17908
17909 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17910 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17911
17912 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17913 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17914
17915 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17916 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17917 aarch64_builtin_support_vector_misalignment
17918
17919 #undef TARGET_ARRAY_MODE
17920 #define TARGET_ARRAY_MODE aarch64_array_mode
17921
17922 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17923 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17924
17925 #undef TARGET_VECTORIZE_ADD_STMT_COST
17926 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17927
17928 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17929 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17930 aarch64_builtin_vectorization_cost
17931
17932 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17933 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17934
17935 #undef TARGET_VECTORIZE_BUILTINS
17936 #define TARGET_VECTORIZE_BUILTINS
17937
17938 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17939 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17940 aarch64_builtin_vectorized_function
17941
17942 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17943 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17944 aarch64_autovectorize_vector_sizes
17945
17946 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17947 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17948 aarch64_atomic_assign_expand_fenv
17949
17950 /* Section anchor support. */
17951
17952 #undef TARGET_MIN_ANCHOR_OFFSET
17953 #define TARGET_MIN_ANCHOR_OFFSET -256
17954
17955 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17956 byte offset; we can do much more for larger data types, but have no way
17957 to determine the size of the access. We assume accesses are aligned. */
17958 #undef TARGET_MAX_ANCHOR_OFFSET
17959 #define TARGET_MAX_ANCHOR_OFFSET 4095
17960
17961 #undef TARGET_VECTOR_ALIGNMENT
17962 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17963
17964 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17965 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17966 aarch64_vectorize_preferred_vector_alignment
17967 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17968 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17969 aarch64_simd_vector_alignment_reachable
17970
17971 /* vec_perm support. */
17972
17973 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17974 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17975 aarch64_vectorize_vec_perm_const
17976
17977 #undef TARGET_VECTORIZE_GET_MASK_MODE
17978 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17979 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17980 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17981 aarch64_empty_mask_is_expensive
17982 #undef TARGET_PREFERRED_ELSE_VALUE
17983 #define TARGET_PREFERRED_ELSE_VALUE \
17984 aarch64_preferred_else_value
17985
17986 #undef TARGET_INIT_LIBFUNCS
17987 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17988
17989 #undef TARGET_FIXED_CONDITION_CODE_REGS
17990 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17991
17992 #undef TARGET_FLAGS_REGNUM
17993 #define TARGET_FLAGS_REGNUM CC_REGNUM
17994
17995 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17996 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17997
17998 #undef TARGET_ASAN_SHADOW_OFFSET
17999 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18000
18001 #undef TARGET_LEGITIMIZE_ADDRESS
18002 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18003
18004 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18005 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18006
18007 #undef TARGET_CAN_USE_DOLOOP_P
18008 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18009
18010 #undef TARGET_SCHED_ADJUST_PRIORITY
18011 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18012
18013 #undef TARGET_SCHED_MACRO_FUSION_P
18014 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18015
18016 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18017 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18018
18019 #undef TARGET_SCHED_FUSION_PRIORITY
18020 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18021
18022 #undef TARGET_UNSPEC_MAY_TRAP_P
18023 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18024
18025 #undef TARGET_USE_PSEUDO_PIC_REG
18026 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18027
18028 #undef TARGET_PRINT_OPERAND
18029 #define TARGET_PRINT_OPERAND aarch64_print_operand
18030
18031 #undef TARGET_PRINT_OPERAND_ADDRESS
18032 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18033
18034 #undef TARGET_OPTAB_SUPPORTED_P
18035 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18036
18037 #undef TARGET_OMIT_STRUCT_RETURN_REG
18038 #define TARGET_OMIT_STRUCT_RETURN_REG true
18039
18040 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18041 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18042 aarch64_dwarf_poly_indeterminate_value
18043
18044 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18045 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18046 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18047
18048 #undef TARGET_HARD_REGNO_NREGS
18049 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18050 #undef TARGET_HARD_REGNO_MODE_OK
18051 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18052
18053 #undef TARGET_MODES_TIEABLE_P
18054 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18055
18056 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18057 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18058 aarch64_hard_regno_call_part_clobbered
18059
18060 #undef TARGET_CONSTANT_ALIGNMENT
18061 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18062
18063 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18064 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18065
18066 #undef TARGET_CAN_CHANGE_MODE_CLASS
18067 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18068
18069 #undef TARGET_SELECT_EARLY_REMAT_MODES
18070 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18071
18072 #if CHECKING_P
18073 #undef TARGET_RUN_TARGET_SELFTESTS
18074 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18075 #endif /* #if CHECKING_P */
18076
18077 struct gcc_target targetm = TARGET_INITIALIZER;
18078
18079 #include "gt-aarch64.h"