[AArch64] Use calls for SVE TLSDESC
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
87 {
88 enum insn_type { MOV, MVN, INDEX, PTRUE };
89 enum modifier_type { LSL, MSL };
90
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode, rtx);
93 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
94 insn_type = MOV, modifier_type = LSL,
95 unsigned int = 0);
96 simd_immediate_info (scalar_mode, rtx, rtx);
97 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98
99 /* The mode of the elements. */
100 scalar_mode elt_mode;
101
102 /* The instruction to use to move the immediate into a vector. */
103 insn_type insn;
104
105 union
106 {
107 /* For MOV and MVN. */
108 struct
109 {
110 /* The value of each element. */
111 rtx value;
112
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier;
116 unsigned int shift;
117 } mov;
118
119 /* For INDEX. */
120 struct
121 {
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
124 rtx base, step;
125 } index;
126
127 /* For PTRUE. */
128 aarch64_svpattern pattern;
129 } u;
130 };
131
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
136 : elt_mode (elt_mode_in), insn (MOV)
137 {
138 u.mov.value = value_in;
139 u.mov.modifier = LSL;
140 u.mov.shift = 0;
141 }
142
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
145 fields. */
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in,
148 unsigned HOST_WIDE_INT value_in,
149 insn_type insn_in, modifier_type modifier_in,
150 unsigned int shift_in)
151 : elt_mode (elt_mode_in), insn (insn_in)
152 {
153 u.mov.value = gen_int_mode (value_in, elt_mode_in);
154 u.mov.modifier = modifier_in;
155 u.mov.shift = shift_in;
156 }
157
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
162 : elt_mode (elt_mode_in), insn (INDEX)
163 {
164 u.index.base = base_in;
165 u.index.step = step_in;
166 }
167
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in,
172 aarch64_svpattern pattern_in)
173 : elt_mode (elt_mode_in), insn (PTRUE)
174 {
175 u.pattern = pattern_in;
176 }
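/* Illustrative only (the operand values here are made up, not taken from a
   real caller): a call such as

     simd_immediate_info (HImode, 0x55, simd_immediate_info::MVN,
			  simd_immediate_info::LSL, 8)

   would describe an MVNI-style immediate of 0x55 shifted left by 8 for
   16-bit elements, while the (base, step) constructor above describes an
   SVE INDEX immediate.  */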
177
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel;
180
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg;
183
184 #ifdef HAVE_AS_TLS
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
187 #endif
188
189 static bool aarch64_composite_type_p (const_tree, machine_mode);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
191 const_tree,
192 machine_mode *, int *,
193 bool *);
194 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode);
198 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
200 const_tree type,
201 int misalignment,
202 bool is_packed);
203 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
204 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
205 aarch64_addr_query_type);
206 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version;
210
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune = cortexa53;
213
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags = 0;
216
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads;
219
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer;
222
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string = NULL;
225
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
228
229 /* Support for command line parsing of boolean flags in the tuning
230 structures. */
231 struct aarch64_flag_desc
232 {
233 const char* name;
234 unsigned int flag;
235 };
236
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 {
241 { "none", AARCH64_FUSE_NOTHING },
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL },
244 { NULL, AARCH64_FUSE_NOTHING }
245 };
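/* A sketch of how the table above is built: an entry such as
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) in aarch64-fusion-pairs.def
   expands through the macro above into { "mov+movk", AARCH64_FUSE_MOV_MOVK },
   giving the name-to-flag mapping used when parsing -moverride=fuse=...
   strings.  */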
246
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 {
251 { "none", AARCH64_EXTRA_TUNE_NONE },
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL },
254 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 };
256
257 /* Tuning parameters. */
258
259 static const struct cpu_addrcost_table generic_addrcost_table =
260 {
261 {
262 1, /* hi */
263 0, /* si */
264 0, /* di */
265 1, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
272 0 /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 {
277 {
278 0, /* hi */
279 0, /* si */
280 0, /* di */
281 2, /* ti */
282 },
283 0, /* pre_modify */
284 0, /* post_modify */
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 {
293 {
294 1, /* hi */
295 0, /* si */
296 0, /* di */
297 1, /* ti */
298 },
299 1, /* pre_modify */
300 1, /* post_modify */
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 {
309 {
310 1, /* hi */
311 1, /* si */
312 1, /* di */
313 2, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 {
325 {
326 1, /* hi */
327 0, /* si */
328 0, /* di */
329 1, /* ti */
330 },
331 0, /* pre_modify */
332 0, /* post_modify */
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
336 0, /* imm_offset */
337 };
338
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 {
341 {
342 1, /* hi */
343 1, /* si */
344 1, /* di */
345 2, /* ti */
346 },
347 1, /* pre_modify */
348 1, /* post_modify */
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
352 2, /* imm_offset */
353 };
354
355 static const struct cpu_regmove_cost generic_regmove_cost =
356 {
357 1, /* GP2GP */
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
360 5, /* GP2FP */
361 5, /* FP2GP */
362 2 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 5, /* GP2FP */
371 5, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 {
377 1, /* GP2GP */
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
380 5, /* GP2FP */
381 5, /* FP2GP */
382 2 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (the actual costs are 4 and 9). */
390 9, /* GP2FP */
391 9, /* FP2GP */
392 1 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 {
397 2, /* GP2GP */
398 2, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 2 /* FP2FP */
411 };
412
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 {
415 2, /* GP2GP */
416 /* Avoid the use of int<->fp moves for spilling. */
417 6, /* GP2FP */
418 6, /* FP2GP */
419 4 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of int<->fp moves for spilling. */
426 8, /* GP2FP */
427 8, /* FP2GP */
428 4 /* FP2FP */
429 };
430
431 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 {
433 1, /* GP2GP */
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
436 2, /* GP2FP */
437 3, /* FP2GP */
438 2 /* FP2FP */
439 };
440
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost =
443 {
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
459 };
460
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 {
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
479 };
480
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost =
483 {
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
499 };
500
501 static const struct cpu_vector_cost tsv110_vector_cost =
502 {
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
518 };
519
520 /* Generic costs for vector insn classes. */
521 static const struct cpu_vector_cost cortexa57_vector_cost =
522 {
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
538 };
539
540 static const struct cpu_vector_cost exynosm1_vector_cost =
541 {
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
557 };
558
559 /* Generic costs for vector insn classes. */
560 static const struct cpu_vector_cost xgene1_vector_cost =
561 {
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
577 };
578
579 /* Costs for vector insn classes for Vulcan. */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 {
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 3, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
597 };
598
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost =
601 {
602 1, /* Predictable. */
603 3 /* Unpredictable. */
604 };
605
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_NONE /* recip_sqrt */
612 };
613
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes =
616 {
617 AARCH64_APPROX_NONE, /* division */
618 AARCH64_APPROX_ALL, /* sqrt */
619 AARCH64_APPROX_ALL /* recip_sqrt */
620 };
621
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes =
624 {
625 AARCH64_APPROX_NONE, /* division */
626 AARCH64_APPROX_NONE, /* sqrt */
627 AARCH64_APPROX_ALL /* recip_sqrt */
628 };
629
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune =
632 {
633 0, /* num_slots */
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
640 };
641
642 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 {
644 0, /* num_slots */
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
651 };
652
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 {
655 4, /* num_slots */
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
662 };
663
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 {
666 8, /* num_slots */
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
673 };
674
675 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 {
677 8, /* num_slots */
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
684 };
685
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 {
688 8, /* num_slots */
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
695 };
696
697 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 {
699 0, /* num_slots */
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
706 };
707
708 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 {
710 8, /* num_slots */
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
717 };
718
719 static const struct tune_params generic_tunings =
720 {
721 &cortexa57_extra_costs,
722 &generic_addrcost_table,
723 &generic_regmove_cost,
724 &generic_vector_cost,
725 &generic_branch_cost,
726 &generic_approx_modes,
727 SVE_NOT_IMPLEMENTED, /* sve_width */
728 4, /* memmov_cost */
729 2, /* issue_rate */
730 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
742 &generic_prefetch_tune
743 };
744
745 static const struct tune_params cortexa35_tunings =
746 {
747 &cortexa53_extra_costs,
748 &generic_addrcost_table,
749 &cortexa53_regmove_cost,
750 &generic_vector_cost,
751 &generic_branch_cost,
752 &generic_approx_modes,
753 SVE_NOT_IMPLEMENTED, /* sve_width */
754 4, /* memmov_cost */
755 1, /* issue_rate */
756 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
769 &generic_prefetch_tune
770 };
771
772 static const struct tune_params cortexa53_tunings =
773 {
774 &cortexa53_extra_costs,
775 &generic_addrcost_table,
776 &cortexa53_regmove_cost,
777 &generic_vector_cost,
778 &generic_branch_cost,
779 &generic_approx_modes,
780 SVE_NOT_IMPLEMENTED, /* sve_width */
781 4, /* memmov_cost */
782 2, /* issue_rate */
783 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params cortexa57_tunings =
800 {
801 &cortexa57_extra_costs,
802 &generic_addrcost_table,
803 &cortexa57_regmove_cost,
804 &cortexa57_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 SVE_NOT_IMPLEMENTED, /* sve_width */
808 4, /* memmov_cost */
809 3, /* issue_rate */
810 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
823 &generic_prefetch_tune
824 };
825
826 static const struct tune_params cortexa72_tunings =
827 {
828 &cortexa57_extra_costs,
829 &generic_addrcost_table,
830 &cortexa57_regmove_cost,
831 &cortexa57_vector_cost,
832 &generic_branch_cost,
833 &generic_approx_modes,
834 SVE_NOT_IMPLEMENTED, /* sve_width */
835 4, /* memmov_cost */
836 3, /* issue_rate */
837 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
851 };
852
853 static const struct tune_params cortexa73_tunings =
854 {
855 &cortexa57_extra_costs,
856 &generic_addrcost_table,
857 &cortexa57_regmove_cost,
858 &cortexa57_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 SVE_NOT_IMPLEMENTED, /* sve_width */
862 4, /* memmov_cost. */
863 2, /* issue_rate. */
864 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &generic_prefetch_tune
878 };
879
880
881
882 static const struct tune_params exynosm1_tunings =
883 {
884 &exynosm1_extra_costs,
885 &exynosm1_addrcost_table,
886 &exynosm1_regmove_cost,
887 &exynosm1_vector_cost,
888 &generic_branch_cost,
889 &exynosm1_approx_modes,
890 SVE_NOT_IMPLEMENTED, /* sve_width */
891 4, /* memmov_cost */
892 3, /* issue_rate */
893 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
905 &exynosm1_prefetch_tune
906 };
907
908 static const struct tune_params thunderxt88_tunings =
909 {
910 &thunderx_extra_costs,
911 &generic_addrcost_table,
912 &thunderx_regmove_cost,
913 &thunderx_vector_cost,
914 &generic_branch_cost,
915 &generic_approx_modes,
916 SVE_NOT_IMPLEMENTED, /* sve_width */
917 6, /* memmov_cost */
918 2, /* issue_rate */
919 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
931 &thunderxt88_prefetch_tune
932 };
933
934 static const struct tune_params thunderx_tunings =
935 {
936 &thunderx_extra_costs,
937 &generic_addrcost_table,
938 &thunderx_regmove_cost,
939 &thunderx_vector_cost,
940 &generic_branch_cost,
941 &generic_approx_modes,
942 SVE_NOT_IMPLEMENTED, /* sve_width */
943 6, /* memmov_cost */
944 2, /* issue_rate */
945 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
958 &thunderx_prefetch_tune
959 };
960
961 static const struct tune_params tsv110_tunings =
962 {
963 &tsv110_extra_costs,
964 &tsv110_addrcost_table,
965 &tsv110_regmove_cost,
966 &tsv110_vector_cost,
967 &generic_branch_cost,
968 &generic_approx_modes,
969 SVE_NOT_IMPLEMENTED, /* sve_width */
970 4, /* memmov_cost */
971 4, /* issue_rate */
972 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
985 &tsv110_prefetch_tune
986 };
987
988 static const struct tune_params xgene1_tunings =
989 {
990 &xgene1_extra_costs,
991 &xgene1_addrcost_table,
992 &xgene1_regmove_cost,
993 &xgene1_vector_cost,
994 &generic_branch_cost,
995 &xgene1_approx_modes,
996 SVE_NOT_IMPLEMENTED, /* sve_width */
997 6, /* memmov_cost */
998 4, /* issue_rate */
999 AARCH64_FUSE_NOTHING, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1011 &xgene1_prefetch_tune
1012 };
1013
1014 static const struct tune_params emag_tunings =
1015 {
1016 &xgene1_extra_costs,
1017 &xgene1_addrcost_table,
1018 &xgene1_regmove_cost,
1019 &xgene1_vector_cost,
1020 &generic_branch_cost,
1021 &xgene1_approx_modes,
1022 SVE_NOT_IMPLEMENTED, /* sve_width */
1023 6, /* memmov_cost */
1024 4, /* issue_rate */
1025 AARCH64_FUSE_NOTHING, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1037 &xgene1_prefetch_tune
1038 };
1039
1040 static const struct tune_params qdf24xx_tunings =
1041 {
1042 &qdf24xx_extra_costs,
1043 &qdf24xx_addrcost_table,
1044 &qdf24xx_regmove_cost,
1045 &qdf24xx_vector_cost,
1046 &generic_branch_cost,
1047 &generic_approx_modes,
1048 SVE_NOT_IMPLEMENTED, /* sve_width */
1049 4, /* memmov_cost */
1050 4, /* issue_rate */
1051 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1065 };
1066
1067 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1068 for now. */
1069 static const struct tune_params saphira_tunings =
1070 {
1071 &generic_extra_costs,
1072 &generic_addrcost_table,
1073 &generic_regmove_cost,
1074 &generic_vector_cost,
1075 &generic_branch_cost,
1076 &generic_approx_modes,
1077 SVE_NOT_IMPLEMENTED, /* sve_width */
1078 4, /* memmov_cost */
1079 4, /* issue_rate */
1080 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1093 &generic_prefetch_tune
1094 };
1095
1096 static const struct tune_params thunderx2t99_tunings =
1097 {
1098 &thunderx2t99_extra_costs,
1099 &thunderx2t99_addrcost_table,
1100 &thunderx2t99_regmove_cost,
1101 &thunderx2t99_vector_cost,
1102 &generic_branch_cost,
1103 &generic_approx_modes,
1104 SVE_NOT_IMPLEMENTED, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1121 };
1122
1123 static const struct tune_params neoversen1_tunings =
1124 {
1125 &cortexa57_extra_costs,
1126 &generic_addrcost_table,
1127 &generic_regmove_cost,
1128 &cortexa57_vector_cost,
1129 &generic_branch_cost,
1130 &generic_approx_modes,
1131 SVE_NOT_IMPLEMENTED, /* sve_width */
1132 4, /* memmov_cost */
1133 3, /* issue_rate */
1134 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1146 &generic_prefetch_tune
1147 };
1148
1149 /* Support for fine-grained override of the tuning structures. */
1150 struct aarch64_tuning_override_function
1151 {
1152 const char* name;
1153 void (*parse_override)(const char*, struct tune_params*);
1154 };
1155
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions[] =
1162 {
1163 { "fuse", aarch64_parse_fuse_string },
1164 { "tune", aarch64_parse_tune_string },
1165 { "sve_width", aarch64_parse_sve_width_string },
1166 { NULL, NULL }
1167 };
1168
1169 /* A processor implementing AArch64. */
1170 struct processor
1171 {
1172 const char *const name;
1173 enum aarch64_processor ident;
1174 enum aarch64_processor sched_core;
1175 enum aarch64_arch arch;
1176 unsigned architecture_version;
1177 const uint64_t flags;
1178 const struct tune_params *const tune;
1179 };
1180
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures[] =
1183 {
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 };
1189
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores[] =
1192 {
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1199 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1200 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1201 };
1202
1203
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor *selected_arch;
1207 static const struct processor *selected_cpu;
1208 static const struct processor *selected_tune;
1209
1210 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params = generic_tunings;
1214
1215 /* Table of machine attributes. */
1216 static const struct attribute_spec aarch64_attribute_table[] =
1217 {
1218 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1219 affects_type_identity, handler, exclude } */
1220 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1221 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1222 };
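/* A minimal usage sketch (hypothetical declaration, not taken from this
   file):

     void vec_fn (void) __attribute__ ((aarch64_vector_pcs));

   declares a function whose type carries the attribute above, which in
   turn selects the vector PCS variant via aarch64_fntype_abi below.  */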
1223
1224 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1225
1226 /* An ISA extension in the co-processor and main instruction set space. */
1227 struct aarch64_option_extension
1228 {
1229 const char *const name;
1230 const unsigned long flags_on;
1231 const unsigned long flags_off;
1232 };
1233
1234 typedef enum aarch64_cond_code
1235 {
1236 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1237 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1238 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1239 }
1240 aarch64_cc;
1241
1242 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1243
1244 struct aarch64_branch_protect_type
1245 {
1246 /* The type's name that the user passes to the branch-protection option
1247 string. */
1248 const char* name;
1249 /* Function to handle the protection type and set global variables.
1250 First argument is the string token corresponding to this type and the
1251 second argument is the next token in the option string.
1252 Return values:
1253 * AARCH64_PARSE_OK: Handling was successful.
1254 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1255 should print an error.
1256 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1257 own error. */
1258 enum aarch64_parse_opt_result (*handler)(char*, char*);
1259 /* A list of types that can follow this type in the option string. */
1260 const aarch64_branch_protect_type* subtypes;
1261 unsigned int num_subtypes;
1262 };
1263
1264 static enum aarch64_parse_opt_result
1265 aarch64_handle_no_branch_protection (char* str, char* rest)
1266 {
1267 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1268 aarch64_enable_bti = 0;
1269 if (rest)
1270 {
1271 error ("unexpected %<%s%> after %<%s%>", rest, str);
1272 return AARCH64_PARSE_INVALID_FEATURE;
1273 }
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static enum aarch64_parse_opt_result
1278 aarch64_handle_standard_branch_protection (char* str, char* rest)
1279 {
1280 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1281 aarch64_ra_sign_key = AARCH64_KEY_A;
1282 aarch64_enable_bti = 1;
1283 if (rest)
1284 {
1285 error ("unexpected %<%s%> after %<%s%>", rest, str);
1286 return AARCH64_PARSE_INVALID_FEATURE;
1287 }
1288 return AARCH64_PARSE_OK;
1289 }
1290
1291 static enum aarch64_parse_opt_result
1292 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1293 char* rest ATTRIBUTE_UNUSED)
1294 {
1295 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1296 aarch64_ra_sign_key = AARCH64_KEY_A;
1297 return AARCH64_PARSE_OK;
1298 }
1299
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1302 char* rest ATTRIBUTE_UNUSED)
1303 {
1304 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1305 return AARCH64_PARSE_OK;
1306 }
1307
1308 static enum aarch64_parse_opt_result
1309 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1310 char* rest ATTRIBUTE_UNUSED)
1311 {
1312 aarch64_ra_sign_key = AARCH64_KEY_B;
1313 return AARCH64_PARSE_OK;
1314 }
1315
1316 static enum aarch64_parse_opt_result
1317 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1318 char* rest ATTRIBUTE_UNUSED)
1319 {
1320 aarch64_enable_bti = 1;
1321 return AARCH64_PARSE_OK;
1322 }
1323
1324 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1325 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1326 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1327 { NULL, NULL, NULL, 0 }
1328 };
1329
1330 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1331 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1332 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1333 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1334 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1335 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1336 { NULL, NULL, NULL, 0 }
1337 };
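/* As a worked example of how the tables above are used: parsing
   "-mbranch-protection=pac-ret+leaf+b-key" runs the "pac-ret" handler
   (return-address signing for non-leaf functions with key A), then the
   "leaf" subtype widens the signing scope to all functions and the
   "b-key" subtype switches the signing key to key B.  */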
1338
1339 /* The condition codes of the processor, and the inverse function. */
1340 static const char * const aarch64_condition_codes[] =
1341 {
1342 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1343 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 };
1345
1346 /* The preferred condition codes for SVE conditions. */
1347 static const char *const aarch64_sve_condition_codes[] =
1348 {
1349 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1350 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 };
1352
1353 /* Return the assembly token for svpattern value PATTERN. */
1354
1355 static const char *
1356 svpattern_token (enum aarch64_svpattern pattern)
1357 {
1358 switch (pattern)
1359 {
1360 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1361 AARCH64_FOR_SVPATTERN (CASE)
1362 #undef CASE
1363 case AARCH64_NUM_SVPATTERNS:
1364 break;
1365 }
1366 gcc_unreachable ();
1367 }
1368
1369 /* Return the descriptor of the SIMD ABI. */
1370
1371 static const predefined_function_abi &
1372 aarch64_simd_abi (void)
1373 {
1374 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1375 if (!simd_abi.initialized_p ())
1376 {
1377 HARD_REG_SET full_reg_clobbers
1378 = default_function_abi.full_reg_clobbers ();
1379 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1380 if (FP_SIMD_SAVED_REGNUM_P (regno))
1381 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1382 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1383 }
1384 return simd_abi;
1385 }
1386
1387 /* Generate code to enable conditional branches in functions over 1 MiB. */
1388 const char *
1389 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1390 const char * branch_format)
1391 {
1392 rtx_code_label * tmp_label = gen_label_rtx ();
1393 char label_buf[256];
1394 char buffer[128];
1395 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1396 CODE_LABEL_NUMBER (tmp_label));
1397 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1398 rtx dest_label = operands[pos_label];
1399 operands[pos_label] = tmp_label;
1400
1401 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1402 output_asm_insn (buffer, operands);
1403
1404 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1405 operands[pos_label] = dest_label;
1406 output_asm_insn (buffer, operands);
1407 return "";
1408 }
1409
1410 void
1411 aarch64_err_no_fpadvsimd (machine_mode mode)
1412 {
1413 if (TARGET_GENERAL_REGS_ONLY)
1414 if (FLOAT_MODE_P (mode))
1415 error ("%qs is incompatible with the use of floating-point types",
1416 "-mgeneral-regs-only");
1417 else
1418 error ("%qs is incompatible with the use of vector types",
1419 "-mgeneral-regs-only");
1420 else
1421 if (FLOAT_MODE_P (mode))
1422 error ("%qs feature modifier is incompatible with the use of"
1423 " floating-point types", "+nofp");
1424 else
1425 error ("%qs feature modifier is incompatible with the use of"
1426 " vector types", "+nofp");
1427 }
1428
1429 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1430 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1431 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1432 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1433 and GENERAL_REGS is lower than the memory cost (in this case the best class
1434 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1435 cost results in bad allocations with many redundant int<->FP moves which
1436 are expensive on various cores.
1437 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1438 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1439 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1440 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1441 The result of this is that it is no longer inefficient to have a higher
1442 memory move cost than the register move cost.
1443 */
1444
1445 static reg_class_t
1446 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1447 reg_class_t best_class)
1448 {
1449 machine_mode mode;
1450
1451 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1452 || !reg_class_subset_p (FP_REGS, allocno_class))
1453 return allocno_class;
1454
1455 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1456 || !reg_class_subset_p (FP_REGS, best_class))
1457 return best_class;
1458
1459 mode = PSEUDO_REGNO_MODE (regno);
1460 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1461 }
1462
1463 static unsigned int
1464 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1465 {
1466 if (GET_MODE_UNIT_SIZE (mode) == 4)
1467 return aarch64_tune_params.min_div_recip_mul_sf;
1468 return aarch64_tune_params.min_div_recip_mul_df;
1469 }
1470
1471 /* Return the reassociation width of treeop OPC with mode MODE. */
1472 static int
1473 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1474 {
1475 if (VECTOR_MODE_P (mode))
1476 return aarch64_tune_params.vec_reassoc_width;
1477 if (INTEGRAL_MODE_P (mode))
1478 return aarch64_tune_params.int_reassoc_width;
1479 /* Avoid reassociating floating point addition so we emit more FMAs. */
1480 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1481 return aarch64_tune_params.fp_reassoc_width;
1482 return 1;
1483 }
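/* For example, with generic_tunings this returns a width of 2 for integer
   ops and 4 for FP ops other than addition, but deliberately returns 1 for
   FP additions so that addition chains stay contractable into FMAs.  */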
1484
1485 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1486 unsigned
1487 aarch64_dbx_register_number (unsigned regno)
1488 {
1489 if (GP_REGNUM_P (regno))
1490 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1491 else if (regno == SP_REGNUM)
1492 return AARCH64_DWARF_SP;
1493 else if (FP_REGNUM_P (regno))
1494 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1495 else if (PR_REGNUM_P (regno))
1496 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1497 else if (regno == VG_REGNUM)
1498 return AARCH64_DWARF_VG;
1499
1500 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1501 equivalent DWARF register. */
1502 return DWARF_FRAME_REGISTERS;
1503 }
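/* For example, under the standard AArch64 DWARF numbering this maps x0-x30
   to 0-30, sp to 31, v0-v31 to 64-95, p0-p15 to 48-63 and VG to 46; other
   registers (e.g. the condition flags) report "no DWARF equivalent".  */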
1504
1505 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1506 integer, otherwise return X unmodified. */
1507 static rtx
1508 aarch64_bit_representation (rtx x)
1509 {
1510 if (CONST_DOUBLE_P (x))
1511 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1512 return x;
1513 }
1514
1515 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1516 static bool
1517 aarch64_advsimd_struct_mode_p (machine_mode mode)
1518 {
1519 return (TARGET_SIMD
1520 && (mode == OImode || mode == CImode || mode == XImode));
1521 }
1522
1523 /* Return true if MODE is an SVE predicate mode. */
1524 static bool
1525 aarch64_sve_pred_mode_p (machine_mode mode)
1526 {
1527 return (TARGET_SVE
1528 && (mode == VNx16BImode
1529 || mode == VNx8BImode
1530 || mode == VNx4BImode
1531 || mode == VNx2BImode));
1532 }
1533
1534 /* Three mutually-exclusive flags describing a vector or predicate type. */
1535 const unsigned int VEC_ADVSIMD = 1;
1536 const unsigned int VEC_SVE_DATA = 2;
1537 const unsigned int VEC_SVE_PRED = 4;
1538 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1539 a structure of 2, 3 or 4 vectors. */
1540 const unsigned int VEC_STRUCT = 8;
1541 /* Useful combinations of the above. */
1542 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1543 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1544
1545 /* Return a set of flags describing the vector properties of mode MODE.
1546 Ignore modes that are not supported by the current target. */
1547 static unsigned int
1548 aarch64_classify_vector_mode (machine_mode mode)
1549 {
1550 if (aarch64_advsimd_struct_mode_p (mode))
1551 return VEC_ADVSIMD | VEC_STRUCT;
1552
1553 if (aarch64_sve_pred_mode_p (mode))
1554 return VEC_SVE_PRED;
1555
1556 /* Make the decision based on the mode's enum value rather than its
1557 properties, so that we keep the correct classification regardless
1558 of -msve-vector-bits. */
1559 switch (mode)
1560 {
1561 /* Single SVE vectors. */
1562 case E_VNx16QImode:
1563 case E_VNx8HImode:
1564 case E_VNx4SImode:
1565 case E_VNx2DImode:
1566 case E_VNx8HFmode:
1567 case E_VNx4SFmode:
1568 case E_VNx2DFmode:
1569 return TARGET_SVE ? VEC_SVE_DATA : 0;
1570
1571 /* x2 SVE vectors. */
1572 case E_VNx32QImode:
1573 case E_VNx16HImode:
1574 case E_VNx8SImode:
1575 case E_VNx4DImode:
1576 case E_VNx16HFmode:
1577 case E_VNx8SFmode:
1578 case E_VNx4DFmode:
1579 /* x3 SVE vectors. */
1580 case E_VNx48QImode:
1581 case E_VNx24HImode:
1582 case E_VNx12SImode:
1583 case E_VNx6DImode:
1584 case E_VNx24HFmode:
1585 case E_VNx12SFmode:
1586 case E_VNx6DFmode:
1587 /* x4 SVE vectors. */
1588 case E_VNx64QImode:
1589 case E_VNx32HImode:
1590 case E_VNx16SImode:
1591 case E_VNx8DImode:
1592 case E_VNx32HFmode:
1593 case E_VNx16SFmode:
1594 case E_VNx8DFmode:
1595 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1596
1597 /* 64-bit Advanced SIMD vectors. */
1598 case E_V8QImode:
1599 case E_V4HImode:
1600 case E_V2SImode:
1601 /* ...E_V1DImode doesn't exist. */
1602 case E_V4HFmode:
1603 case E_V2SFmode:
1604 case E_V1DFmode:
1605 /* 128-bit Advanced SIMD vectors. */
1606 case E_V16QImode:
1607 case E_V8HImode:
1608 case E_V4SImode:
1609 case E_V2DImode:
1610 case E_V8HFmode:
1611 case E_V4SFmode:
1612 case E_V2DFmode:
1613 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1614
1615 default:
1616 return 0;
1617 }
1618 }
1619
1620 /* Return true if MODE is any of the data vector modes, including
1621 structure modes. */
1622 static bool
1623 aarch64_vector_data_mode_p (machine_mode mode)
1624 {
1625 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1626 }
1627
1628 /* Return true if MODE is any form of SVE mode, including predicates,
1629 vectors and structures. */
1630 bool
1631 aarch64_sve_mode_p (machine_mode mode)
1632 {
1633 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1634 }
1635
1636 /* Return true if MODE is an SVE data vector mode; either a single vector
1637 or a structure of vectors. */
1638 static bool
1639 aarch64_sve_data_mode_p (machine_mode mode)
1640 {
1641 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1642 }
1643
1644 /* Implement target hook TARGET_ARRAY_MODE. */
1645 static opt_machine_mode
1646 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1647 {
1648 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1649 && IN_RANGE (nelems, 2, 4))
1650 return mode_for_vector (GET_MODE_INNER (mode),
1651 GET_MODE_NUNITS (mode) * nelems);
1652
1653 return opt_machine_mode ();
1654 }
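/* For example, an array of three VNx4SImode vectors (as used by SVE
   LD3/ST3) maps onto the VNx12SImode structure mode classified above.  */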
1655
1656 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1657 static bool
1658 aarch64_array_mode_supported_p (machine_mode mode,
1659 unsigned HOST_WIDE_INT nelems)
1660 {
1661 if (TARGET_SIMD
1662 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1663 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1664 && (nelems >= 2 && nelems <= 4))
1665 return true;
1666
1667 return false;
1668 }
1669
1670 /* Return the SVE predicate mode to use for elements that have
1671 ELEM_NBYTES bytes, if such a mode exists. */
1672
1673 opt_machine_mode
1674 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1675 {
1676 if (TARGET_SVE)
1677 {
1678 if (elem_nbytes == 1)
1679 return VNx16BImode;
1680 if (elem_nbytes == 2)
1681 return VNx8BImode;
1682 if (elem_nbytes == 4)
1683 return VNx4BImode;
1684 if (elem_nbytes == 8)
1685 return VNx2BImode;
1686 }
1687 return opt_machine_mode ();
1688 }
1689
1690 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1691
1692 static opt_machine_mode
1693 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1694 {
1695 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1696 {
1697 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1698 machine_mode pred_mode;
1699 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1700 return pred_mode;
1701 }
1702
1703 return default_get_mask_mode (nunits, nbytes);
1704 }
1705
1706 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1707
1708 static opt_machine_mode
1709 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1710 {
1711 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1712 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1713 machine_mode mode;
1714 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1715 if (inner_mode == GET_MODE_INNER (mode)
1716 && known_eq (nunits, GET_MODE_NUNITS (mode))
1717 && aarch64_sve_data_mode_p (mode))
1718 return mode;
1719 return opt_machine_mode ();
1720 }
1721
1722 /* Return the integer element mode associated with SVE mode MODE. */
1723
1724 static scalar_int_mode
1725 aarch64_sve_element_int_mode (machine_mode mode)
1726 {
1727 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1728 GET_MODE_NUNITS (mode));
1729 return int_mode_for_size (elt_bits, 0).require ();
1730 }
1731
1732 /* Return the integer vector mode associated with SVE mode MODE.
1733 Unlike mode_for_int_vector, this can handle the case in which
1734 MODE is a predicate (and thus has a different total size). */
1735
1736 static machine_mode
1737 aarch64_sve_int_mode (machine_mode mode)
1738 {
1739 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1740 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1741 }
1742
1743 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1744 prefer to use the first arithmetic operand as the else value if
1745 the else value doesn't matter, since that exactly matches the SVE
1746 destructive merging form. For ternary operations we could either
1747 pick the first operand and use FMAD-like instructions or the last
1748 operand and use FMLA-like instructions; the latter seems more
1749 natural. */
1750
1751 static tree
1752 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1753 {
1754 return nops == 3 ? ops[2] : ops[0];
1755 }
1756
1757 /* Implement TARGET_HARD_REGNO_NREGS. */
1758
1759 static unsigned int
1760 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1761 {
1762 /* ??? Logically we should only need to provide a value when
1763 HARD_REGNO_MODE_OK says that the combination is valid,
1764 but at the moment we need to handle all modes. Just ignore
1765 any runtime parts for registers that can't store them. */
1766 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1767 switch (aarch64_regno_regclass (regno))
1768 {
1769 case FP_REGS:
1770 case FP_LO_REGS:
1771 case FP_LO8_REGS:
1772 if (aarch64_sve_data_mode_p (mode))
1773 return exact_div (GET_MODE_SIZE (mode),
1774 BYTES_PER_SVE_VECTOR).to_constant ();
1775 return CEIL (lowest_size, UNITS_PER_VREG);
1776 case PR_REGS:
1777 case PR_LO_REGS:
1778 case PR_HI_REGS:
1779 return 1;
1780 default:
1781 return CEIL (lowest_size, UNITS_PER_WORD);
1782 }
1783 gcc_unreachable ();
1784 }
1785
1786 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1787
1788 static bool
1789 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1790 {
1791 if (GET_MODE_CLASS (mode) == MODE_CC)
1792 return regno == CC_REGNUM;
1793
1794 if (regno == VG_REGNUM)
1795 /* This must have the same size as _Unwind_Word. */
1796 return mode == DImode;
1797
1798 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1799 if (vec_flags & VEC_SVE_PRED)
1800 return PR_REGNUM_P (regno);
1801
1802 if (PR_REGNUM_P (regno))
1803 return 0;
1804
1805 if (regno == SP_REGNUM)
1806 /* The purpose of comparing with ptr_mode is to support the
1807 global register variable associated with the stack pointer
1808 register via the syntax of asm ("wsp") in ILP32. */
1809 return mode == Pmode || mode == ptr_mode;
1810
1811 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1812 return mode == Pmode;
1813
1814 if (GP_REGNUM_P (regno))
1815 {
1816 if (known_le (GET_MODE_SIZE (mode), 8))
1817 return true;
1818 else if (known_le (GET_MODE_SIZE (mode), 16))
1819 return (regno & 1) == 0;
1820 }
1821 else if (FP_REGNUM_P (regno))
1822 {
1823 if (vec_flags & VEC_STRUCT)
1824 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1825 else
1826 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1827 }
1828
1829 return false;
1830 }
1831
1832 /* Implement TARGET_FNTYPE_ABI. */
1833
1834 static const predefined_function_abi &
1835 aarch64_fntype_abi (const_tree fntype)
1836 {
1837 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
1838 return aarch64_simd_abi ();
1839 return default_function_abi;
1840 }
1841
1842 /* Return true if this is a definition of a vectorized simd function. */
1843
1844 static bool
1845 aarch64_simd_decl_p (tree fndecl)
1846 {
1847 tree fntype;
1848
1849 if (fndecl == NULL)
1850 return false;
1851 fntype = TREE_TYPE (fndecl);
1852 if (fntype == NULL)
1853 return false;
1854
1855 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1856 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1857 return true;
1858
1859 return false;
1860 }
1861
1862 /* Return the mode a register save/restore should use. DImode for integer
1863 registers, DFmode for FP registers in non-SIMD functions (they only save
1864 the bottom half of a 128-bit register), or TFmode for FP registers in
1865 SIMD functions. */
1866
1867 static machine_mode
1868 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1869 {
1870 return GP_REGNUM_P (regno)
1871 ? E_DImode
1872 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1873 }
1874
1875 /* Implement TARGET_INSN_CALLEE_ABI. */
1876
1877 const predefined_function_abi &
1878 aarch64_insn_callee_abi (const rtx_insn *insn)
1879 {
1880 rtx pat = PATTERN (insn);
1881 gcc_assert (GET_CODE (pat) == PARALLEL);
1882 rtx unspec = XVECEXP (pat, 0, 1);
1883 gcc_assert (GET_CODE (unspec) == UNSPEC
1884 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
1885 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
1886 }
1887
1888 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1889 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1890 clobbers the top 64 bits when restoring the bottom 64 bits. */
1891
1892 static bool
1893 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
1894 unsigned int regno,
1895 machine_mode mode)
1896 {
1897 if (FP_REGNUM_P (regno))
1898 {
1899 poly_int64 per_register_size = GET_MODE_SIZE (mode);
1900 unsigned int nregs = hard_regno_nregs (regno, mode);
1901 if (nregs > 1)
1902 per_register_size = exact_div (per_register_size, nregs);
1903 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
1904 return maybe_gt (per_register_size, 16);
1905 return maybe_gt (per_register_size, 8);
1906 }
1907 return false;
1908 }
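
/* For example, a TFmode value in an FP register (16 bytes per register) is
   partially clobbered under the base PCS, where only the low 64 bits are
   preserved, but not under ARM_PCS_SIMD or ARM_PCS_TLSDESC, which preserve
   the full 128 bits.  An SVE data mode such as VNx4SImode can exceed 128 bits
   at run time, so it counts as partially clobbered for every ABI handled
   here.  */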
1909
1910 /* Implement REGMODE_NATURAL_SIZE. */
1911 poly_uint64
1912 aarch64_regmode_natural_size (machine_mode mode)
1913 {
1914 /* The natural size for SVE data modes is one SVE data vector,
1915 and similarly for predicates. We can't independently modify
1916 anything smaller than that. */
1917 /* ??? For now, only do this for variable-width SVE registers.
1918 Doing it for constant-sized registers breaks lower-subreg.c. */
1919 /* ??? And once that's fixed, we should probably have similar
1920 code for Advanced SIMD. */
1921 if (!aarch64_sve_vg.is_constant ())
1922 {
1923 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1924 if (vec_flags & VEC_SVE_PRED)
1925 return BYTES_PER_SVE_PRED;
1926 if (vec_flags & VEC_SVE_DATA)
1927 return BYTES_PER_SVE_VECTOR;
1928 }
1929 return UNITS_PER_WORD;
1930 }
1931
1932 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1933 machine_mode
1934 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1935 machine_mode mode)
1936 {
1937 /* The predicate mode determines which bits are significant and
1938 which are "don't care". Decreasing the number of lanes would
1939 lose data while increasing the number of lanes would make bits
1940 unnecessarily significant. */
1941 if (PR_REGNUM_P (regno))
1942 return mode;
1943 if (known_ge (GET_MODE_SIZE (mode), 4))
1944 return mode;
1945 else
1946 return SImode;
1947 }
1948
1949 /* Return true if I's bits are consecutive ones from the MSB. */
1950 bool
1951 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1952 {
1953 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1954 }
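
/* For example, 0xffffffffffff0000 gives -i == 0x10000, a power of two, so the
   function returns true (bits 63..16 are all ones).  0x00ff000000000000 gives
   a non-power-of-two negation, so it returns false: its set bits do not reach
   the most significant bit.  */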
1955
1956 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1957 that strcpy from constants will be faster. */
1958
1959 static HOST_WIDE_INT
1960 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1961 {
1962 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1963 return MAX (align, BITS_PER_WORD);
1964 return align;
1965 }
1966
1967 /* Return true if calls to DECL should be treated as
1968 long-calls (i.e. called via a register). */
1969 static bool
1970 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1971 {
1972 return false;
1973 }
1974
1975 /* Return true if calls to symbol-ref SYM should be treated as
1976 long-calls (i.e. called via a register). */
1977 bool
1978 aarch64_is_long_call_p (rtx sym)
1979 {
1980 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1981 }
1982
1983 /* Return true if calls to symbol-ref SYM should not go through
1984 plt stubs. */
1985
1986 bool
1987 aarch64_is_noplt_call_p (rtx sym)
1988 {
1989 const_tree decl = SYMBOL_REF_DECL (sym);
1990
1991 if (flag_pic
1992 && decl
1993 && (!flag_plt
1994 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1995 && !targetm.binds_local_p (decl))
1996 return true;
1997
1998 return false;
1999 }
2000
2001 /* Return true if the offsets to a zero/sign-extract operation
2002 represent an expression that matches an extend operation. The
2003 operands represent the parameters from
2004
2005 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2006 bool
2007 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2008 rtx extract_imm)
2009 {
2010 HOST_WIDE_INT mult_val, extract_val;
2011
2012 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2013 return false;
2014
2015 mult_val = INTVAL (mult_imm);
2016 extract_val = INTVAL (extract_imm);
2017
2018 if (extract_val > 8
2019 && extract_val < GET_MODE_BITSIZE (mode)
2020 && exact_log2 (extract_val & ~7) > 0
2021 && (extract_val & 7) <= 4
2022 && mult_val == (1 << (extract_val & 7)))
2023 return true;
2024
2025 return false;
2026 }
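
/* A worked example: with MODE == DImode, EXTRACT_IMM == 34 and MULT_IMM == 4,
   we have 34 > 8, 34 < 64, 34 & ~7 == 32 (a power of two), 34 & 7 == 2 <= 4
   and 4 == 1 << 2, so the function returns true.  This roughly corresponds to
   extracting the low 34 bits of (reg * 4), i.e. a 32-bit value extended and
   then shifted left by 2, as used in extended-register address
   arithmetic.  */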
2027
2028 /* Emit an insn that's a simple single-set. Both the operands must be
2029 known to be valid. */
2030 inline static rtx_insn *
2031 emit_set_insn (rtx x, rtx y)
2032 {
2033 return emit_insn (gen_rtx_SET (x, y));
2034 }
2035
2036 /* X and Y are two things to compare using CODE. Emit the compare insn and
2037 return the rtx for the flags register (CC_REGNUM) in the proper mode. */
2038 rtx
2039 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2040 {
2041 machine_mode cmp_mode = GET_MODE (x);
2042 machine_mode cc_mode;
2043 rtx cc_reg;
2044
2045 if (cmp_mode == TImode)
2046 {
2047 gcc_assert (code == NE);
2048
2049 cc_mode = CCmode;
2050 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2051
2052 rtx x_lo = operand_subword (x, 0, 0, TImode);
2053 rtx y_lo = operand_subword (y, 0, 0, TImode);
2054 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2055
2056 rtx x_hi = operand_subword (x, 1, 0, TImode);
2057 rtx y_hi = operand_subword (y, 1, 0, TImode);
2058 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2059 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2060 GEN_INT (AARCH64_EQ)));
2061 }
2062 else
2063 {
2064 cc_mode = SELECT_CC_MODE (code, x, y);
2065 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2066 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2067 }
2068 return cc_reg;
2069 }
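
/* A minimal usage sketch (illustrative only; OP0, OP1 and LABEL are assumed
   to be a pair of DImode registers and a code label supplied by the caller):
   compare OP0 with OP1 and branch to LABEL when OP0 < OP1.  */
#if 0
  rtx cc_reg = aarch64_gen_compare_reg (LT, op0, op1);
  rtx cond = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx);
  emit_jump_insn (gen_rtx_SET (pc_rtx,
			       gen_rtx_IF_THEN_ELSE (VOIDmode, cond,
						     gen_rtx_LABEL_REF (VOIDmode,
									label),
						     pc_rtx)));
#endif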
2070
2071 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2072
2073 static rtx
2074 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2075 machine_mode y_mode)
2076 {
2077 if (y_mode == E_QImode || y_mode == E_HImode)
2078 {
2079 if (CONST_INT_P (y))
2080 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2081 else
2082 {
2083 rtx t, cc_reg;
2084 machine_mode cc_mode;
2085
2086 t = gen_rtx_ZERO_EXTEND (SImode, y);
2087 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2088 cc_mode = CC_SWPmode;
2089 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2090 emit_set_insn (cc_reg, t);
2091 return cc_reg;
2092 }
2093 }
2094
2095 if (!aarch64_plus_operand (y, y_mode))
2096 y = force_reg (y_mode, y);
2097
2098 return aarch64_gen_compare_reg (code, x, y);
2099 }
2100
2101 /* Build the SYMBOL_REF for __tls_get_addr. */
2102
2103 static GTY(()) rtx tls_get_addr_libfunc;
2104
2105 rtx
2106 aarch64_tls_get_addr (void)
2107 {
2108 if (!tls_get_addr_libfunc)
2109 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2110 return tls_get_addr_libfunc;
2111 }
2112
2113 /* Return the TLS model to use for ADDR. */
2114
2115 static enum tls_model
2116 tls_symbolic_operand_type (rtx addr)
2117 {
2118 enum tls_model tls_kind = TLS_MODEL_NONE;
2119 if (GET_CODE (addr) == CONST)
2120 {
2121 poly_int64 addend;
2122 rtx sym = strip_offset (addr, &addend);
2123 if (GET_CODE (sym) == SYMBOL_REF)
2124 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2125 }
2126 else if (GET_CODE (addr) == SYMBOL_REF)
2127 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2128
2129 return tls_kind;
2130 }
2131
2132 /* We allow lo_sum's in our legitimate addresses so that combine can
2133 take care of combining addresses where necessary, but for generation
2134 purposes we generate the address
2135 as:
2136 RTL Absolute
2137 tmp = hi (symbol_ref); adrp x1, foo
2138 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2139 nop
2140
2141 PIC TLS
2142 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2143 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2144 bl __tls_get_addr
2145 nop
2146
2147 Load TLS symbol, depending on TLS mechanism and TLS access model.
2148
2149 Global Dynamic - Traditional TLS:
2150 adrp tmp, :tlsgd:imm
2151 add dest, tmp, #:tlsgd_lo12:imm
2152 bl __tls_get_addr
2153
2154 Global Dynamic - TLS Descriptors:
2155 adrp dest, :tlsdesc:imm
2156 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2157 add dest, dest, #:tlsdesc_lo12:imm
2158 blr tmp
2159 mrs tp, tpidr_el0
2160 add dest, dest, tp
2161
2162 Initial Exec:
2163 mrs tp, tpidr_el0
2164 adrp tmp, :gottprel:imm
2165 ldr dest, [tmp, #:gottprel_lo12:imm]
2166 add dest, dest, tp
2167
2168 Local Exec:
2169 mrs tp, tpidr_el0
2170 add t0, tp, #:tprel_hi12:imm, lsl #12
2171 add t0, t0, #:tprel_lo12_nc:imm
2172 */
2173
2174 static void
2175 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2176 enum aarch64_symbol_type type)
2177 {
2178 switch (type)
2179 {
2180 case SYMBOL_SMALL_ABSOLUTE:
2181 {
2182 /* In ILP32, the mode of dest can be either SImode or DImode. */
2183 rtx tmp_reg = dest;
2184 machine_mode mode = GET_MODE (dest);
2185
2186 gcc_assert (mode == Pmode || mode == ptr_mode);
2187
2188 if (can_create_pseudo_p ())
2189 tmp_reg = gen_reg_rtx (mode);
2190
2191 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2192 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2193 return;
2194 }
2195
2196 case SYMBOL_TINY_ABSOLUTE:
2197 emit_insn (gen_rtx_SET (dest, imm));
2198 return;
2199
2200 case SYMBOL_SMALL_GOT_28K:
2201 {
2202 machine_mode mode = GET_MODE (dest);
2203 rtx gp_rtx = pic_offset_table_rtx;
2204 rtx insn;
2205 rtx mem;
2206
2207 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2208 here before RTL expansion. Tree IVOPTS will generate an RTL pattern
2209 to compute rtx costs, in which case pic_offset_table_rtx is not
2210 initialized. In that case there is no need to generate the first
2211 adrp instruction, as the final cost for global variable access is
2212 one instruction. */
2213 if (gp_rtx != NULL)
2214 {
2215 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2216 we use the page base as the GOT base, the first page may be wasted;
2217 in the worst case there is only 28K of space for the GOT).
2218
2219 The generated instruction sequence for accessing a global
2220 variable is:
2221
2222 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2223
2224 Only one instruction is needed. But we must initialize
2225 pic_offset_table_rtx properly. We generate an initialization insn
2226 for every global access, and allow CSE to remove all redundant ones.
2227
2228 The final instruction sequence will look like the following
2229 for multiple global variable accesses.
2230
2231 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2232
2233 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2234 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2235 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2236 ... */
2237
2238 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2239 crtl->uses_pic_offset_table = 1;
2240 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2241
2242 if (mode != GET_MODE (gp_rtx))
2243 gp_rtx = gen_lowpart (mode, gp_rtx);
2244
2245 }
2246
2247 if (mode == ptr_mode)
2248 {
2249 if (mode == DImode)
2250 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2251 else
2252 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2253
2254 mem = XVECEXP (SET_SRC (insn), 0, 0);
2255 }
2256 else
2257 {
2258 gcc_assert (mode == Pmode);
2259
2260 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2261 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2262 }
2263
2264 /* The operand is expected to be a MEM. Whenever the related insn
2265 pattern changes, the above code which calculates MEM should be
2266 updated. */
2267 gcc_assert (GET_CODE (mem) == MEM);
2268 MEM_READONLY_P (mem) = 1;
2269 MEM_NOTRAP_P (mem) = 1;
2270 emit_insn (insn);
2271 return;
2272 }
2273
2274 case SYMBOL_SMALL_GOT_4G:
2275 {
2276 /* In ILP32, the mode of dest can be either SImode or DImode,
2277 while the got entry is always of SImode size. The mode of
2278 dest depends on how dest is used: if dest is assigned to a
2279 pointer (e.g. in memory), it has SImode; it may have
2280 DImode if dest is dereferenced to access the memory.
2281 This is why we have to handle three different ldr_got_small
2282 patterns here (two patterns for ILP32). */
2283
2284 rtx insn;
2285 rtx mem;
2286 rtx tmp_reg = dest;
2287 machine_mode mode = GET_MODE (dest);
2288
2289 if (can_create_pseudo_p ())
2290 tmp_reg = gen_reg_rtx (mode);
2291
2292 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2293 if (mode == ptr_mode)
2294 {
2295 if (mode == DImode)
2296 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2297 else
2298 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2299
2300 mem = XVECEXP (SET_SRC (insn), 0, 0);
2301 }
2302 else
2303 {
2304 gcc_assert (mode == Pmode);
2305
2306 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2307 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2308 }
2309
2310 gcc_assert (GET_CODE (mem) == MEM);
2311 MEM_READONLY_P (mem) = 1;
2312 MEM_NOTRAP_P (mem) = 1;
2313 emit_insn (insn);
2314 return;
2315 }
2316
2317 case SYMBOL_SMALL_TLSGD:
2318 {
2319 rtx_insn *insns;
2320 machine_mode mode = GET_MODE (dest);
2321 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2322
2323 start_sequence ();
2324 if (TARGET_ILP32)
2325 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2326 else
2327 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2328 insns = get_insns ();
2329 end_sequence ();
2330
2331 RTL_CONST_CALL_P (insns) = 1;
2332 emit_libcall_block (insns, dest, result, imm);
2333 return;
2334 }
2335
2336 case SYMBOL_SMALL_TLSDESC:
2337 {
2338 machine_mode mode = GET_MODE (dest);
2339 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2340 rtx tp;
2341
2342 gcc_assert (mode == Pmode || mode == ptr_mode);
2343
2344 /* In ILP32, the got entry is always of SImode size. Unlike
2345 small GOT, the dest is fixed at reg 0. */
2346 if (TARGET_ILP32)
2347 emit_insn (gen_tlsdesc_small_si (imm));
2348 else
2349 emit_insn (gen_tlsdesc_small_di (imm));
2350 tp = aarch64_load_tp (NULL);
2351
2352 if (mode != Pmode)
2353 tp = gen_lowpart (mode, tp);
2354
2355 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2356 if (REG_P (dest))
2357 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2358 return;
2359 }
2360
2361 case SYMBOL_SMALL_TLSIE:
2362 {
2363 /* In ILP32, the mode of dest can be either SImode or DImode,
2364 while the got entry is always of SImode size. The mode of
2365 dest depends on how dest is used: if dest is assigned to a
2366 pointer (e.g. in memory), it has SImode; it may have
2367 DImode if dest is dereferenced to access the memory.
2368 This is why we have to handle three different tlsie_small
2369 patterns here (two patterns for ILP32). */
2370 machine_mode mode = GET_MODE (dest);
2371 rtx tmp_reg = gen_reg_rtx (mode);
2372 rtx tp = aarch64_load_tp (NULL);
2373
2374 if (mode == ptr_mode)
2375 {
2376 if (mode == DImode)
2377 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2378 else
2379 {
2380 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2381 tp = gen_lowpart (mode, tp);
2382 }
2383 }
2384 else
2385 {
2386 gcc_assert (mode == Pmode);
2387 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2388 }
2389
2390 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2391 if (REG_P (dest))
2392 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2393 return;
2394 }
2395
2396 case SYMBOL_TLSLE12:
2397 case SYMBOL_TLSLE24:
2398 case SYMBOL_TLSLE32:
2399 case SYMBOL_TLSLE48:
2400 {
2401 machine_mode mode = GET_MODE (dest);
2402 rtx tp = aarch64_load_tp (NULL);
2403
2404 if (mode != Pmode)
2405 tp = gen_lowpart (mode, tp);
2406
2407 switch (type)
2408 {
2409 case SYMBOL_TLSLE12:
2410 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2411 (dest, tp, imm));
2412 break;
2413 case SYMBOL_TLSLE24:
2414 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2415 (dest, tp, imm));
2416 break;
2417 case SYMBOL_TLSLE32:
2418 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2419 (dest, imm));
2420 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2421 (dest, dest, tp));
2422 break;
2423 case SYMBOL_TLSLE48:
2424 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2425 (dest, imm));
2426 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2427 (dest, dest, tp));
2428 break;
2429 default:
2430 gcc_unreachable ();
2431 }
2432
2433 if (REG_P (dest))
2434 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2435 return;
2436 }
2437
2438 case SYMBOL_TINY_GOT:
2439 emit_insn (gen_ldr_got_tiny (dest, imm));
2440 return;
2441
2442 case SYMBOL_TINY_TLSIE:
2443 {
2444 machine_mode mode = GET_MODE (dest);
2445 rtx tp = aarch64_load_tp (NULL);
2446
2447 if (mode == ptr_mode)
2448 {
2449 if (mode == DImode)
2450 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2451 else
2452 {
2453 tp = gen_lowpart (mode, tp);
2454 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2455 }
2456 }
2457 else
2458 {
2459 gcc_assert (mode == Pmode);
2460 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2461 }
2462
2463 if (REG_P (dest))
2464 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2465 return;
2466 }
2467
2468 default:
2469 gcc_unreachable ();
2470 }
2471 }
2472
2473 /* Emit a move from SRC to DEST. Assume that the move expanders can
2474 handle all moves if !can_create_pseudo_p (). The distinction is
2475 important because, unlike emit_move_insn, the move expanders know
2476 how to force Pmode objects into the constant pool even when the
2477 constant pool address is not itself legitimate. */
2478 static rtx
2479 aarch64_emit_move (rtx dest, rtx src)
2480 {
2481 return (can_create_pseudo_p ()
2482 ? emit_move_insn (dest, src)
2483 : emit_move_insn_1 (dest, src));
2484 }
2485
2486 /* Apply UNOPTAB to OP and store the result in DEST. */
2487
2488 static void
2489 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2490 {
2491 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2492 if (dest != tmp)
2493 emit_move_insn (dest, tmp);
2494 }
2495
2496 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2497
2498 static void
2499 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2500 {
2501 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2502 OPTAB_DIRECT);
2503 if (dest != tmp)
2504 emit_move_insn (dest, tmp);
2505 }
2506
2507 /* Split a 128-bit move operation into two 64-bit move operations,
2508 taking care to handle partial overlap of register to register
2509 copies. Special cases are needed when moving between GP regs and
2510 FP regs. SRC can be a register, constant or memory; DST a register
2511 or memory. If either operand is memory it must not have any side
2512 effects. */
2513 void
2514 aarch64_split_128bit_move (rtx dst, rtx src)
2515 {
2516 rtx dst_lo, dst_hi;
2517 rtx src_lo, src_hi;
2518
2519 machine_mode mode = GET_MODE (dst);
2520
2521 gcc_assert (mode == TImode || mode == TFmode);
2522 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2523 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2524
2525 if (REG_P (dst) && REG_P (src))
2526 {
2527 int src_regno = REGNO (src);
2528 int dst_regno = REGNO (dst);
2529
2530 /* Handle FP <-> GP regs. */
2531 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2532 {
2533 src_lo = gen_lowpart (word_mode, src);
2534 src_hi = gen_highpart (word_mode, src);
2535
2536 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2537 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2538 return;
2539 }
2540 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2541 {
2542 dst_lo = gen_lowpart (word_mode, dst);
2543 dst_hi = gen_highpart (word_mode, dst);
2544
2545 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2546 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2547 return;
2548 }
2549 }
2550
2551 dst_lo = gen_lowpart (word_mode, dst);
2552 dst_hi = gen_highpart (word_mode, dst);
2553 src_lo = gen_lowpart (word_mode, src);
2554 src_hi = gen_highpart_mode (word_mode, mode, src);
2555
2556 /* At most one pairing may overlap. */
2557 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2558 {
2559 aarch64_emit_move (dst_hi, src_hi);
2560 aarch64_emit_move (dst_lo, src_lo);
2561 }
2562 else
2563 {
2564 aarch64_emit_move (dst_lo, src_lo);
2565 aarch64_emit_move (dst_hi, src_hi);
2566 }
2567 }
2568
2569 bool
2570 aarch64_split_128bit_move_p (rtx dst, rtx src)
2571 {
2572 return (! REG_P (src)
2573 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2574 }
2575
2576 /* Split a complex SIMD combine. */
2577
2578 void
2579 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2580 {
2581 machine_mode src_mode = GET_MODE (src1);
2582 machine_mode dst_mode = GET_MODE (dst);
2583
2584 gcc_assert (VECTOR_MODE_P (dst_mode));
2585 gcc_assert (register_operand (dst, dst_mode)
2586 && register_operand (src1, src_mode)
2587 && register_operand (src2, src_mode));
2588
2589 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2590 return;
2591 }
2592
2593 /* Split a complex SIMD move. */
2594
2595 void
2596 aarch64_split_simd_move (rtx dst, rtx src)
2597 {
2598 machine_mode src_mode = GET_MODE (src);
2599 machine_mode dst_mode = GET_MODE (dst);
2600
2601 gcc_assert (VECTOR_MODE_P (dst_mode));
2602
2603 if (REG_P (dst) && REG_P (src))
2604 {
2605 gcc_assert (VECTOR_MODE_P (src_mode));
2606 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2607 }
2608 }
2609
2610 bool
2611 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2612 machine_mode ymode, rtx y)
2613 {
2614 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2615 gcc_assert (r != NULL);
2616 return rtx_equal_p (x, r);
2617 }
2618
2619 /* Return TARGET if it is nonnull and a register of mode MODE.
2620 Otherwise, return a fresh register of mode MODE if we can,
2621 or TARGET reinterpreted as MODE if we can't. */
2622
2623 static rtx
2624 aarch64_target_reg (rtx target, machine_mode mode)
2625 {
2626 if (target && REG_P (target) && GET_MODE (target) == mode)
2627 return target;
2628 if (!can_create_pseudo_p ())
2629 {
2630 gcc_assert (target);
2631 return gen_lowpart (mode, target);
2632 }
2633 return gen_reg_rtx (mode);
2634 }
2635
2636 /* Return a register that contains the constant in BUILDER, given that
2637 the constant is a legitimate move operand. Use TARGET as the register
2638 if it is nonnull and convenient. */
2639
2640 static rtx
2641 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2642 {
2643 rtx src = builder.build ();
2644 target = aarch64_target_reg (target, GET_MODE (src));
2645 emit_insn (gen_rtx_SET (target, src));
2646 return target;
2647 }
2648
2649 static rtx
2650 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2651 {
2652 if (can_create_pseudo_p ())
2653 return force_reg (mode, value);
2654 else
2655 {
2656 gcc_assert (x);
2657 aarch64_emit_move (x, value);
2658 return x;
2659 }
2660 }
2661
2662 /* Return true if predicate value X is a constant in which every element
2663 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2664 value, i.e. as a predicate in which all bits are significant. */
2665
2666 static bool
2667 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2668 {
2669 if (GET_CODE (x) != CONST_VECTOR)
2670 return false;
2671
2672 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2673 GET_MODE_NUNITS (GET_MODE (x)));
2674 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2675 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2676 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2677
2678 unsigned int nelts = const_vector_encoded_nelts (x);
2679 for (unsigned int i = 0; i < nelts; ++i)
2680 {
2681 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2682 if (!CONST_INT_P (elt))
2683 return false;
2684
2685 builder.quick_push (elt);
2686 for (unsigned int j = 1; j < factor; ++j)
2687 builder.quick_push (const0_rtx);
2688 }
2689 builder.finalize ();
2690 return true;
2691 }
2692
2693 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2694 widest predicate element size it can have (that is, the largest size
2695 for which each element would still be 0 or 1). */
2696
2697 unsigned int
2698 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2699 {
2700 /* Start with the most optimistic assumption: that we only need
2701 one bit per pattern. This is what we will use if only the first
2702 bit in each pattern is ever set. */
2703 unsigned int mask = GET_MODE_SIZE (DImode);
2704 mask |= builder.npatterns ();
2705
2706 /* Look for set bits. */
2707 unsigned int nelts = builder.encoded_nelts ();
2708 for (unsigned int i = 1; i < nelts; ++i)
2709 if (INTVAL (builder.elt (i)) != 0)
2710 {
2711 if (i & 1)
2712 return 1;
2713 mask |= i;
2714 }
2715 return mask & -mask;
2716 }
2717
2718 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2719 that the constant would have with predicate element size ELT_SIZE
2720 (ignoring the upper bits in each element) and return:
2721
2722 * -1 if all bits are set
2723 * N if the predicate has N leading set bits followed by all clear bits
2724 * 0 if the predicate does not have any of these forms. */
2725
2726 int
2727 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2728 unsigned int elt_size)
2729 {
2730 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2731 followed by set bits. */
2732 if (builder.nelts_per_pattern () == 3)
2733 return 0;
2734
2735 /* Skip over leading set bits. */
2736 unsigned int nelts = builder.encoded_nelts ();
2737 unsigned int i = 0;
2738 for (; i < nelts; i += elt_size)
2739 if (INTVAL (builder.elt (i)) == 0)
2740 break;
2741 unsigned int vl = i / elt_size;
2742
2743 /* Check for the all-true case. */
2744 if (i == nelts)
2745 return -1;
2746
2747 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2748 repeating pattern of set bits followed by clear bits. */
2749 if (builder.nelts_per_pattern () != 2)
2750 return 0;
2751
2752 /* We have a "foreground" value and a duplicated "background" value.
2753 If the background might repeat and the last set bit belongs to it,
2754 we might have set bits followed by clear bits followed by set bits. */
2755 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2756 return 0;
2757
2758 /* Make sure that the rest are all clear. */
2759 for (; i < nelts; i += elt_size)
2760 if (INTVAL (builder.elt (i)) != 0)
2761 return 0;
2762
2763 return vl;
2764 }
2765
2766 /* See if there is an svpattern that encodes an SVE predicate of mode
2767 PRED_MODE in which the first VL bits are set and the rest are clear.
2768 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2769 A VL of -1 indicates an all-true vector. */
2770
2771 aarch64_svpattern
2772 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2773 {
2774 if (vl < 0)
2775 return AARCH64_SV_ALL;
2776
2777 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2778 return AARCH64_NUM_SVPATTERNS;
2779
2780 if (vl >= 1 && vl <= 8)
2781 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2782
2783 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2784 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2785
2786 int max_vl;
2787 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2788 {
2789 if (vl == (max_vl / 3) * 3)
2790 return AARCH64_SV_MUL3;
2791 /* These would only trigger for non-power-of-2 lengths. */
2792 if (vl == (max_vl & -4))
2793 return AARCH64_SV_MUL4;
2794 if (vl == (1 << floor_log2 (max_vl)))
2795 return AARCH64_SV_POW2;
2796 if (vl == max_vl)
2797 return AARCH64_SV_ALL;
2798 }
2799 return AARCH64_NUM_SVPATTERNS;
2800 }
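
/* For example, a VL of -1 maps to AARCH64_SV_ALL.  With PRED_MODE ==
   VNx16BImode, a VL of 6 maps to AARCH64_SV_VL6 and a VL of 16 maps to
   AARCH64_SV_VL16, whereas a VL of 9 has no single pattern when the runtime
   vector length is not fixed, so AARCH64_NUM_SVPATTERNS is returned.  */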
2801
2802 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2803 bits has the lowest bit set and the upper bits clear. This is the
2804 VNx16BImode equivalent of a PTRUE for controlling elements of
2805 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2806 all bits are significant, even the upper zeros. */
2807
2808 rtx
2809 aarch64_ptrue_all (unsigned int elt_size)
2810 {
2811 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2812 builder.quick_push (const1_rtx);
2813 for (unsigned int i = 1; i < elt_size; ++i)
2814 builder.quick_push (const0_rtx);
2815 return builder.build ();
2816 }
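
/* For example, aarch64_ptrue_all (2) builds the VNx16BImode constant
   { 1, 0, 1, 0, ... }, i.e. the predicate that selects every .H element,
   while aarch64_ptrue_all (8) builds { 1, 0, 0, 0, 0, 0, 0, 0, ... },
   the predicate that selects every .D element.  */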
2817
2818 /* Return an all-true predicate register of mode MODE. */
2819
2820 rtx
2821 aarch64_ptrue_reg (machine_mode mode)
2822 {
2823 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2824 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2825 return gen_lowpart (mode, reg);
2826 }
2827
2828 /* Return an all-false predicate register of mode MODE. */
2829
2830 rtx
2831 aarch64_pfalse_reg (machine_mode mode)
2832 {
2833 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2834 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2835 return gen_lowpart (mode, reg);
2836 }
2837
2838 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2839 true, or alternatively if we know that the operation predicated by
2840 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2841 aarch64_sve_gp_strictness operand that describes the operation
2842 predicated by PRED1[0]. */
2843
2844 bool
2845 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2846 {
2847 machine_mode mode = GET_MODE (pred2);
2848 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2849 && mode == GET_MODE (pred1[0])
2850 && aarch64_sve_gp_strictness (pred1[1], SImode));
2851 return (pred1[0] == CONSTM1_RTX (mode)
2852 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2853 || rtx_equal_p (pred1[0], pred2));
2854 }
2855
2856 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2857 for it. PRED2[0] is the predicate for the instruction whose result
2858 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2859 for it. Return true if we can prove that the two predicates are
2860 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2861 with PRED1[0] without changing behavior. */
2862
2863 bool
2864 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2865 {
2866 machine_mode mode = GET_MODE (pred1[0]);
2867 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2868 && mode == GET_MODE (pred2[0])
2869 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2870 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2871
2872 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2873 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2874 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2875 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2876 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2877 }
2878
2879 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
2880 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2881 Use TARGET as the target register if nonnull and convenient. */
2882
2883 static rtx
2884 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2885 machine_mode data_mode, rtx op1, rtx op2)
2886 {
2887 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2888 expand_operand ops[5];
2889 create_output_operand (&ops[0], target, pred_mode);
2890 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2891 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2892 create_input_operand (&ops[3], op1, data_mode);
2893 create_input_operand (&ops[4], op2, data_mode);
2894 expand_insn (icode, 5, ops);
2895 return ops[0].value;
2896 }
2897
2898 /* Use a comparison to convert integer vector SRC into MODE, which is
2899 the corresponding SVE predicate mode. Use TARGET for the result
2900 if it's nonnull and convenient. */
2901
2902 static rtx
2903 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2904 {
2905 machine_mode src_mode = GET_MODE (src);
2906 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2907 src, CONST0_RTX (src_mode));
2908 }
2909
2910 /* Return true if we can move VALUE into a register using a single
2911 CNT[BHWD] instruction. */
2912
2913 static bool
2914 aarch64_sve_cnt_immediate_p (poly_int64 value)
2915 {
2916 HOST_WIDE_INT factor = value.coeffs[0];
2917 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2918 return (value.coeffs[1] == factor
2919 && IN_RANGE (factor, 2, 16 * 16)
2920 && (factor & 1) == 0
2921 && factor <= 16 * (factor & -factor));
2922 }
2923
2924 /* Likewise for rtx X. */
2925
2926 bool
2927 aarch64_sve_cnt_immediate_p (rtx x)
2928 {
2929 poly_int64 value;
2930 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2931 }
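
/* Some examples: (16, 16) (the byte count of one SVE vector) is valid and
   corresponds to CNTB; (4, 4) corresponds to CNTW; (6, 6) is also valid,
   being CNTD with MUL #3.  (3, 3) is rejected because it is odd, (34, 34)
   because it would need a multiplier greater than 16, and (2, 1) because its
   two coefficients differ and so it is not a fixed multiple of the vector
   length.  */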
2932
2933 /* Return the asm string for an instruction with a CNT-like vector size
2934 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2935 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2936 first part of the operands template (the part that comes before the
2937 vector size itself). PATTERN is the pattern to use. FACTOR is the
2938 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2939 in each quadword. If it is zero, we can use any element size. */
2940
2941 static char *
2942 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2943 aarch64_svpattern pattern,
2944 unsigned int factor,
2945 unsigned int nelts_per_vq)
2946 {
2947 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2948
2949 if (nelts_per_vq == 0)
2950 /* There is some overlap in the ranges of the four CNT instructions.
2951 Here we always use the smallest possible element size, so that the
2952 multiplier is 1 wherever possible. */
2953 nelts_per_vq = factor & -factor;
2954 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2955 gcc_assert (IN_RANGE (shift, 1, 4));
2956 char suffix = "dwhb"[shift - 1];
2957
2958 factor >>= shift;
2959 unsigned int written;
2960 if (pattern == AARCH64_SV_ALL && factor == 1)
2961 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2962 prefix, suffix, operands);
2963 else if (factor == 1)
2964 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2965 prefix, suffix, operands, svpattern_token (pattern));
2966 else
2967 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2968 prefix, suffix, operands, svpattern_token (pattern),
2969 factor);
2970 gcc_assert (written < sizeof (buffer));
2971 return buffer;
2972 }
2973
2974 /* Return the asm string for an instruction with a CNT-like vector size
2975 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2976 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2977 first part of the operands template (the part that comes before the
2978 vector size itself). X is the value of the vector size operand,
2979 as a polynomial integer rtx; we need to convert this into an "all"
2980 pattern with a multiplier. */
2981
2982 char *
2983 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2984 rtx x)
2985 {
2986 poly_int64 value = rtx_to_poly_int64 (x);
2987 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2988 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2989 value.coeffs[1], 0);
2990 }
2991
2992 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2993
2994 bool
2995 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2996 {
2997 poly_int64 value;
2998 return (poly_int_rtx_p (x, &value)
2999 && (aarch64_sve_cnt_immediate_p (value)
3000 || aarch64_sve_cnt_immediate_p (-value)));
3001 }
3002
3003 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3004 operand 0. */
3005
3006 char *
3007 aarch64_output_sve_scalar_inc_dec (rtx offset)
3008 {
3009 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3010 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3011 if (offset_value.coeffs[1] > 0)
3012 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3013 offset_value.coeffs[1], 0);
3014 else
3015 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3016 -offset_value.coeffs[1], 0);
3017 }
3018
3019 /* Return true if we can add VALUE to a register using a single ADDVL
3020 or ADDPL instruction. */
3021
3022 static bool
3023 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3024 {
3025 HOST_WIDE_INT factor = value.coeffs[0];
3026 if (factor == 0 || value.coeffs[1] != factor)
3027 return false;
3028 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3029 and a value of 16 is one vector width. */
3030 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3031 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3032 }
3033
3034 /* Likewise for rtx X. */
3035
3036 bool
3037 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3038 {
3039 poly_int64 value;
3040 return (poly_int_rtx_p (x, &value)
3041 && aarch64_sve_addvl_addpl_immediate_p (value));
3042 }
3043
3044 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3045 to operand 1 and storing the result in operand 0. */
3046
3047 char *
3048 aarch64_output_sve_addvl_addpl (rtx offset)
3049 {
3050 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3051 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3052 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3053
3054 int factor = offset_value.coeffs[1];
3055 if ((factor & 15) == 0)
3056 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3057 else
3058 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3059 return buffer;
3060 }
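
/* Example outputs: an offset of (16, 16) bytes (one full vector) gives
   "addvl\t%x0, %x1, #1", an offset of (-32, -32) gives
   "addvl\t%x0, %x1, #-2", and an offset of (6, 6) (three predicate widths)
   gives "addpl\t%x0, %x1, #3".  */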
3061
3062 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3063 instruction. If it is, store the number of elements in each vector
3064 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3065 factor in *FACTOR_OUT (if nonnull). */
3066
3067 bool
3068 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3069 unsigned int *nelts_per_vq_out)
3070 {
3071 rtx elt;
3072 poly_int64 value;
3073
3074 if (!const_vec_duplicate_p (x, &elt)
3075 || !poly_int_rtx_p (elt, &value))
3076 return false;
3077
3078 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3079 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3080 /* There's no vector INCB. */
3081 return false;
3082
3083 HOST_WIDE_INT factor = value.coeffs[0];
3084 if (value.coeffs[1] != factor)
3085 return false;
3086
3087 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3088 if ((factor % nelts_per_vq) != 0
3089 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3090 return false;
3091
3092 if (factor_out)
3093 *factor_out = factor;
3094 if (nelts_per_vq_out)
3095 *nelts_per_vq_out = nelts_per_vq;
3096 return true;
3097 }
3098
3099 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3100 instruction. */
3101
3102 bool
3103 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3104 {
3105 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3106 }
3107
3108 /* Return the asm template for an SVE vector INC or DEC instruction.
3109 OPERANDS gives the operands before the vector count and X is the
3110 value of the vector count operand itself. */
3111
3112 char *
3113 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3114 {
3115 int factor;
3116 unsigned int nelts_per_vq;
3117 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3118 gcc_unreachable ();
3119 if (factor < 0)
3120 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3121 -factor, nelts_per_vq);
3122 else
3123 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3124 factor, nelts_per_vq);
3125 }
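
/* Example outputs, assuming OPERANDS is "%0.s" or "%0.h" respectively:
   a VNx4SImode duplicate of (4, 4) (one vector's worth of .S elements)
   gives "incw\t%0.s", while a VNx8HImode duplicate of (-8, -8) gives
   "dech\t%0.h".  */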
3126
3127 static int
3128 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3129 scalar_int_mode mode)
3130 {
3131 int i;
3132 unsigned HOST_WIDE_INT val, val2, mask;
3133 int one_match, zero_match;
3134 int num_insns;
3135
3136 val = INTVAL (imm);
3137
3138 if (aarch64_move_imm (val, mode))
3139 {
3140 if (generate)
3141 emit_insn (gen_rtx_SET (dest, imm));
3142 return 1;
3143 }
3144
3145 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3146 (with XXXX non-zero). In that case check to see if the move can be done in
3147 a smaller mode. */
3148 val2 = val & 0xffffffff;
3149 if (mode == DImode
3150 && aarch64_move_imm (val2, SImode)
3151 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3152 {
3153 if (generate)
3154 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3155
3156 /* Check if we have to emit a second instruction by checking to see
3157 if any of the upper 32 bits of the original DI mode value is set. */
3158 if (val == val2)
3159 return 1;
3160
3161 i = (val >> 48) ? 48 : 32;
3162
3163 if (generate)
3164 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3165 GEN_INT ((val >> i) & 0xffff)));
3166
3167 return 2;
3168 }
3169
3170 if ((val >> 32) == 0 || mode == SImode)
3171 {
3172 if (generate)
3173 {
3174 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3175 if (mode == SImode)
3176 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3177 GEN_INT ((val >> 16) & 0xffff)));
3178 else
3179 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3180 GEN_INT ((val >> 16) & 0xffff)));
3181 }
3182 return 2;
3183 }
3184
3185 /* Remaining cases are all for DImode. */
3186
3187 mask = 0xffff;
3188 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3189 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3190 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3191 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3192
3193 if (zero_match != 2 && one_match != 2)
3194 {
3195 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3196 For a 64-bit bitmask try whether changing 16 bits to all ones or
3197 zeroes creates a valid bitmask. To check any repeated bitmask,
3198 try using 16 bits from the other 32-bit half of val. */
3199
3200 for (i = 0; i < 64; i += 16, mask <<= 16)
3201 {
3202 val2 = val & ~mask;
3203 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3204 break;
3205 val2 = val | mask;
3206 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3207 break;
3208 val2 = val2 & ~mask;
3209 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3210 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3211 break;
3212 }
3213 if (i != 64)
3214 {
3215 if (generate)
3216 {
3217 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3218 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3219 GEN_INT ((val >> i) & 0xffff)));
3220 }
3221 return 2;
3222 }
3223 }
3224
3225 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3226 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3227 otherwise skip zero bits. */
3228
3229 num_insns = 1;
3230 mask = 0xffff;
3231 val2 = one_match > zero_match ? ~val : val;
3232 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3233
3234 if (generate)
3235 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3236 ? (val | ~(mask << i))
3237 : (val & (mask << i)))));
3238 for (i += 16; i < 64; i += 16)
3239 {
3240 if ((val2 & (mask << i)) == 0)
3241 continue;
3242 if (generate)
3243 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3244 GEN_INT ((val >> i) & 0xffff)));
3245 num_insns ++;
3246 }
3247
3248 return num_insns;
3249 }
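
/* A few examples of the DImode costs this computes: 0x1234 takes a single
   MOVZ and 0xffffffffffff1234 a single MOVN, so both return 1; 0x12345678
   has zero upper bits, so it is built with MOVZ #0x5678 followed by
   MOVK #0x1234, LSL #16 and returns 2.  */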
3250
3251 /* Return whether imm is a 128-bit immediate which is simple enough to
3252 expand inline. */
3253 bool
3254 aarch64_mov128_immediate (rtx imm)
3255 {
3256 if (GET_CODE (imm) == CONST_INT)
3257 return true;
3258
3259 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3260
3261 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3262 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3263
3264 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3265 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3266 }
3267
3268
3269 /* Return the number of temporary registers that aarch64_add_offset_1
3270 would need to add OFFSET to a register. */
3271
3272 static unsigned int
3273 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3274 {
3275 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3276 }
3277
3278 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3279 a non-polynomial OFFSET. MODE is the mode of the addition.
3280 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3281 be set and CFA adjustments added to the generated instructions.
3282
3283 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3284 temporary if register allocation is already complete. This temporary
3285 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3286 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3287 the immediate again.
3288
3289 Since this function may be used to adjust the stack pointer, we must
3290 ensure that it cannot cause transient stack deallocation (for example
3291 by first incrementing SP and then decrementing when adjusting by a
3292 large immediate). */
3293
3294 static void
3295 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3296 rtx src, HOST_WIDE_INT offset, rtx temp1,
3297 bool frame_related_p, bool emit_move_imm)
3298 {
3299 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3300 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3301
3302 HOST_WIDE_INT moffset = abs_hwi (offset);
3303 rtx_insn *insn;
3304
3305 if (!moffset)
3306 {
3307 if (!rtx_equal_p (dest, src))
3308 {
3309 insn = emit_insn (gen_rtx_SET (dest, src));
3310 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3311 }
3312 return;
3313 }
3314
3315 /* Single instruction adjustment. */
3316 if (aarch64_uimm12_shift (moffset))
3317 {
3318 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3319 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3320 return;
3321 }
3322
3323 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3324 and either:
3325
3326 a) the offset cannot be loaded by a 16-bit move or
3327 b) there is no spare register into which we can move it. */
3328 if (moffset < 0x1000000
3329 && ((!temp1 && !can_create_pseudo_p ())
3330 || !aarch64_move_imm (moffset, mode)))
3331 {
3332 HOST_WIDE_INT low_off = moffset & 0xfff;
3333
3334 low_off = offset < 0 ? -low_off : low_off;
3335 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3336 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3337 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3338 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3339 return;
3340 }
3341
3342 /* Emit a move immediate if required and an addition/subtraction. */
3343 if (emit_move_imm)
3344 {
3345 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3346 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3347 }
3348 insn = emit_insn (offset < 0
3349 ? gen_sub3_insn (dest, src, temp1)
3350 : gen_add3_insn (dest, src, temp1));
3351 if (frame_related_p)
3352 {
3353 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3354 rtx adj = plus_constant (mode, src, offset);
3355 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3356 }
3357 }
3358
3359 /* Return the number of temporary registers that aarch64_add_offset
3360 would need to move OFFSET into a register or add OFFSET to a register;
3361 ADD_P is true if we want the latter rather than the former. */
3362
3363 static unsigned int
3364 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3365 {
3366 /* This follows the same structure as aarch64_add_offset. */
3367 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3368 return 0;
3369
3370 unsigned int count = 0;
3371 HOST_WIDE_INT factor = offset.coeffs[1];
3372 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3373 poly_int64 poly_offset (factor, factor);
3374 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3375 /* Need one register for the ADDVL/ADDPL result. */
3376 count += 1;
3377 else if (factor != 0)
3378 {
3379 factor = abs (factor);
3380 if (factor > 16 * (factor & -factor))
3381 /* Need one register for the CNT result and one for the multiplication
3382 factor. If necessary, the second temporary can be reused for the
3383 constant part of the offset. */
3384 return 2;
3385 /* Need one register for the CNT result (which might then
3386 be shifted). */
3387 count += 1;
3388 }
3389 return count + aarch64_add_offset_1_temporaries (constant);
3390 }
3391
3392 /* If X can be represented as a poly_int64, return the number
3393 of temporaries that are required to add it to a register.
3394 Return -1 otherwise. */
3395
3396 int
3397 aarch64_add_offset_temporaries (rtx x)
3398 {
3399 poly_int64 offset;
3400 if (!poly_int_rtx_p (x, &offset))
3401 return -1;
3402 return aarch64_offset_temporaries (true, offset);
3403 }
3404
3405 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3406 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3407 be set and CFA adjustments added to the generated instructions.
3408
3409 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3410 temporary if register allocation is already complete. This temporary
3411 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3412 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3413 false to avoid emitting the immediate again.
3414
3415 TEMP2, if nonnull, is a second temporary register that doesn't
3416 overlap either DEST or SRC.
3417
3418 Since this function may be used to adjust the stack pointer, we must
3419 ensure that it cannot cause transient stack deallocation (for example
3420 by first incrementing SP and then decrementing when adjusting by a
3421 large immediate). */
3422
3423 static void
3424 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3425 poly_int64 offset, rtx temp1, rtx temp2,
3426 bool frame_related_p, bool emit_move_imm = true)
3427 {
3428 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3429 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3430 gcc_assert (temp1 == NULL_RTX
3431 || !frame_related_p
3432 || !reg_overlap_mentioned_p (temp1, dest));
3433 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3434
3435 /* Try using ADDVL or ADDPL to add the whole value. */
3436 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3437 {
3438 rtx offset_rtx = gen_int_mode (offset, mode);
3439 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3440 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3441 return;
3442 }
3443
3444 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3445 SVE vector register, over and above the minimum size of 128 bits.
3446 This is equivalent to half the value returned by CNTD with a
3447 vector shape of ALL. */
3448 HOST_WIDE_INT factor = offset.coeffs[1];
3449 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3450
3451 /* Try using ADDVL or ADDPL to add the VG-based part. */
3452 poly_int64 poly_offset (factor, factor);
3453 if (src != const0_rtx
3454 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3455 {
3456 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3457 if (frame_related_p)
3458 {
3459 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3460 RTX_FRAME_RELATED_P (insn) = true;
3461 src = dest;
3462 }
3463 else
3464 {
3465 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3466 src = aarch64_force_temporary (mode, temp1, addr);
3467 temp1 = temp2;
3468 temp2 = NULL_RTX;
3469 }
3470 }
3471 /* Otherwise use a CNT-based sequence. */
3472 else if (factor != 0)
3473 {
3474 /* Use a subtraction if we have a negative factor. */
3475 rtx_code code = PLUS;
3476 if (factor < 0)
3477 {
3478 factor = -factor;
3479 code = MINUS;
3480 }
3481
3482 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3483 into the multiplication. */
3484 rtx val;
3485 int shift = 0;
3486 if (factor & 1)
3487 /* Use a right shift by 1. */
3488 shift = -1;
3489 else
3490 factor /= 2;
3491 HOST_WIDE_INT low_bit = factor & -factor;
3492 if (factor <= 16 * low_bit)
3493 {
3494 if (factor > 16 * 8)
3495 {
3496 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3497 the value with the minimum multiplier and shift it into
3498 position. */
3499 int extra_shift = exact_log2 (low_bit);
3500 shift += extra_shift;
3501 factor >>= extra_shift;
3502 }
3503 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3504 }
3505 else
3506 {
3507 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3508 directly, since that should increase the chances of being
3509 able to use a shift and add sequence. If LOW_BIT itself
3510 is out of range, just use CNTD. */
3511 if (low_bit <= 16 * 8)
3512 factor /= low_bit;
3513 else
3514 low_bit = 1;
3515
3516 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3517 val = aarch64_force_temporary (mode, temp1, val);
3518
3519 if (can_create_pseudo_p ())
3520 {
3521 rtx coeff1 = gen_int_mode (factor, mode);
3522 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3523 }
3524 else
3525 {
3526 /* Go back to using a negative multiplication factor if we have
3527 no register from which to subtract. */
3528 if (code == MINUS && src == const0_rtx)
3529 {
3530 factor = -factor;
3531 code = PLUS;
3532 }
3533 rtx coeff1 = gen_int_mode (factor, mode);
3534 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3535 val = gen_rtx_MULT (mode, val, coeff1);
3536 }
3537 }
3538
3539 if (shift > 0)
3540 {
3541 /* Multiply by 1 << SHIFT. */
3542 val = aarch64_force_temporary (mode, temp1, val);
3543 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3544 }
3545 else if (shift == -1)
3546 {
3547 /* Divide by 2. */
3548 val = aarch64_force_temporary (mode, temp1, val);
3549 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3550 }
3551
3552 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3553 if (src != const0_rtx)
3554 {
3555 val = aarch64_force_temporary (mode, temp1, val);
3556 val = gen_rtx_fmt_ee (code, mode, src, val);
3557 }
3558 else if (code == MINUS)
3559 {
3560 val = aarch64_force_temporary (mode, temp1, val);
3561 val = gen_rtx_NEG (mode, val);
3562 }
3563
3564 if (constant == 0 || frame_related_p)
3565 {
3566 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3567 if (frame_related_p)
3568 {
3569 RTX_FRAME_RELATED_P (insn) = true;
3570 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3571 gen_rtx_SET (dest, plus_constant (Pmode, src,
3572 poly_offset)));
3573 }
3574 src = dest;
3575 if (constant == 0)
3576 return;
3577 }
3578 else
3579 {
3580 src = aarch64_force_temporary (mode, temp1, val);
3581 temp1 = temp2;
3582 temp2 = NULL_RTX;
3583 }
3584
3585 emit_move_imm = true;
3586 }
3587
3588 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3589 frame_related_p, emit_move_imm);
3590 }
3591
3592 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3593 than a poly_int64. */
3594
3595 void
3596 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3597 rtx offset_rtx, rtx temp1, rtx temp2)
3598 {
3599 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3600 temp1, temp2, false);
3601 }
3602
3603 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3604 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3605 if TEMP1 already contains abs (DELTA). */
3606
3607 static inline void
3608 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3609 {
3610 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3611 temp1, temp2, true, emit_move_imm);
3612 }
3613
3614 /* Subtract DELTA from the stack pointer, marking the instructions
3615 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3616 if nonnull. */
3617
3618 static inline void
3619 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3620 bool emit_move_imm = true)
3621 {
3622 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3623 temp1, temp2, frame_related_p, emit_move_imm);
3624 }
3625
3626 /* Set DEST to (vec_series BASE STEP). */
3627
3628 static void
3629 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3630 {
3631 machine_mode mode = GET_MODE (dest);
3632 scalar_mode inner = GET_MODE_INNER (mode);
3633
3634 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3635 if (!aarch64_sve_index_immediate_p (base))
3636 base = force_reg (inner, base);
3637 if (!aarch64_sve_index_immediate_p (step))
3638 step = force_reg (inner, step);
3639
3640 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3641 }
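/* For example (an editor's sketch, not part of the original sources):
   calling aarch64_expand_vec_series with BASE == 0 and STEP == 1
   creates a VEC_SERIES rtx whose operands are within the immediate
   range above, which the SVE move patterns would typically emit as an
   INDEX instruction such as "index z0.s, #0, #1"; values outside
   [-16, 15] are forced into scalar registers first and use the
   register forms of INDEX instead.  */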
3642
3643 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3644 register of mode MODE. Use TARGET for the result if it's nonnull
3645 and convenient.
3646
3647 The two vector modes must have the same element mode. The behavior
3648 is to duplicate architectural lane N of SRC into architectural lanes
3649 N + I * STEP of the result. On big-endian targets, architectural
3650 lane 0 of an Advanced SIMD vector is the last element of the vector
3651 in memory layout, so for big-endian targets this operation has the
3652 effect of reversing SRC before duplicating it. Callers need to
3653 account for this. */
3654
3655 rtx
3656 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3657 {
3658 machine_mode src_mode = GET_MODE (src);
3659 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3660 insn_code icode = (BYTES_BIG_ENDIAN
3661 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3662 : code_for_aarch64_vec_duplicate_vq_le (mode));
3663
3664 unsigned int i = 0;
3665 expand_operand ops[3];
3666 create_output_operand (&ops[i++], target, mode);
3667   create_input_operand (&ops[i++], src, src_mode);
3668 if (BYTES_BIG_ENDIAN)
3669 {
3670 /* Create a PARALLEL describing the reversal of SRC. */
3671 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3672 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3673 nelts_per_vq - 1, -1);
3674 create_fixed_operand (&ops[i++], sel);
3675 }
3676 expand_insn (icode, i, ops);
3677 return ops[0].value;
3678 }
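/* Editor's illustration: for MODE == VNx4SImode (.S elements),
   NELTS_PER_VQ above is 128 / 32 == 4, so on big-endian targets the
   extra selector operand is the stepped parallel { 3, 2, 1, 0 },
   i.e. the quadword is reversed before being duplicated, as described
   in the function comment.  */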
3679
3680 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3681 the memory image into DEST. Return true on success. */
3682
3683 static bool
3684 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3685 {
3686 src = force_const_mem (GET_MODE (src), src);
3687 if (!src)
3688 return false;
3689
3690 /* Make sure that the address is legitimate. */
3691 if (!aarch64_sve_ld1rq_operand_p (src))
3692 {
3693 rtx addr = force_reg (Pmode, XEXP (src, 0));
3694 src = replace_equiv_address (src, addr);
3695 }
3696
3697 machine_mode mode = GET_MODE (dest);
3698 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3699 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3700 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3701 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3702 return true;
3703 }
3704
3705 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3706 SVE data mode and isn't a legitimate constant. Use TARGET for the
3707 result if convenient.
3708
3709 The returned register can have whatever mode seems most natural
3710 given the contents of SRC. */
3711
3712 static rtx
3713 aarch64_expand_sve_const_vector (rtx target, rtx src)
3714 {
3715 machine_mode mode = GET_MODE (src);
3716 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3717 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3718 scalar_mode elt_mode = GET_MODE_INNER (mode);
3719 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3720 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3721
3722 if (nelts_per_pattern == 1 && encoded_bits == 128)
3723 {
3724 /* The constant is a duplicated quadword but can't be narrowed
3725 beyond a quadword. Get the memory image of the first quadword
3726 as a 128-bit vector and try using LD1RQ to load it from memory.
3727
3728 The effect for both endiannesses is to load memory lane N into
3729 architectural lanes N + I * STEP of the result. On big-endian
3730 targets, the layout of the 128-bit vector in an Advanced SIMD
3731 register would be different from its layout in an SVE register,
3732 but this 128-bit vector is a memory value only. */
3733 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3734 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3735 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3736 return target;
3737 }
3738
3739 if (nelts_per_pattern == 1 && encoded_bits < 128)
3740 {
3741 /* The vector is a repeating sequence of 64 bits or fewer.
3742 See if we can load them using an Advanced SIMD move and then
3743 duplicate it to fill a vector. This is better than using a GPR
3744 move because it keeps everything in the same register file. */
3745 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3746 rtx_vector_builder builder (vq_mode, npatterns, 1);
3747 for (unsigned int i = 0; i < npatterns; ++i)
3748 {
3749 /* We want memory lane N to go into architectural lane N,
3750 so reverse for big-endian targets. The DUP .Q pattern
3751 has a compensating reverse built-in. */
3752 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3753 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3754 }
3755 rtx vq_src = builder.build ();
3756 if (aarch64_simd_valid_immediate (vq_src, NULL))
3757 {
3758 vq_src = force_reg (vq_mode, vq_src);
3759 return aarch64_expand_sve_dupq (target, mode, vq_src);
3760 }
3761
3762 /* Get an integer representation of the repeating part of Advanced
3763 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3764 which for big-endian targets is lane-swapped wrt a normal
3765 Advanced SIMD vector. This means that for both endiannesses,
3766 memory lane N of SVE vector SRC corresponds to architectural
3767 lane N of a register holding VQ_SRC. This in turn means that
3768 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3769 as a single 128-bit value) and thus that memory lane 0 of SRC is
3770 in the lsb of the integer. Duplicating the integer therefore
3771 ensures that memory lane N of SRC goes into architectural lane
3772 	 N + I * STEP of the SVE register.  */
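      /* Editor's example of the path below: four single-element QImode
	 patterns give ENCODED_BITS == 32, so INT_MODE is SImode and MODE
	 becomes the SVE mode with .S elements; if the resulting 32-bit
	 constant is a single-instruction move immediate it is loaded into
	 a general register and broadcast, duplicating the original 4-byte
	 sequence across the whole vector.  */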
3773 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3774 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3775 if (elt_value)
3776 {
3777 /* Pretend that we had a vector of INT_MODE to start with. */
3778 elt_mode = int_mode;
3779 mode = aarch64_full_sve_mode (int_mode).require ();
3780
3781 /* If the integer can be moved into a general register by a
3782 single instruction, do that and duplicate the result. */
3783 if (CONST_INT_P (elt_value)
3784 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3785 {
3786 elt_value = force_reg (elt_mode, elt_value);
3787 return expand_vector_broadcast (mode, elt_value);
3788 }
3789 }
3790 else if (npatterns == 1)
3791 /* We're duplicating a single value, but can't do better than
3792 force it to memory and load from there. This handles things
3793 like symbolic constants. */
3794 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3795
3796 if (elt_value)
3797 {
3798 /* Load the element from memory if we can, otherwise move it into
3799 a register and use a DUP. */
3800 rtx op = force_const_mem (elt_mode, elt_value);
3801 if (!op)
3802 op = force_reg (elt_mode, elt_value);
3803 return expand_vector_broadcast (mode, op);
3804 }
3805 }
3806
3807 /* Try using INDEX. */
3808 rtx base, step;
3809 if (const_vec_series_p (src, &base, &step))
3810 {
3811 aarch64_expand_vec_series (target, base, step);
3812 return target;
3813 }
3814
3815 /* From here on, it's better to force the whole constant to memory
3816 if we can. */
3817 if (GET_MODE_NUNITS (mode).is_constant ())
3818 return NULL_RTX;
3819
3820 /* Expand each pattern individually. */
3821 gcc_assert (npatterns > 1);
3822 rtx_vector_builder builder;
3823 auto_vec<rtx, 16> vectors (npatterns);
3824 for (unsigned int i = 0; i < npatterns; ++i)
3825 {
3826 builder.new_vector (mode, 1, nelts_per_pattern);
3827 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3828 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3829 vectors.quick_push (force_reg (mode, builder.build ()));
3830 }
3831
3832 /* Use permutes to interleave the separate vectors. */
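  /* Editor's illustration: with NPATTERNS == 4 and inputs V0..V3, the
     first pass produces ZIP1 (V0, V2) and ZIP1 (V1, V3), and the second
     pass combines those with a final ZIP1 into TARGET, so that the
     elements conceptually come back in the order V0[0], V1[0], V2[0],
     V3[0], V0[1], ...  */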
3833 while (npatterns > 1)
3834 {
3835 npatterns /= 2;
3836 for (unsigned int i = 0; i < npatterns; ++i)
3837 {
3838 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3839 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3840 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3841 vectors[i] = tmp;
3842 }
3843 }
3844 gcc_assert (vectors[0] == target);
3845 return target;
3846 }
3847
3848 /* Use WHILE to set a predicate register of mode MODE in which the first
3849 VL bits are set and the rest are clear. Use TARGET for the register
3850 if it's nonnull and convenient. */
3851
3852 static rtx
3853 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3854 unsigned int vl)
3855 {
3856 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3857 target = aarch64_target_reg (target, mode);
3858 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3859 return target;
3860 }
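/* For instance (editor's sketch): a VNx8BImode constant with only its
   first three .H elements set can be loaded by passing VL == 3, which
   would typically assemble to something like
     mov	x0, 3
     whilelo	p0.h, xzr, x0
   with whatever registers the allocator chooses.  */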
3861
3862 static rtx
3863 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3864
3865 /* BUILDER is a constant predicate in which the index of every set bit
3866 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3867 by inverting every element at a multiple of ELT_SIZE and EORing the
3868 result with an ELT_SIZE PTRUE.
3869
3870 Return a register that contains the constant on success, otherwise
3871 return null. Use TARGET as the register if it is nonnull and
3872 convenient. */
3873
3874 static rtx
3875 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3876 unsigned int elt_size)
3877 {
3878 /* Invert every element at a multiple of ELT_SIZE, keeping the
3879 other bits zero. */
3880 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3881 builder.nelts_per_pattern ());
3882 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3883 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3884 inv_builder.quick_push (const1_rtx);
3885 else
3886 inv_builder.quick_push (const0_rtx);
3887 inv_builder.finalize ();
3888
3889 /* See if we can load the constant cheaply. */
3890 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3891 if (!inv)
3892 return NULL_RTX;
3893
3894 /* EOR the result with an ELT_SIZE PTRUE. */
3895 rtx mask = aarch64_ptrue_all (elt_size);
3896 mask = force_reg (VNx16BImode, mask);
3897 target = aarch64_target_reg (target, VNx16BImode);
3898 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3899 return target;
3900 }
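/* Editor's illustration of the EOR approach above: for the .H predicate
   in which every halfword element except the first is set, the inverted
   constant has only element 0 set and can be loaded with a single
   PTRUE (pattern VL1); EORing that with a .H PTRUE ALL under the same
   mask recreates the original constant.  */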
3901
3902 /* BUILDER is a constant predicate in which the index of every set bit
3903 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3904 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3905 register on success, otherwise return null. Use TARGET as the register
3906 if nonnull and convenient. */
3907
3908 static rtx
3909 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3910 unsigned int elt_size,
3911 unsigned int permute_size)
3912 {
3913 /* We're going to split the constant into two new constants A and B,
3914 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3915 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3916
3917 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3918 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3919
3920 where _ indicates elements that will be discarded by the permute.
3921
3922 First calculate the ELT_SIZEs for A and B. */
3923 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3924 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3925 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3926 if (INTVAL (builder.elt (i)) != 0)
3927 {
3928 if (i & permute_size)
3929 b_elt_size |= i - permute_size;
3930 else
3931 a_elt_size |= i;
3932 }
3933 a_elt_size &= -a_elt_size;
3934 b_elt_size &= -b_elt_size;
3935
3936 /* Now construct the vectors themselves. */
3937 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3938 builder.nelts_per_pattern ());
3939 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3940 builder.nelts_per_pattern ());
3941 unsigned int nelts = builder.encoded_nelts ();
3942 for (unsigned int i = 0; i < nelts; ++i)
3943 if (i & (elt_size - 1))
3944 {
3945 a_builder.quick_push (const0_rtx);
3946 b_builder.quick_push (const0_rtx);
3947 }
3948 else if ((i & permute_size) == 0)
3949 {
3950 /* The A and B elements are significant. */
3951 a_builder.quick_push (builder.elt (i));
3952 b_builder.quick_push (builder.elt (i + permute_size));
3953 }
3954 else
3955 {
3956 /* The A and B elements are going to be discarded, so pick whatever
3957 is likely to give a nice constant. We are targeting element
3958 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3959 with the aim of each being a sequence of ones followed by
3960 a sequence of zeros. So:
3961
3962 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3963 duplicate the last X_ELT_SIZE element, to extend the
3964 current sequence of ones or zeros.
3965
3966 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3967 zero, so that the constant really does have X_ELT_SIZE and
3968 not a smaller size. */
3969 if (a_elt_size > permute_size)
3970 a_builder.quick_push (const0_rtx);
3971 else
3972 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3973 if (b_elt_size > permute_size)
3974 b_builder.quick_push (const0_rtx);
3975 else
3976 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3977 }
3978 a_builder.finalize ();
3979 b_builder.finalize ();
3980
3981 /* Try loading A into a register. */
3982 rtx_insn *last = get_last_insn ();
3983 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3984 if (!a)
3985 return NULL_RTX;
3986
3987 /* Try loading B into a register. */
3988 rtx b = a;
3989 if (a_builder != b_builder)
3990 {
3991 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3992 if (!b)
3993 {
3994 delete_insns_since (last);
3995 return NULL_RTX;
3996 }
3997 }
3998
3999 /* Emit the TRN1 itself. */
4000 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4001 target = aarch64_target_reg (target, mode);
4002 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4003 gen_lowpart (mode, a),
4004 gen_lowpart (mode, b)));
4005 return target;
4006 }
4007
4008 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4009 constant in BUILDER into an SVE predicate register. Return the register
4010 on success, otherwise return null. Use TARGET for the register if
4011 nonnull and convenient.
4012
4013 ALLOW_RECURSE_P is true if we can use methods that would call this
4014 function recursively. */
4015
4016 static rtx
4017 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4018 bool allow_recurse_p)
4019 {
4020 if (builder.encoded_nelts () == 1)
4021 /* A PFALSE or a PTRUE .B ALL. */
4022 return aarch64_emit_set_immediate (target, builder);
4023
4024 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4025 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4026 {
4027 /* If we can load the constant using PTRUE, use it as-is. */
4028 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4029 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4030 return aarch64_emit_set_immediate (target, builder);
4031
4032 /* Otherwise use WHILE to set the first VL bits. */
4033 return aarch64_sve_move_pred_via_while (target, mode, vl);
4034 }
4035
4036 if (!allow_recurse_p)
4037 return NULL_RTX;
4038
4039 /* Try inverting the vector in element size ELT_SIZE and then EORing
4040 the result with an ELT_SIZE PTRUE. */
4041 if (INTVAL (builder.elt (0)) == 0)
4042 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4043 elt_size))
4044 return res;
4045
4046 /* Try using TRN1 to permute two simpler constants. */
4047 for (unsigned int i = elt_size; i <= 8; i *= 2)
4048 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4049 elt_size, i))
4050 return res;
4051
4052 return NULL_RTX;
4053 }
4054
4055 /* Return an SVE predicate register that contains the VNx16BImode
4056 constant in BUILDER, without going through the move expanders.
4057
4058 The returned register can have whatever mode seems most natural
4059 given the contents of BUILDER. Use TARGET for the result if
4060 convenient. */
4061
4062 static rtx
4063 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4064 {
4065 /* Try loading the constant using pure predicate operations. */
4066 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4067 return res;
4068
4069 /* Try forcing the constant to memory. */
4070 if (builder.full_nelts ().is_constant ())
4071 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4072 {
4073 target = aarch64_target_reg (target, VNx16BImode);
4074 emit_move_insn (target, mem);
4075 return target;
4076 }
4077
4078 /* The last resort is to load the constant as an integer and then
4079 compare it against zero. Use -1 for set bits in order to increase
4080      the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
4081 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4082 builder.nelts_per_pattern ());
4083 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4084 int_builder.quick_push (INTVAL (builder.elt (i))
4085 ? constm1_rtx : const0_rtx);
4086 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4087 int_builder.build ());
4088 }
4089
4090 /* Set DEST to immediate IMM. */
4091
4092 void
4093 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4094 {
4095 machine_mode mode = GET_MODE (dest);
4096
4097 /* Check on what type of symbol it is. */
4098 scalar_int_mode int_mode;
4099 if ((GET_CODE (imm) == SYMBOL_REF
4100 || GET_CODE (imm) == LABEL_REF
4101 || GET_CODE (imm) == CONST
4102 || GET_CODE (imm) == CONST_POLY_INT)
4103 && is_a <scalar_int_mode> (mode, &int_mode))
4104 {
4105 rtx mem;
4106 poly_int64 offset;
4107 HOST_WIDE_INT const_offset;
4108 enum aarch64_symbol_type sty;
4109
4110 /* If we have (const (plus symbol offset)), separate out the offset
4111 before we start classifying the symbol. */
4112 rtx base = strip_offset (imm, &offset);
4113
4114 /* We must always add an offset involving VL separately, rather than
4115 folding it into the relocation. */
4116 if (!offset.is_constant (&const_offset))
4117 {
4118 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4119 emit_insn (gen_rtx_SET (dest, imm));
4120 else
4121 {
4122 /* Do arithmetic on 32-bit values if the result is smaller
4123 than that. */
4124 if (partial_subreg_p (int_mode, SImode))
4125 {
4126 /* It is invalid to do symbol calculations in modes
4127 narrower than SImode. */
4128 gcc_assert (base == const0_rtx);
4129 dest = gen_lowpart (SImode, dest);
4130 int_mode = SImode;
4131 }
4132 if (base != const0_rtx)
4133 {
4134 base = aarch64_force_temporary (int_mode, dest, base);
4135 aarch64_add_offset (int_mode, dest, base, offset,
4136 NULL_RTX, NULL_RTX, false);
4137 }
4138 else
4139 aarch64_add_offset (int_mode, dest, base, offset,
4140 dest, NULL_RTX, false);
4141 }
4142 return;
4143 }
4144
4145 sty = aarch64_classify_symbol (base, const_offset);
4146 switch (sty)
4147 {
4148 case SYMBOL_FORCE_TO_MEM:
4149 if (const_offset != 0
4150 && targetm.cannot_force_const_mem (int_mode, imm))
4151 {
4152 gcc_assert (can_create_pseudo_p ());
4153 base = aarch64_force_temporary (int_mode, dest, base);
4154 aarch64_add_offset (int_mode, dest, base, const_offset,
4155 NULL_RTX, NULL_RTX, false);
4156 return;
4157 }
4158
4159 mem = force_const_mem (ptr_mode, imm);
4160 gcc_assert (mem);
4161
4162 /* If we aren't generating PC relative literals, then
4163 we need to expand the literal pool access carefully.
4164 This is something that needs to be done in a number
4165 of places, so could well live as a separate function. */
4166 if (!aarch64_pcrelative_literal_loads)
4167 {
4168 gcc_assert (can_create_pseudo_p ());
4169 base = gen_reg_rtx (ptr_mode);
4170 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4171 if (ptr_mode != Pmode)
4172 base = convert_memory_address (Pmode, base);
4173 mem = gen_rtx_MEM (ptr_mode, base);
4174 }
4175
4176 if (int_mode != ptr_mode)
4177 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4178
4179 emit_insn (gen_rtx_SET (dest, mem));
4180
4181 return;
4182
4183 case SYMBOL_SMALL_TLSGD:
4184 case SYMBOL_SMALL_TLSDESC:
4185 case SYMBOL_SMALL_TLSIE:
4186 case SYMBOL_SMALL_GOT_28K:
4187 case SYMBOL_SMALL_GOT_4G:
4188 case SYMBOL_TINY_GOT:
4189 case SYMBOL_TINY_TLSIE:
4190 if (const_offset != 0)
4191 {
4192 	    gcc_assert (can_create_pseudo_p ());
4193 base = aarch64_force_temporary (int_mode, dest, base);
4194 aarch64_add_offset (int_mode, dest, base, const_offset,
4195 NULL_RTX, NULL_RTX, false);
4196 return;
4197 }
4198 /* FALLTHRU */
4199
4200 case SYMBOL_SMALL_ABSOLUTE:
4201 case SYMBOL_TINY_ABSOLUTE:
4202 case SYMBOL_TLSLE12:
4203 case SYMBOL_TLSLE24:
4204 case SYMBOL_TLSLE32:
4205 case SYMBOL_TLSLE48:
4206 aarch64_load_symref_appropriately (dest, imm, sty);
4207 return;
4208
4209 default:
4210 gcc_unreachable ();
4211 }
4212 }
4213
4214 if (!CONST_INT_P (imm))
4215 {
4216 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4217 {
4218 /* Only the low bit of each .H, .S and .D element is defined,
4219 so we can set the upper bits to whatever we like. If the
4220 predicate is all-true in MODE, prefer to set all the undefined
4221 bits as well, so that we can share a single .B predicate for
4222 all modes. */
4223 if (imm == CONSTM1_RTX (mode))
4224 imm = CONSTM1_RTX (VNx16BImode);
4225
4226 /* All methods for constructing predicate modes wider than VNx16BI
4227 will set the upper bits of each element to zero. Expose this
4228 by moving such constants as a VNx16BI, so that all bits are
4229 significant and so that constants for different modes can be
4230 shared. The wider constant will still be available as a
4231 REG_EQUAL note. */
4232 rtx_vector_builder builder;
4233 if (aarch64_get_sve_pred_bits (builder, imm))
4234 {
4235 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4236 if (dest != res)
4237 emit_move_insn (dest, gen_lowpart (mode, res));
4238 return;
4239 }
4240 }
4241
4242 if (GET_CODE (imm) == HIGH
4243 || aarch64_simd_valid_immediate (imm, NULL))
4244 {
4245 emit_insn (gen_rtx_SET (dest, imm));
4246 return;
4247 }
4248
4249 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4250 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4251 {
4252 if (dest != res)
4253 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4254 return;
4255 }
4256
4257 rtx mem = force_const_mem (mode, imm);
4258 gcc_assert (mem);
4259 emit_move_insn (dest, mem);
4260 return;
4261 }
4262
4263 aarch64_internal_mov_immediate (dest, imm, true,
4264 as_a <scalar_int_mode> (mode));
4265 }
4266
4267 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4268 that is known to contain PTRUE. */
4269
4270 void
4271 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4272 {
4273 expand_operand ops[3];
4274 machine_mode mode = GET_MODE (dest);
4275 create_output_operand (&ops[0], dest, mode);
4276   create_input_operand (&ops[1], pred, GET_MODE (pred));
4277 create_input_operand (&ops[2], src, mode);
4278 temporary_volatile_ok v (true);
4279 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4280 }
4281
4282 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4283 operand is in memory. In this case we need to use the predicated LD1
4284 and ST1 instead of LDR and STR, both for correctness on big-endian
4285 targets and because LD1 and ST1 support a wider range of addressing modes.
4286 PRED_MODE is the mode of the predicate.
4287
4288 See the comment at the head of aarch64-sve.md for details about the
4289 big-endian handling. */
4290
4291 void
4292 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4293 {
4294 machine_mode mode = GET_MODE (dest);
4295 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4296 if (!register_operand (src, mode)
4297 && !register_operand (dest, mode))
4298 {
4299 rtx tmp = gen_reg_rtx (mode);
4300 if (MEM_P (src))
4301 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4302 else
4303 emit_move_insn (tmp, src);
4304 src = tmp;
4305 }
4306 aarch64_emit_sve_pred_move (dest, ptrue, src);
4307 }
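/* Editor's sketch of the effect: a memory-to-memory copy of a
   VNx2DImode value goes through a temporary register and, once the
   predicated move patterns are matched, would typically assemble to
   something like
     ptrue	p0.d
     ld1d	z0.d, p0/z, [x1]
     st1d	z0.d, p0, [x0]
   (register numbers are for illustration only).  */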
4308
4309 /* Called only on big-endian targets. See whether an SVE vector move
4310 from SRC to DEST is effectively a REV[BHW] instruction, because at
4311 least one operand is a subreg of an SVE vector that has wider or
4312 narrower elements. Return true and emit the instruction if so.
4313
4314 For example:
4315
4316 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4317
4318 represents a VIEW_CONVERT between the following vectors, viewed
4319 in memory order:
4320
4321 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4322 R1: { [0], [1], [2], [3], ... }
4323
4324 The high part of lane X in R2 should therefore correspond to lane X*2
4325 of R1, but the register representations are:
4326
4327 msb lsb
4328 R2: ...... [1].high [1].low [0].high [0].low
4329 R1: ...... [3] [2] [1] [0]
4330
4331 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4332 We therefore need a reverse operation to swap the high and low values
4333 around.
4334
4335 This is purely an optimization. Without it we would spill the
4336 subreg operand to the stack in one mode and reload it in the
4337 other mode, which has the same effect as the REV. */
4338
4339 bool
4340 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4341 {
4342 gcc_assert (BYTES_BIG_ENDIAN);
4343 if (GET_CODE (dest) == SUBREG)
4344 dest = SUBREG_REG (dest);
4345 if (GET_CODE (src) == SUBREG)
4346 src = SUBREG_REG (src);
4347
4348 /* The optimization handles two single SVE REGs with different element
4349 sizes. */
4350 if (!REG_P (dest)
4351 || !REG_P (src)
4352 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4353 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4354 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4355 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4356 return false;
4357
4358 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4359 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4360 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4361 UNSPEC_REV_SUBREG);
4362 emit_insn (gen_rtx_SET (dest, unspec));
4363 return true;
4364 }
4365
4366 /* Return a copy of X with mode MODE, without changing its other
4367 attributes. Unlike gen_lowpart, this doesn't care whether the
4368 mode change is valid. */
4369
4370 static rtx
4371 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4372 {
4373 if (GET_MODE (x) == mode)
4374 return x;
4375
4376 x = shallow_copy_rtx (x);
4377 set_mode_and_regno (x, mode, REGNO (x));
4378 return x;
4379 }
4380
4381 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4382 stored in wider integer containers. */
4383
4384 static unsigned int
4385 aarch64_sve_rev_unspec (machine_mode mode)
4386 {
4387 switch (GET_MODE_UNIT_SIZE (mode))
4388 {
4389 case 1: return UNSPEC_REVB;
4390 case 2: return UNSPEC_REVH;
4391 case 4: return UNSPEC_REVW;
4392 }
4393 gcc_unreachable ();
4394 }
4395
4396 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4397 operands. */
4398
4399 void
4400 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4401 {
4402 /* Decide which REV operation we need. The mode with wider elements
4403 determines the mode of the operands and the mode with the narrower
4404 elements determines the reverse width. */
4405 machine_mode mode_with_wider_elts = GET_MODE (dest);
4406 machine_mode mode_with_narrower_elts = GET_MODE (src);
4407 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4408 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4409 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4410
4411 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4412 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4413 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4414
4415 /* Get the operands in the appropriate modes and emit the instruction. */
4416 ptrue = gen_lowpart (pred_mode, ptrue);
4417 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4418 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4419 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4420 dest, ptrue, src));
4421 }
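/* Editor's example: for the VNx8HI <-> VNx16QI case described before
   aarch64_maybe_expand_sve_subreg_move, the wider-element mode is
   VNx8HImode and the narrower is VNx16QImode, so the unspec is
   UNSPEC_REVB and the split form is effectively a predicated REVB on
   .H elements (roughly "revb z0.h, p0/m, z1.h").  */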
4422
4423 static bool
4424 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4425 tree exp ATTRIBUTE_UNUSED)
4426 {
4427 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4428 return false;
4429
4430 return true;
4431 }
4432
4433 /* Implement TARGET_PASS_BY_REFERENCE. */
4434
4435 static bool
4436 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4437 {
4438 HOST_WIDE_INT size;
4439 machine_mode dummymode;
4440 int nregs;
4441
4442 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4443 if (arg.mode == BLKmode && arg.type)
4444 size = int_size_in_bytes (arg.type);
4445 else
4446 /* No frontends can create types with variable-sized modes, so we
4447 shouldn't be asked to pass or return them. */
4448 size = GET_MODE_SIZE (arg.mode).to_constant ();
4449
4450 /* Aggregates are passed by reference based on their size. */
4451 if (arg.aggregate_type_p ())
4452 size = int_size_in_bytes (arg.type);
4453
4454   /* Variable-sized arguments are always passed by reference.  */
4455 if (size < 0)
4456 return true;
4457
4458 /* Can this be a candidate to be passed in fp/simd register(s)? */
4459 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4460 &dummymode, &nregs,
4461 NULL))
4462 return false;
4463
4464   /* Arguments that are variable-sized or larger than 2 registers are
4465      passed by reference unless they are a homogeneous floating-point
4466      aggregate.  */
4467 return size > 2 * UNITS_PER_WORD;
4468 }
4469
4470 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4471 static bool
4472 aarch64_return_in_msb (const_tree valtype)
4473 {
4474 machine_mode dummy_mode;
4475 int dummy_int;
4476
4477 /* Never happens in little-endian mode. */
4478 if (!BYTES_BIG_ENDIAN)
4479 return false;
4480
4481 /* Only composite types smaller than or equal to 16 bytes can
4482 be potentially returned in registers. */
4483 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4484 || int_size_in_bytes (valtype) <= 0
4485 || int_size_in_bytes (valtype) > 16)
4486 return false;
4487
4488 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4489 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4490 is always passed/returned in the least significant bits of fp/simd
4491 register(s). */
4492 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4493 &dummy_mode, &dummy_int, NULL))
4494 return false;
4495
4496 return true;
4497 }
4498
4499 /* Implement TARGET_FUNCTION_VALUE.
4500 Define how to find the value returned by a function. */
4501
4502 static rtx
4503 aarch64_function_value (const_tree type, const_tree func,
4504 bool outgoing ATTRIBUTE_UNUSED)
4505 {
4506 machine_mode mode;
4507 int unsignedp;
4508 int count;
4509 machine_mode ag_mode;
4510
4511 mode = TYPE_MODE (type);
4512 if (INTEGRAL_TYPE_P (type))
4513 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4514
4515 if (aarch64_return_in_msb (type))
4516 {
4517 HOST_WIDE_INT size = int_size_in_bytes (type);
4518
4519 if (size % UNITS_PER_WORD != 0)
4520 {
4521 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4522 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4523 }
4524 }
4525
4526 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4527 &ag_mode, &count, NULL))
4528 {
4529 if (!aarch64_composite_type_p (type, mode))
4530 {
4531 gcc_assert (count == 1 && mode == ag_mode);
4532 return gen_rtx_REG (mode, V0_REGNUM);
4533 }
4534 else
4535 {
4536 int i;
4537 rtx par;
4538
4539 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4540 for (i = 0; i < count; i++)
4541 {
4542 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4543 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4544 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4545 XVECEXP (par, 0, i) = tmp;
4546 }
4547 return par;
4548 }
4549 }
4550 else
4551 return gen_rtx_REG (mode, R0_REGNUM);
4552 }
4553
4554 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4555    Return true if REGNO is the number of a hard register in which the value
4556    of a called function may come back.  */
4557
4558 static bool
4559 aarch64_function_value_regno_p (const unsigned int regno)
4560 {
4561   /* A maximum of 16 bytes can be returned in the general registers.  Examples
4562 of 16-byte return values are: 128-bit integers and 16-byte small
4563 structures (excluding homogeneous floating-point aggregates). */
4564 if (regno == R0_REGNUM || regno == R1_REGNUM)
4565 return true;
4566
4567 /* Up to four fp/simd registers can return a function value, e.g. a
4568 homogeneous floating-point aggregate having four members. */
4569 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4570 return TARGET_FLOAT;
4571
4572 return false;
4573 }
4574
4575 /* Implement TARGET_RETURN_IN_MEMORY.
4576
4577 If the type T of the result of a function is such that
4578 void func (T arg)
4579 would require that arg be passed as a value in a register (or set of
4580 registers) according to the parameter passing rules, then the result
4581 is returned in the same registers as would be used for such an
4582 argument. */
4583
4584 static bool
4585 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4586 {
4587 HOST_WIDE_INT size;
4588 machine_mode ag_mode;
4589 int count;
4590
4591 if (!AGGREGATE_TYPE_P (type)
4592 && TREE_CODE (type) != COMPLEX_TYPE
4593 && TREE_CODE (type) != VECTOR_TYPE)
4594     /* Simple scalar types are always returned in registers.  */
4595 return false;
4596
4597 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4598 type,
4599 &ag_mode,
4600 &count,
4601 NULL))
4602 return false;
4603
4604   /* Types larger than 2 registers are returned in memory.  */
4605 size = int_size_in_bytes (type);
4606 return (size < 0 || size > 2 * UNITS_PER_WORD);
4607 }
4608
4609 static bool
4610 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4611 const_tree type, int *nregs)
4612 {
4613 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4614 return aarch64_vfp_is_call_or_return_candidate (mode,
4615 type,
4616 &pcum->aapcs_vfp_rmode,
4617 nregs,
4618 NULL);
4619 }
4620
4621 /* Given MODE and TYPE of a function argument, return the alignment in
4622 bits. The idea is to suppress any stronger alignment requested by
4623 the user and opt for the natural alignment (specified in AAPCS64 \S
4624 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4625 calculated in versions of GCC prior to GCC-9. This is a helper
4626 function for local use only. */
4627
4628 static unsigned int
4629 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4630 bool *abi_break)
4631 {
4632 *abi_break = false;
4633 if (!type)
4634 return GET_MODE_ALIGNMENT (mode);
4635
4636 if (integer_zerop (TYPE_SIZE (type)))
4637 return 0;
4638
4639 gcc_assert (TYPE_MODE (type) == mode);
4640
4641 if (!AGGREGATE_TYPE_P (type))
4642 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4643
4644 if (TREE_CODE (type) == ARRAY_TYPE)
4645 return TYPE_ALIGN (TREE_TYPE (type));
4646
4647 unsigned int alignment = 0;
4648 unsigned int bitfield_alignment = 0;
4649 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4650 if (TREE_CODE (field) == FIELD_DECL)
4651 {
4652 alignment = std::max (alignment, DECL_ALIGN (field));
4653 if (DECL_BIT_FIELD_TYPE (field))
4654 bitfield_alignment
4655 = std::max (bitfield_alignment,
4656 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4657 }
4658
4659 if (bitfield_alignment > alignment)
4660 {
4661 *abi_break = true;
4662 return bitfield_alignment;
4663 }
4664
4665 return alignment;
4666 }
4667
4668 /* Layout a function argument according to the AAPCS64 rules. The rule
4669 numbers refer to the rule numbers in the AAPCS64. */
4670
4671 static void
4672 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4673 const_tree type,
4674 bool named ATTRIBUTE_UNUSED)
4675 {
4676 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4677 int ncrn, nvrn, nregs;
4678 bool allocate_ncrn, allocate_nvrn;
4679 HOST_WIDE_INT size;
4680 bool abi_break;
4681
4682 /* We need to do this once per argument. */
4683 if (pcum->aapcs_arg_processed)
4684 return;
4685
4686 pcum->aapcs_arg_processed = true;
4687
4688 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4689 if (type)
4690 size = int_size_in_bytes (type);
4691 else
4692 /* No frontends can create types with variable-sized modes, so we
4693 shouldn't be asked to pass or return them. */
4694 size = GET_MODE_SIZE (mode).to_constant ();
4695 size = ROUND_UP (size, UNITS_PER_WORD);
4696
4697 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4698 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4699 mode,
4700 type,
4701 &nregs);
4702
4703   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4704 The following code thus handles passing by SIMD/FP registers first. */
4705
4706 nvrn = pcum->aapcs_nvrn;
4707
4708   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4709      and homogeneous short-vector aggregates (HVA).  */
4710 if (allocate_nvrn)
4711 {
4712 if (!TARGET_FLOAT)
4713 aarch64_err_no_fpadvsimd (mode);
4714
4715 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4716 {
4717 pcum->aapcs_nextnvrn = nvrn + nregs;
4718 if (!aarch64_composite_type_p (type, mode))
4719 {
4720 gcc_assert (nregs == 1);
4721 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4722 }
4723 else
4724 {
4725 rtx par;
4726 int i;
4727 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4728 for (i = 0; i < nregs; i++)
4729 {
4730 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4731 V0_REGNUM + nvrn + i);
4732 rtx offset = gen_int_mode
4733 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4734 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4735 XVECEXP (par, 0, i) = tmp;
4736 }
4737 pcum->aapcs_reg = par;
4738 }
4739 return;
4740 }
4741 else
4742 {
4743 /* C.3 NSRN is set to 8. */
4744 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4745 goto on_stack;
4746 }
4747 }
4748
4749 ncrn = pcum->aapcs_ncrn;
4750 nregs = size / UNITS_PER_WORD;
4751
4752   /* C6 - C9, though the sign and zero extension semantics are
4753      handled elsewhere.  This is the case where the argument fits
4754      entirely in general registers.  */
4755 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4756 {
4757 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4758
4759 /* C.8 if the argument has an alignment of 16 then the NGRN is
4760 rounded up to the next even number. */
4761 if (nregs == 2
4762 && ncrn % 2
4763 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4764 comparison is there because for > 16 * BITS_PER_UNIT
4765 alignment nregs should be > 2 and therefore it should be
4766 passed by reference rather than value. */
4767 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4768 == 16 * BITS_PER_UNIT))
4769 {
4770 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4771 inform (input_location, "parameter passing for argument of type "
4772 "%qT changed in GCC 9.1", type);
4773 ++ncrn;
4774 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4775 }
4776
4777 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4778 A reg is still generated for it, but the caller should be smart
4779 enough not to use it. */
4780 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4781 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4782 else
4783 {
4784 rtx par;
4785 int i;
4786
4787 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4788 for (i = 0; i < nregs; i++)
4789 {
4790 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4791 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4792 GEN_INT (i * UNITS_PER_WORD));
4793 XVECEXP (par, 0, i) = tmp;
4794 }
4795 pcum->aapcs_reg = par;
4796 }
4797
4798 pcum->aapcs_nextncrn = ncrn + nregs;
4799 return;
4800 }
4801
4802 /* C.11 */
4803 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4804
4805 /* The argument is passed on stack; record the needed number of words for
4806 this argument and align the total size if necessary. */
4807 on_stack:
4808 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4809
4810 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4811 == 16 * BITS_PER_UNIT)
4812 {
4813 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4814 if (pcum->aapcs_stack_size != new_size)
4815 {
4816 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4817 inform (input_location, "parameter passing for argument of type "
4818 "%qT changed in GCC 9.1", type);
4819 pcum->aapcs_stack_size = new_size;
4820 }
4821 }
4822 return;
4823 }
4824
4825 /* Implement TARGET_FUNCTION_ARG. */
4826
4827 static rtx
4828 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4829 {
4830 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4831 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
4832 || pcum->pcs_variant == ARM_PCS_SIMD);
4833
4834 if (arg.end_marker_p ())
4835 return gen_int_mode (pcum->pcs_variant, DImode);
4836
4837 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4838 return pcum->aapcs_reg;
4839 }
4840
4841 void
4842 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4843 const_tree fntype,
4844 rtx libname ATTRIBUTE_UNUSED,
4845 const_tree fndecl ATTRIBUTE_UNUSED,
4846 unsigned n_named ATTRIBUTE_UNUSED)
4847 {
4848 pcum->aapcs_ncrn = 0;
4849 pcum->aapcs_nvrn = 0;
4850 pcum->aapcs_nextncrn = 0;
4851 pcum->aapcs_nextnvrn = 0;
4852 if (fntype)
4853 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
4854 else
4855 pcum->pcs_variant = ARM_PCS_AAPCS64;
4856 pcum->aapcs_reg = NULL_RTX;
4857 pcum->aapcs_arg_processed = false;
4858 pcum->aapcs_stack_words = 0;
4859 pcum->aapcs_stack_size = 0;
4860
4861 if (!TARGET_FLOAT
4862 && fndecl && TREE_PUBLIC (fndecl)
4863 && fntype && fntype != error_mark_node)
4864 {
4865 const_tree type = TREE_TYPE (fntype);
4866 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4867 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4868 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4869 &mode, &nregs, NULL))
4870 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4871 }
4872 return;
4873 }
4874
4875 static void
4876 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4877 const function_arg_info &arg)
4878 {
4879 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4880 if (pcum->pcs_variant == ARM_PCS_AAPCS64
4881 || pcum->pcs_variant == ARM_PCS_SIMD)
4882 {
4883 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4884 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4885 != (pcum->aapcs_stack_words != 0));
4886 pcum->aapcs_arg_processed = false;
4887 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4888 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4889 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4890 pcum->aapcs_stack_words = 0;
4891 pcum->aapcs_reg = NULL_RTX;
4892 }
4893 }
4894
4895 bool
4896 aarch64_function_arg_regno_p (unsigned regno)
4897 {
4898 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4899 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4900 }
4901
4902 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4903 PARM_BOUNDARY bits of alignment, but will be given anything up
4904 to STACK_BOUNDARY bits if the type requires it. This makes sure
4905 that both before and after the layout of each argument, the Next
4906 Stacked Argument Address (NSAA) will have a minimum alignment of
4907 8 bytes. */
4908
4909 static unsigned int
4910 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4911 {
4912 bool abi_break;
4913 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4914 &abi_break);
4915   if (abi_break && warn_psabi)
4916 inform (input_location, "parameter passing for argument of type "
4917 "%qT changed in GCC 9.1", type);
4918
4919 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4920 }
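/* For example (editor's note, assuming the usual AArch64 values
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): an argument whose
   natural alignment is only 8 bits still gets 64 bits of stack
   alignment, while an over-aligned 256-bit type is capped at
   128 bits.  */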
4921
4922 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4923
4924 static fixed_size_mode
4925 aarch64_get_reg_raw_mode (int regno)
4926 {
4927 if (TARGET_SVE && FP_REGNUM_P (regno))
4928 /* Don't use the SVE part of the register for __builtin_apply and
4929 __builtin_return. The SVE registers aren't used by the normal PCS,
4930 so using them there would be a waste of time. The PCS extensions
4931 for SVE types are fundamentally incompatible with the
4932 __builtin_return/__builtin_apply interface. */
4933 return as_a <fixed_size_mode> (V16QImode);
4934 return default_get_reg_raw_mode (regno);
4935 }
4936
4937 /* Implement TARGET_FUNCTION_ARG_PADDING.
4938
4939 Small aggregate types are placed in the lowest memory address.
4940
4941 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4942
4943 static pad_direction
4944 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4945 {
4946 /* On little-endian targets, the least significant byte of every stack
4947 argument is passed at the lowest byte address of the stack slot. */
4948 if (!BYTES_BIG_ENDIAN)
4949 return PAD_UPWARD;
4950
4951 /* Otherwise, integral, floating-point and pointer types are padded downward:
4952 the least significant byte of a stack argument is passed at the highest
4953 byte address of the stack slot. */
4954 if (type
4955 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4956 || POINTER_TYPE_P (type))
4957 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4958 return PAD_DOWNWARD;
4959
4960 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4961 return PAD_UPWARD;
4962 }
4963
4964 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4965
4966    It specifies padding for the last (and possibly the only)
4967    element of a block move between registers and memory.  Viewing
4968    the block as it sits in memory, padding upward means that the
4969    last element is padded after its most significant byte, while
4970    with downward padding the last element is padded on its least
4971    significant byte side.
4972
4973 Small aggregates and small complex types are always padded
4974 upwards.
4975
4976 We don't need to worry about homogeneous floating-point or
4977 short-vector aggregates; their move is not affected by the
4978 padding direction determined here. Regardless of endianness,
4979 each element of such an aggregate is put in the least
4980 significant bits of a fp/simd register.
4981
4982 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4983 register has useful data, and return the opposite if the most
4984 significant byte does. */
4985
4986 bool
4987 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4988 bool first ATTRIBUTE_UNUSED)
4989 {
4990
4991 /* Small composite types are always padded upward. */
4992 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4993 {
4994 HOST_WIDE_INT size;
4995 if (type)
4996 size = int_size_in_bytes (type);
4997 else
4998 /* No frontends can create types with variable-sized modes, so we
4999 shouldn't be asked to pass or return them. */
5000 size = GET_MODE_SIZE (mode).to_constant ();
5001 if (size < 2 * UNITS_PER_WORD)
5002 return true;
5003 }
5004
5005 /* Otherwise, use the default padding. */
5006 return !BYTES_BIG_ENDIAN;
5007 }
5008
5009 static scalar_int_mode
5010 aarch64_libgcc_cmp_return_mode (void)
5011 {
5012 return SImode;
5013 }
5014
5015 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5016
5017 /* We use the 12-bit shifted immediate arithmetic instructions so values
5018    must be a multiple of (1 << 12), i.e. 4096.  */
5019 #define ARITH_FACTOR 4096
5020
5021 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5022 #error Cannot use simple address calculation for stack probing
5023 #endif
5024
5025 /* The pair of scratch registers used for stack probing. */
5026 #define PROBE_STACK_FIRST_REG R9_REGNUM
5027 #define PROBE_STACK_SECOND_REG R10_REGNUM
5028
5029 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5030 inclusive. These are offsets from the current stack pointer. */
5031
5032 static void
5033 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5034 {
5035 HOST_WIDE_INT size;
5036 if (!poly_size.is_constant (&size))
5037 {
5038 sorry ("stack probes for SVE frames");
5039 return;
5040 }
5041
5042 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5043
5044 /* See the same assertion on PROBE_INTERVAL above. */
5045 gcc_assert ((first % ARITH_FACTOR) == 0);
5046
5047 /* See if we have a constant small number of probes to generate. If so,
5048 that's the easy case. */
5049 if (size <= PROBE_INTERVAL)
5050 {
5051 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5052
5053 emit_set_insn (reg1,
5054 plus_constant (Pmode,
5055 stack_pointer_rtx, -(first + base)));
5056 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5057 }
5058
5059 /* The run-time loop is made up of 8 insns in the generic case while the
5060 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5061 else if (size <= 4 * PROBE_INTERVAL)
5062 {
5063 HOST_WIDE_INT i, rem;
5064
5065 emit_set_insn (reg1,
5066 plus_constant (Pmode,
5067 stack_pointer_rtx,
5068 -(first + PROBE_INTERVAL)));
5069 emit_stack_probe (reg1);
5070
5071 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5072 it exceeds SIZE. If only two probes are needed, this will not
5073 generate any code. Then probe at FIRST + SIZE. */
5074 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5075 {
5076 emit_set_insn (reg1,
5077 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5078 emit_stack_probe (reg1);
5079 }
5080
5081 rem = size - (i - PROBE_INTERVAL);
5082 if (rem > 256)
5083 {
5084 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5085
5086 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5087 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5088 }
5089 else
5090 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5091 }
5092
5093 /* Otherwise, do the same as above, but in a loop. Note that we must be
5094 extra careful with variables wrapping around because we might be at
5095 the very top (or the very bottom) of the address space and we have
5096 to be able to handle this case properly; in particular, we use an
5097 equality test for the loop condition. */
5098 else
5099 {
5100 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5101
5102 /* Step 1: round SIZE to the previous multiple of the interval. */
5103
5104 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5105
5106
5107 /* Step 2: compute initial and final value of the loop counter. */
5108
5109 /* TEST_ADDR = SP + FIRST. */
5110 emit_set_insn (reg1,
5111 plus_constant (Pmode, stack_pointer_rtx, -first));
5112
5113 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5114 HOST_WIDE_INT adjustment = - (first + rounded_size);
5115 if (! aarch64_uimm12_shift (adjustment))
5116 {
5117 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5118 true, Pmode);
5119 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5120 }
5121 else
5122 emit_set_insn (reg2,
5123 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5124
5125 /* Step 3: the loop
5126
5127 do
5128 {
5129 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5130 probe at TEST_ADDR
5131 }
5132 while (TEST_ADDR != LAST_ADDR)
5133
5134 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5135 until it is equal to ROUNDED_SIZE. */
5136
5137 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5138
5139
5140 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5141 that SIZE is equal to ROUNDED_SIZE. */
5142
5143 if (size != rounded_size)
5144 {
5145 HOST_WIDE_INT rem = size - rounded_size;
5146
5147 if (rem > 256)
5148 {
5149 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5150
5151 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5152 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5153 }
5154 else
5155 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5156 }
5157 }
5158
5159 /* Make sure nothing is scheduled before we are done. */
5160 emit_insn (gen_blockage ());
5161 }
5162
5163 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5164 absolute addresses. */
5165
5166 const char *
5167 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5168 {
5169 static int labelno = 0;
5170 char loop_lab[32];
5171 rtx xops[2];
5172
5173 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5174
5175 /* Loop. */
5176 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5177
5178 HOST_WIDE_INT stack_clash_probe_interval
5179 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5180
5181 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5182 xops[0] = reg1;
5183 HOST_WIDE_INT interval;
5184 if (flag_stack_clash_protection)
5185 interval = stack_clash_probe_interval;
5186 else
5187 interval = PROBE_INTERVAL;
5188
5189 gcc_assert (aarch64_uimm12_shift (interval));
5190 xops[1] = GEN_INT (interval);
5191
5192 output_asm_insn ("sub\t%0, %0, %1", xops);
5193
5194 /* If doing stack clash protection then we probe up by the ABI specified
5195 amount. We do this because we're dropping full pages at a time in the
5196      loop.  But for plain (non-stack-clash) probing, probe at offset 0 instead.  */
5197 if (flag_stack_clash_protection)
5198 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5199 else
5200 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5201
5202 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5203 by this amount for each iteration. */
5204 output_asm_insn ("str\txzr, [%0, %1]", xops);
5205
5206 /* Test if TEST_ADDR == LAST_ADDR. */
5207 xops[1] = reg2;
5208 output_asm_insn ("cmp\t%0, %1", xops);
5209
5210 /* Branch. */
5211 fputs ("\tb.ne\t", asm_out_file);
5212 assemble_name_raw (asm_out_file, loop_lab);
5213 fputc ('\n', asm_out_file);
5214
5215 return "";
5216 }
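/* Editor's sketch of the text emitted above for the default
   (non-stack-clash) case, with a 4096-byte probe interval and the
   usual scratch registers:
     .LPSRL0:
     sub	x9, x9, 4096
     str	xzr, [x9, 0]
     cmp	x9, x10
     b.ne	.LPSRL0
   The exact label name and immediates depend on the configuration.  */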
5217
5218 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5219 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5220 of GUARD_SIZE. When a probe is emitted it is done at most
5221 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5222 at most MIN_PROBE_THRESHOLD. By the end of this function
5223 BASE = BASE - ADJUSTMENT. */
5224
5225 const char *
5226 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5227 rtx min_probe_threshold, rtx guard_size)
5228 {
5229 /* This function is not allowed to use any instruction generation function
5230 like gen_ and friends. If you do, you'll likely ICE during CFG validation,
5231 so instead emit the code you want using output_asm_insn. */
5232 gcc_assert (flag_stack_clash_protection);
5233 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5234 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5235
5236 /* The minimum required allocation before the residual requires probing. */
5237 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5238
5239 /* Clamp the value down to the nearest value that can be used with a cmp. */
5240 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5241 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5242
5243 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5244 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5245
5246 static int labelno = 0;
5247 char loop_start_lab[32];
5248 char loop_end_lab[32];
5249 rtx xops[2];
5250
5251 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5252 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5253
5254 /* Emit loop start label. */
5255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5256
5257 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5258 xops[0] = adjustment;
5259 xops[1] = probe_offset_value_rtx;
5260 output_asm_insn ("cmp\t%0, %1", xops);
5261
5262 /* Branch to end if not enough adjustment to probe. */
5263 fputs ("\tb.lt\t", asm_out_file);
5264 assemble_name_raw (asm_out_file, loop_end_lab);
5265 fputc ('\n', asm_out_file);
5266
5267 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5268 xops[0] = base;
5269 xops[1] = probe_offset_value_rtx;
5270 output_asm_insn ("sub\t%0, %0, %1", xops);
5271
5272 /* Probe at BASE. */
5273 xops[1] = const0_rtx;
5274 output_asm_insn ("str\txzr, [%0, %1]", xops);
5275
5276 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5277 xops[0] = adjustment;
5278 xops[1] = probe_offset_value_rtx;
5279 output_asm_insn ("sub\t%0, %0, %1", xops);
5280
5281 /* Branch to start if still more bytes to allocate. */
5282 fputs ("\tb\t", asm_out_file);
5283 assemble_name_raw (asm_out_file, loop_start_lab);
5284 fputc ('\n', asm_out_file);
5285
5286 /* No probe needed for the remaining adjustment; fall through. */
5287 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5288
5289 /* BASE = BASE - ADJUSTMENT. */
5290 xops[0] = base;
5291 xops[1] = adjustment;
5292 output_asm_insn ("sub\t%0, %0, %1", xops);
5293 return "";
5294 }
5295
5296 /* Determine whether a frame chain needs to be generated. */
5297 static bool
5298 aarch64_needs_frame_chain (void)
5299 {
5300 /* Force a frame chain for EH returns so the return address is at FP+8. */
5301 if (frame_pointer_needed || crtl->calls_eh_return)
5302 return true;
5303
5304 /* A leaf function cannot have calls or write LR. */
5305 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5306
5307 /* Don't use a frame chain in leaf functions if leaf frame pointers
5308 are disabled. */
5309 if (flag_omit_leaf_frame_pointer && is_leaf)
5310 return false;
5311
5312 return aarch64_use_frame_pointer;
5313 }
5314
5315 /* Mark the registers that need to be saved by the callee and calculate
5316 the size of the callee-saved registers area and frame record (both FP
5317 and LR may be omitted). */
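/* For instance (an illustrative case, assuming a frame chain is emitted and
   no other callee-saved registers are live): R29 is placed at offset 0 and
   R30 at offset UNITS_PER_WORD, giving a saved register area of
   2 * UNITS_PER_WORD, and they become the two write-back candidates.  */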
5318 static void
5319 aarch64_layout_frame (void)
5320 {
5321 HOST_WIDE_INT offset = 0;
5322 int regno, last_fp_reg = INVALID_REGNUM;
5323 bool simd_function = (crtl->abi->id () == ARM_PCS_SIMD);
5324
5325 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5326
5327 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5328 the mid-end is doing. */
5329 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5330
5331 #define SLOT_NOT_REQUIRED (-2)
5332 #define SLOT_REQUIRED (-1)
5333
5334 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5335 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5336
5337 /* First mark all the registers that really need to be saved... */
5338 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5339 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5340
5341 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5342 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5343
5344 /* ... that includes the eh data registers (if needed)... */
5345 if (crtl->calls_eh_return)
5346 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5347 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5348 = SLOT_REQUIRED;
5349
5350 /* ... and any callee saved register that dataflow says is live. */
5351 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5352 if (df_regs_ever_live_p (regno)
5353 && !fixed_regs[regno]
5354 && (regno == R30_REGNUM
5355 || !crtl->abi->clobbers_full_reg_p (regno)))
5356 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5357
5358 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5359 if (df_regs_ever_live_p (regno)
5360 && !fixed_regs[regno]
5361 && !crtl->abi->clobbers_full_reg_p (regno))
5362 {
5363 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5364 last_fp_reg = regno;
5365 }
5366
5367 if (cfun->machine->frame.emit_frame_chain)
5368 {
5369 /* FP and LR are placed in the linkage record. */
5370 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5371 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5372 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5373 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5374 offset = 2 * UNITS_PER_WORD;
5375 }
5376
5377 /* With stack-clash, LR must be saved in non-leaf functions. */
5378 gcc_assert (crtl->is_leaf
5379 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5380 != SLOT_NOT_REQUIRED));
5381
5382 /* Now assign stack slots for them. */
5383 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5384 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5385 {
5386 cfun->machine->frame.reg_offset[regno] = offset;
5387 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5388 cfun->machine->frame.wb_candidate1 = regno;
5389 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5390 cfun->machine->frame.wb_candidate2 = regno;
5391 offset += UNITS_PER_WORD;
5392 }
5393
5394 HOST_WIDE_INT max_int_offset = offset;
5395 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5396 bool has_align_gap = offset != max_int_offset;
5397
5398 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5399 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5400 {
5401 /* If there is an alignment gap between integer and fp callee-saves,
5402 allocate the last fp register to it if possible. */
5403 if (regno == last_fp_reg
5404 && has_align_gap
5405 && !simd_function
5406 && (offset & 8) == 0)
5407 {
5408 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5409 break;
5410 }
5411
5412 cfun->machine->frame.reg_offset[regno] = offset;
5413 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5414 cfun->machine->frame.wb_candidate1 = regno;
5415 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5416 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5417 cfun->machine->frame.wb_candidate2 = regno;
5418 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5419 }
5420
5421 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5422
5423 cfun->machine->frame.saved_regs_size = offset;
5424
5425 HOST_WIDE_INT varargs_and_saved_regs_size
5426 = offset + cfun->machine->frame.saved_varargs_size;
5427
5428 cfun->machine->frame.hard_fp_offset
5429 = aligned_upper_bound (varargs_and_saved_regs_size
5430 + get_frame_size (),
5431 STACK_BOUNDARY / BITS_PER_UNIT);
5432
5433 /* Both these values are already aligned. */
5434 gcc_assert (multiple_p (crtl->outgoing_args_size,
5435 STACK_BOUNDARY / BITS_PER_UNIT));
5436 cfun->machine->frame.frame_size
5437 = (cfun->machine->frame.hard_fp_offset
5438 + crtl->outgoing_args_size);
5439
5440 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5441
5442 cfun->machine->frame.initial_adjust = 0;
5443 cfun->machine->frame.final_adjust = 0;
5444 cfun->machine->frame.callee_adjust = 0;
5445 cfun->machine->frame.callee_offset = 0;
5446
5447 HOST_WIDE_INT max_push_offset = 0;
5448 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5449 max_push_offset = 512;
5450 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5451 max_push_offset = 256;
5452
5453 HOST_WIDE_INT const_size, const_fp_offset;
5454 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5455 && const_size < max_push_offset
5456 && known_eq (crtl->outgoing_args_size, 0))
5457 {
5458 /* Simple, small frame with no outgoing arguments:
5459 stp reg1, reg2, [sp, -frame_size]!
5460 stp reg3, reg4, [sp, 16] */
5461 cfun->machine->frame.callee_adjust = const_size;
5462 }
5463 else if (known_lt (crtl->outgoing_args_size
5464 + cfun->machine->frame.saved_regs_size, 512)
5465 && !(cfun->calls_alloca
5466 && known_lt (cfun->machine->frame.hard_fp_offset,
5467 max_push_offset)))
5468 {
5469 /* Frame with small outgoing arguments:
5470 sub sp, sp, frame_size
5471 stp reg1, reg2, [sp, outgoing_args_size]
5472 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5473 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5474 cfun->machine->frame.callee_offset
5475 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5476 }
5477 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5478 && const_fp_offset < max_push_offset)
5479 {
5480 /* Frame with large outgoing arguments but a small local area:
5481 stp reg1, reg2, [sp, -hard_fp_offset]!
5482 stp reg3, reg4, [sp, 16]
5483 sub sp, sp, outgoing_args_size */
5484 cfun->machine->frame.callee_adjust = const_fp_offset;
5485 cfun->machine->frame.final_adjust
5486 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5487 }
5488 else
5489 {
5490 /* Frame with large local area and outgoing arguments using frame pointer:
5491 sub sp, sp, hard_fp_offset
5492 stp x29, x30, [sp, 0]
5493 add x29, sp, 0
5494 stp reg3, reg4, [sp, 16]
5495 sub sp, sp, outgoing_args_size */
5496 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5497 cfun->machine->frame.final_adjust
5498 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5499 }
5500
5501 cfun->machine->frame.laid_out = true;
5502 }
5503
5504 /* Return true if the register REGNO is saved on entry to
5505 the current function. */
5506
5507 static bool
5508 aarch64_register_saved_on_entry (int regno)
5509 {
5510 return cfun->machine->frame.reg_offset[regno] >= 0;
5511 }
5512
5513 /* Return the next register up from REGNO up to LIMIT for the callee
5514 to save. */
5515
5516 static unsigned
5517 aarch64_next_callee_save (unsigned regno, unsigned limit)
5518 {
5519 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5520 regno ++;
5521 return regno;
5522 }
5523
5524 /* Push the register number REGNO of mode MODE to the stack with write-back
5525 adjusting the stack by ADJUSTMENT. */
5526
5527 static void
5528 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5529 HOST_WIDE_INT adjustment)
5530 {
5531 rtx base_rtx = stack_pointer_rtx;
5532 rtx insn, reg, mem;
5533
5534 reg = gen_rtx_REG (mode, regno);
5535 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5536 plus_constant (Pmode, base_rtx, -adjustment));
5537 mem = gen_frame_mem (mode, mem);
5538
5539 insn = emit_move_insn (mem, reg);
5540 RTX_FRAME_RELATED_P (insn) = 1;
5541 }
5542
5543 /* Generate and return an instruction to store the pair of registers
5544 REG and REG2 of mode MODE to location BASE with write-back adjusting
5545 the stack location BASE by ADJUSTMENT. */
5546
5547 static rtx
5548 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5549 HOST_WIDE_INT adjustment)
5550 {
5551 switch (mode)
5552 {
5553 case E_DImode:
5554 return gen_storewb_pairdi_di (base, base, reg, reg2,
5555 GEN_INT (-adjustment),
5556 GEN_INT (UNITS_PER_WORD - adjustment));
5557 case E_DFmode:
5558 return gen_storewb_pairdf_di (base, base, reg, reg2,
5559 GEN_INT (-adjustment),
5560 GEN_INT (UNITS_PER_WORD - adjustment));
5561 case E_TFmode:
5562 return gen_storewb_pairtf_di (base, base, reg, reg2,
5563 GEN_INT (-adjustment),
5564 GEN_INT (UNITS_PER_VREG - adjustment));
5565 default:
5566 gcc_unreachable ();
5567 }
5568 }
5569
5570 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5571 stack pointer by ADJUSTMENT. */
5572
5573 static void
5574 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5575 {
5576 rtx_insn *insn;
5577 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5578
5579 if (regno2 == INVALID_REGNUM)
5580 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5581
5582 rtx reg1 = gen_rtx_REG (mode, regno1);
5583 rtx reg2 = gen_rtx_REG (mode, regno2);
5584
5585 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5586 reg2, adjustment));
5587 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5588 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5589 RTX_FRAME_RELATED_P (insn) = 1;
5590 }
5591
5592 /* Load the pair of registers REG and REG2 of mode MODE from stack location
5593 BASE, adjusting it by ADJUSTMENT afterwards. */
5594
5595 static rtx
5596 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5597 HOST_WIDE_INT adjustment)
5598 {
5599 switch (mode)
5600 {
5601 case E_DImode:
5602 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5603 GEN_INT (UNITS_PER_WORD));
5604 case E_DFmode:
5605 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5606 GEN_INT (UNITS_PER_WORD));
5607 case E_TFmode:
5608 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5609 GEN_INT (UNITS_PER_VREG));
5610 default:
5611 gcc_unreachable ();
5612 }
5613 }
5614
5615 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5616 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5617 into CFI_OPS. */
5618
5619 static void
5620 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5621 rtx *cfi_ops)
5622 {
5623 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5624 rtx reg1 = gen_rtx_REG (mode, regno1);
5625
5626 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5627
5628 if (regno2 == INVALID_REGNUM)
5629 {
5630 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5631 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5632 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5633 }
5634 else
5635 {
5636 rtx reg2 = gen_rtx_REG (mode, regno2);
5637 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5638 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5639 reg2, adjustment));
5640 }
5641 }
5642
5643 /* Generate and return a store pair instruction of mode MODE to store
5644 register REG1 to MEM1 and register REG2 to MEM2. */
5645
5646 static rtx
5647 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5648 rtx reg2)
5649 {
5650 switch (mode)
5651 {
5652 case E_DImode:
5653 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5654
5655 case E_DFmode:
5656 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5657
5658 case E_TFmode:
5659 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5660
5661 default:
5662 gcc_unreachable ();
5663 }
5664 }
5665
5666 /* Generate and return a load pair instruction of mode MODE to load register
5667 REG1 from MEM1 and register REG2 from MEM2. */
5668
5669 static rtx
5670 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5671 rtx mem2)
5672 {
5673 switch (mode)
5674 {
5675 case E_DImode:
5676 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5677
5678 case E_DFmode:
5679 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5680
5681 case E_TFmode:
5682 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5683
5684 default:
5685 gcc_unreachable ();
5686 }
5687 }
5688
5689 /* Return TRUE if return address signing should be enabled for the current
5690 function, otherwise return FALSE. */
5691
5692 bool
5693 aarch64_return_address_signing_enabled (void)
5694 {
5695 /* This function should only be called after the frame has been laid out. */
5696 gcc_assert (cfun->machine->frame.laid_out);
5697
5698 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5699 if its LR is pushed onto the stack. */
5700 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5701 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5702 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5703 }
5704
5705 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5706 bool
5707 aarch64_bti_enabled (void)
5708 {
5709 return (aarch64_enable_bti == 1);
5710 }
5711
5712 /* Emit code to save the callee-saved registers from register number START
5713 to LIMIT to the stack at the location starting at offset START_OFFSET,
5714 skipping any write-back candidates if SKIP_WB is true. */
5715
5716 static void
5717 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5718 unsigned start, unsigned limit, bool skip_wb)
5719 {
5720 rtx_insn *insn;
5721 unsigned regno;
5722 unsigned regno2;
5723
5724 for (regno = aarch64_next_callee_save (start, limit);
5725 regno <= limit;
5726 regno = aarch64_next_callee_save (regno + 1, limit))
5727 {
5728 rtx reg, mem;
5729 poly_int64 offset;
5730 int offset_diff;
5731
5732 if (skip_wb
5733 && (regno == cfun->machine->frame.wb_candidate1
5734 || regno == cfun->machine->frame.wb_candidate2))
5735 continue;
5736
5737 if (cfun->machine->reg_is_wrapped_separately[regno])
5738 continue;
5739
5740 reg = gen_rtx_REG (mode, regno);
5741 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5742 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5743 offset));
5744
5745 regno2 = aarch64_next_callee_save (regno + 1, limit);
5746 offset_diff = cfun->machine->frame.reg_offset[regno2]
5747 - cfun->machine->frame.reg_offset[regno];
5748
5749 if (regno2 <= limit
5750 && !cfun->machine->reg_is_wrapped_separately[regno2]
5751 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5752 {
5753 rtx reg2 = gen_rtx_REG (mode, regno2);
5754 rtx mem2;
5755
5756 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5757 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5758 offset));
5759 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5760 reg2));
5761
5762 /* The first part of a frame-related parallel insn is
5763 always assumed to be relevant to the frame
5764 calculations; subsequent parts are only
5765 frame-related if explicitly marked. */
5766 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5767 regno = regno2;
5768 }
5769 else
5770 insn = emit_move_insn (mem, reg);
5771
5772 RTX_FRAME_RELATED_P (insn) = 1;
5773 }
5774 }
5775
5776 /* Emit code to restore the callee registers of mode MODE from register
5777 number START up to and including LIMIT. Restore from the stack offset
5778 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5779 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5780
5781 static void
5782 aarch64_restore_callee_saves (machine_mode mode,
5783 poly_int64 start_offset, unsigned start,
5784 unsigned limit, bool skip_wb, rtx *cfi_ops)
5785 {
5786 rtx base_rtx = stack_pointer_rtx;
5787 unsigned regno;
5788 unsigned regno2;
5789 poly_int64 offset;
5790
5791 for (regno = aarch64_next_callee_save (start, limit);
5792 regno <= limit;
5793 regno = aarch64_next_callee_save (regno + 1, limit))
5794 {
5795 if (cfun->machine->reg_is_wrapped_separately[regno])
5796 continue;
5797
5798 rtx reg, mem;
5799 int offset_diff;
5800
5801 if (skip_wb
5802 && (regno == cfun->machine->frame.wb_candidate1
5803 || regno == cfun->machine->frame.wb_candidate2))
5804 continue;
5805
5806 reg = gen_rtx_REG (mode, regno);
5807 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5808 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5809
5810 regno2 = aarch64_next_callee_save (regno + 1, limit);
5811 offset_diff = cfun->machine->frame.reg_offset[regno2]
5812 - cfun->machine->frame.reg_offset[regno];
5813
5814 if (regno2 <= limit
5815 && !cfun->machine->reg_is_wrapped_separately[regno2]
5816 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5817 {
5818 rtx reg2 = gen_rtx_REG (mode, regno2);
5819 rtx mem2;
5820
5821 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5822 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5823 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5824
5825 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5826 regno = regno2;
5827 }
5828 else
5829 emit_move_insn (reg, mem);
5830 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5831 }
5832 }
5833
5834 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5835 of MODE. */
5836
5837 static inline bool
5838 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5839 {
5840 HOST_WIDE_INT multiple;
5841 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5842 && IN_RANGE (multiple, -8, 7));
5843 }
5844
5845 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5846 of MODE. */
5847
5848 static inline bool
5849 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5850 {
5851 HOST_WIDE_INT multiple;
5852 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5853 && IN_RANGE (multiple, 0, 63));
5854 }
5855
5856 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5857 of MODE. */
5858
5859 bool
5860 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5861 {
5862 HOST_WIDE_INT multiple;
5863 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5864 && IN_RANGE (multiple, -64, 63));
5865 }
5866
5867 /* Return true if OFFSET is a signed 9-bit value. */
5868
5869 bool
5870 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5871 poly_int64 offset)
5872 {
5873 HOST_WIDE_INT const_offset;
5874 return (offset.is_constant (&const_offset)
5875 && IN_RANGE (const_offset, -256, 255));
5876 }
5877
5878 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5879 of MODE. */
5880
5881 static inline bool
5882 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5883 {
5884 HOST_WIDE_INT multiple;
5885 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5886 && IN_RANGE (multiple, -256, 255));
5887 }
5888
5889 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5890 of MODE. */
5891
5892 static inline bool
5893 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5894 {
5895 HOST_WIDE_INT multiple;
5896 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5897 && IN_RANGE (multiple, 0, 4095));
5898 }
5899
5900 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5901
5902 static sbitmap
5903 aarch64_get_separate_components (void)
5904 {
5905 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5906 bitmap_clear (components);
5907
5908 /* The registers we need saved to the frame. */
5909 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5910 if (aarch64_register_saved_on_entry (regno))
5911 {
5912 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5913 if (!frame_pointer_needed)
5914 offset += cfun->machine->frame.frame_size
5915 - cfun->machine->frame.hard_fp_offset;
5916 /* Check that we can access the stack slot of the register with one
5917 direct load with no adjustments needed. */
5918 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5919 bitmap_set_bit (components, regno);
5920 }
5921
5922 /* Don't mess with the hard frame pointer. */
5923 if (frame_pointer_needed)
5924 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5925
5926 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5927 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5928 /* If registers have been chosen to be stored/restored with
5929 writeback, don't interfere with them, to avoid having to output explicit
5930 stack adjustment instructions. */
5931 if (reg2 != INVALID_REGNUM)
5932 bitmap_clear_bit (components, reg2);
5933 if (reg1 != INVALID_REGNUM)
5934 bitmap_clear_bit (components, reg1);
5935
5936 bitmap_clear_bit (components, LR_REGNUM);
5937 bitmap_clear_bit (components, SP_REGNUM);
5938
5939 return components;
5940 }
5941
5942 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5943
5944 static sbitmap
5945 aarch64_components_for_bb (basic_block bb)
5946 {
5947 bitmap in = DF_LIVE_IN (bb);
5948 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5949 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5950
5951 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5952 bitmap_clear (components);
5953
5954 /* Clobbered registers don't generate values in any meaningful sense,
5955 since nothing after the clobber can rely on their value. And we can't
5956 say that partially-clobbered registers are unconditionally killed,
5957 because whether they're killed or not depends on the mode of the
5958 value they're holding. Thus partially call-clobbered registers
5959 appear in neither the kill set nor the gen set.
5960
5961 Check manually for any calls that clobber more of a register than the
5962 current function can. */
5963 function_abi_aggregator callee_abis;
5964 rtx_insn *insn;
5965 FOR_BB_INSNS (bb, insn)
5966 if (CALL_P (insn))
5967 callee_abis.note_callee_abi (insn_callee_abi (insn));
5968 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
5969
5970 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5971 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5972 if (!fixed_regs[regno]
5973 && !crtl->abi->clobbers_full_reg_p (regno)
5974 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
5975 || bitmap_bit_p (in, regno)
5976 || bitmap_bit_p (gen, regno)
5977 || bitmap_bit_p (kill, regno)))
5978 {
5979 unsigned regno2, offset, offset2;
5980 bitmap_set_bit (components, regno);
5981
5982 /* If there is a callee-save at an adjacent offset, add it too
5983 to increase the use of LDP/STP. */
5984 offset = cfun->machine->frame.reg_offset[regno];
5985 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5986
5987 if (regno2 <= LAST_SAVED_REGNUM)
5988 {
5989 offset2 = cfun->machine->frame.reg_offset[regno2];
5990 if ((offset & ~8) == (offset2 & ~8))
5991 bitmap_set_bit (components, regno2);
5992 }
5993 }
5994
5995 return components;
5996 }
5997
5998 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5999 Nothing to do for aarch64. */
6000
6001 static void
6002 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6003 {
6004 }
6005
6006 /* Return the next set bit in BMP from START onwards. Return the total number
6007 of bits in BMP if no set bit is found at or after START. */
6008
6009 static unsigned int
6010 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6011 {
6012 unsigned int nbits = SBITMAP_SIZE (bmp);
6013 if (start == nbits)
6014 return start;
6015
6016 gcc_assert (start < nbits);
6017 for (unsigned int i = start; i < nbits; i++)
6018 if (bitmap_bit_p (bmp, i))
6019 return i;
6020
6021 return nbits;
6022 }
6023
6024 /* Do the work for aarch64_emit_prologue_components and
6025 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6026 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6027 for these components or the epilogue sequence. That is, it determines
6028 whether we should emit stores or loads and what kind of CFA notes to attach
6029 to the insns. Otherwise the logic for the two sequences is very
6030 similar. */
6031
6032 static void
6033 aarch64_process_components (sbitmap components, bool prologue_p)
6034 {
6035 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6036 ? HARD_FRAME_POINTER_REGNUM
6037 : STACK_POINTER_REGNUM);
6038
6039 unsigned last_regno = SBITMAP_SIZE (components);
6040 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6041 rtx_insn *insn = NULL;
6042
6043 while (regno != last_regno)
6044 {
6045 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6046 so DFmode for the vector registers is enough. For simd functions
6047 we want to save the low 128 bits. */
6048 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6049
6050 rtx reg = gen_rtx_REG (mode, regno);
6051 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6052 if (!frame_pointer_needed)
6053 offset += cfun->machine->frame.frame_size
6054 - cfun->machine->frame.hard_fp_offset;
6055 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6056 rtx mem = gen_frame_mem (mode, addr);
6057
6058 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6059 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6060 /* No more registers to handle after REGNO.
6061 Emit a single save/restore and exit. */
6062 if (regno2 == last_regno)
6063 {
6064 insn = emit_insn (set);
6065 RTX_FRAME_RELATED_P (insn) = 1;
6066 if (prologue_p)
6067 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6068 else
6069 add_reg_note (insn, REG_CFA_RESTORE, reg);
6070 break;
6071 }
6072
6073 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6074 /* The next register is not of the same class or its offset is not
6075 mergeable with the current one into a pair. */
6076 if (!satisfies_constraint_Ump (mem)
6077 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6078 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6079 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6080 GET_MODE_SIZE (mode)))
6081 {
6082 insn = emit_insn (set);
6083 RTX_FRAME_RELATED_P (insn) = 1;
6084 if (prologue_p)
6085 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6086 else
6087 add_reg_note (insn, REG_CFA_RESTORE, reg);
6088
6089 regno = regno2;
6090 continue;
6091 }
6092
6093 /* REGNO2 can be saved/restored in a pair with REGNO. */
6094 rtx reg2 = gen_rtx_REG (mode, regno2);
6095 if (!frame_pointer_needed)
6096 offset2 += cfun->machine->frame.frame_size
6097 - cfun->machine->frame.hard_fp_offset;
6098 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6099 rtx mem2 = gen_frame_mem (mode, addr2);
6100 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6101 : gen_rtx_SET (reg2, mem2);
6102
6103 if (prologue_p)
6104 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6105 else
6106 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6107
6108 RTX_FRAME_RELATED_P (insn) = 1;
6109 if (prologue_p)
6110 {
6111 add_reg_note (insn, REG_CFA_OFFSET, set);
6112 add_reg_note (insn, REG_CFA_OFFSET, set2);
6113 }
6114 else
6115 {
6116 add_reg_note (insn, REG_CFA_RESTORE, reg);
6117 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6118 }
6119
6120 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6121 }
6122 }
6123
6124 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6125
6126 static void
6127 aarch64_emit_prologue_components (sbitmap components)
6128 {
6129 aarch64_process_components (components, true);
6130 }
6131
6132 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6133
6134 static void
6135 aarch64_emit_epilogue_components (sbitmap components)
6136 {
6137 aarch64_process_components (components, false);
6138 }
6139
6140 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6141
6142 static void
6143 aarch64_set_handled_components (sbitmap components)
6144 {
6145 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6146 if (bitmap_bit_p (components, regno))
6147 cfun->machine->reg_is_wrapped_separately[regno] = true;
6148 }
6149
6150 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6151 determine the probe offset for alloca. */
6152
6153 static HOST_WIDE_INT
6154 aarch64_stack_clash_protection_alloca_probe_range (void)
6155 {
6156 return STACK_CLASH_CALLER_GUARD;
6157 }
6158
6159
6160 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6161 registers. If POLY_SIZE is not large enough to require a probe, this function
6162 will only adjust the stack. When allocating the stack space
6163 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6164 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6165 arguments. If we are, then we ensure that any allocation larger than the ABI
6166 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6167 maintained.
6168
6169 We emit barriers after each stack adjustment to prevent optimizations from
6170 breaking the invariant that we never drop the stack more than a page. This
6171 invariant is needed to make it easier to correctly handle asynchronous
6172 events: e.g. if we were to allow the stack to be dropped by more than a page
6173 and then emit the probes afterwards, and a signal were taken somewhere in
6174 between, then the signal handler would not know the state of the stack and
6175 could make no assumptions about which pages have been probed. */
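/* As an illustrative example of the constant-size path below (assuming the
   default 64kB guard and the 1kB STACK_CLASH_CALLER_GUARD): a non-final
   allocation of 3 * guard_size + 512 bytes is split into three guard_size
   allocations, each followed by a probe at offset STACK_CLASH_CALLER_GUARD
   (unrolled inline or via the probe loop, depending on
   STACK_CLASH_MAX_UNROLL_PAGES), plus a final 512 byte adjustment that is
   left unprobed because it is below the probing threshold.  */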
6176
6177 static void
6178 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6179 poly_int64 poly_size,
6180 bool frame_related_p,
6181 bool final_adjustment_p)
6182 {
6183 HOST_WIDE_INT guard_size
6184 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6185 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6186 /* When doing the final adjustment for the outgoing argument size we can't
6187 assume that LR was saved at position 0. So subtract its offset from the
6188 ABI safe buffer so that we don't accidentally allow an adjustment that
6189 would result in an allocation larger than the ABI buffer without
6190 probing. */
6191 HOST_WIDE_INT min_probe_threshold
6192 = final_adjustment_p
6193 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6194 : guard_size - guard_used_by_caller;
6195
6196 poly_int64 frame_size = cfun->machine->frame.frame_size;
6197
6198 /* We should always have a positive probe threshold. */
6199 gcc_assert (min_probe_threshold > 0);
6200
6201 if (flag_stack_clash_protection && !final_adjustment_p)
6202 {
6203 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6204 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6205
6206 if (known_eq (frame_size, 0))
6207 {
6208 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6209 }
6210 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6211 && known_lt (final_adjust, guard_used_by_caller))
6212 {
6213 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6214 }
6215 }
6216
6217 /* If SIZE is not large enough to require probing, just adjust the stack and
6218 exit. */
6219 if (known_lt (poly_size, min_probe_threshold)
6220 || !flag_stack_clash_protection)
6221 {
6222 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6223 return;
6224 }
6225
6226 HOST_WIDE_INT size;
6227 /* Handle the SVE non-constant case first. */
6228 if (!poly_size.is_constant (&size))
6229 {
6230 if (dump_file)
6231 {
6232 fprintf (dump_file, "Stack clash SVE prologue: ");
6233 print_dec (poly_size, dump_file);
6234 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6235 }
6236
6237 /* First calculate the amount of bytes we're actually spilling. */
6238 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6239 poly_size, temp1, temp2, false, true);
6240
6241 rtx_insn *insn = get_last_insn ();
6242
6243 if (frame_related_p)
6244 {
6245 /* This is done to provide unwinding information for the stack
6246 adjustments we're about to do, however to prevent the optimizers
6247 from removing the R11 move and leaving the CFA note (which would be
6248 very wrong) we tie the old and new stack pointer together.
6249 The tie will expand to nothing but the optimizers will not touch
6250 the instruction. */
6251 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6252 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6253 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6254
6255 /* We want the CFA independent of the stack pointer for the
6256 duration of the loop. */
6257 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6258 RTX_FRAME_RELATED_P (insn) = 1;
6259 }
6260
6261 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6262 rtx guard_const = gen_int_mode (guard_size, Pmode);
6263
6264 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6265 stack_pointer_rtx, temp1,
6266 probe_const, guard_const));
6267
6268 /* Now reset the CFA register if needed. */
6269 if (frame_related_p)
6270 {
6271 add_reg_note (insn, REG_CFA_DEF_CFA,
6272 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6273 gen_int_mode (poly_size, Pmode)));
6274 RTX_FRAME_RELATED_P (insn) = 1;
6275 }
6276
6277 return;
6278 }
6279
6280 if (dump_file)
6281 fprintf (dump_file,
6282 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6283 " bytes, probing will be required.\n", size);
6284
6285 /* Round size to the nearest multiple of guard_size, and calculate the
6286 residual as the difference between the original size and the rounded
6287 size. */
6288 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6289 HOST_WIDE_INT residual = size - rounded_size;
6290
6291 /* We can handle a small number of allocations/probes inline. Otherwise
6292 punt to a loop. */
6293 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6294 {
6295 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6296 {
6297 aarch64_sub_sp (NULL, temp2, guard_size, true);
6298 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6299 guard_used_by_caller));
6300 emit_insn (gen_blockage ());
6301 }
6302 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6303 }
6304 else
6305 {
6306 /* Compute the ending address. */
6307 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6308 temp1, NULL, false, true);
6309 rtx_insn *insn = get_last_insn ();
6310
6311 /* For the initial allocation, we don't have a frame pointer
6312 set up, so we always need CFI notes. If we're doing the
6313 final allocation, then we may have a frame pointer, in which
6314 case it is the CFA, otherwise we need CFI notes.
6315
6316 We can determine which allocation we are doing by looking at
6317 the value of FRAME_RELATED_P since the final allocations are not
6318 frame related. */
6319 if (frame_related_p)
6320 {
6321 /* We want the CFA independent of the stack pointer for the
6322 duration of the loop. */
6323 add_reg_note (insn, REG_CFA_DEF_CFA,
6324 plus_constant (Pmode, temp1, rounded_size));
6325 RTX_FRAME_RELATED_P (insn) = 1;
6326 }
6327
6328 /* This allocates and probes the stack. Note that this re-uses some of
6329 the existing Ada stack protection code. However we are guaranteed not
6330 to enter the non-loop or residual branches of that code.
6331
6332 The non-loop part won't be entered because if our allocation amount
6333 doesn't require a loop, the case above would handle it.
6334
6335 The residual amount won't be entered because TEMP1 is a multiple of
6336 the allocation size. The residual will always be 0. As such, the only
6337 part we are actually using from that code is the loop setup. The
6338 actual probing is done in aarch64_output_probe_stack_range. */
6339 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6340 stack_pointer_rtx, temp1));
6341
6342 /* Now reset the CFA register if needed. */
6343 if (frame_related_p)
6344 {
6345 add_reg_note (insn, REG_CFA_DEF_CFA,
6346 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6347 RTX_FRAME_RELATED_P (insn) = 1;
6348 }
6349
6350 emit_insn (gen_blockage ());
6351 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6352 }
6353
6354 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6355 be probed. This maintains the requirement that each page is probed at
6356 least once. For initial probing we probe only if the allocation is
6357 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6358 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6359 GUARD_SIZE. This works that for any allocation that is large enough to
6360 trigger a probe here, we'll have at least one, and if they're not large
6361 enough for this code to emit anything for them, The page would have been
6362 probed by the saving of FP/LR either by this function or any callees. If
6363 we don't have any callees then we won't have more stack adjustments and so
6364 are still safe. */
6365 if (residual)
6366 {
6367 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6368 /* If we're doing final adjustments, and we've done any full page
6369 allocations then any residual needs to be probed. */
6370 if (final_adjustment_p && rounded_size != 0)
6371 min_probe_threshold = 0;
6372 /* If doing a small final adjustment, we always probe at offset 0.
6373 This is done to avoid issues when LR is not at position 0 or when
6374 the final adjustment is smaller than the probing offset. */
6375 else if (final_adjustment_p && rounded_size == 0)
6376 residual_probe_offset = 0;
6377
6378 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6379 if (residual >= min_probe_threshold)
6380 {
6381 if (dump_file)
6382 fprintf (dump_file,
6383 "Stack clash AArch64 prologue residuals: "
6384 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6385 "\n", residual);
6386
6387 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6388 residual_probe_offset));
6389 emit_insn (gen_blockage ());
6390 }
6391 }
6392 }
6393
6394 /* Return 1 if the register is used by the epilogue. We need to say the
6395 return register is used, but only after epilogue generation is complete.
6396 Note that in the case of sibcalls, the values "used by the epilogue" are
6397 considered live at the start of the called function.
6398
6399 For SIMD functions we need to return 1 for FP registers that are saved and
6400 restored by a function but are not zero in call_used_regs. If we do not do
6401 this, optimizations may remove the restore of the register. */
6402
6403 int
6404 aarch64_epilogue_uses (int regno)
6405 {
6406 if (epilogue_completed)
6407 {
6408 if (regno == LR_REGNUM)
6409 return 1;
6410 }
6411 return 0;
6412 }
6413
6414 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6415 is saved at BASE + OFFSET. */
6416
6417 static void
6418 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6419 rtx base, poly_int64 offset)
6420 {
6421 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6422 add_reg_note (insn, REG_CFA_EXPRESSION,
6423 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6424 }
6425
6426 /* AArch64 stack frames generated by this compiler look like:
6427
6428 +-------------------------------+
6429 | |
6430 | incoming stack arguments |
6431 | |
6432 +-------------------------------+
6433 | | <-- incoming stack pointer (aligned)
6434 | callee-allocated save area |
6435 | for register varargs |
6436 | |
6437 +-------------------------------+
6438 | local variables | <-- frame_pointer_rtx
6439 | |
6440 +-------------------------------+
6441 | padding | \
6442 +-------------------------------+ |
6443 | callee-saved registers | | frame.saved_regs_size
6444 +-------------------------------+ |
6445 | LR' | |
6446 +-------------------------------+ |
6447 | FP' | / <- hard_frame_pointer_rtx (aligned)
6448 +-------------------------------+
6449 | dynamic allocation |
6450 +-------------------------------+
6451 | padding |
6452 +-------------------------------+
6453 | outgoing stack arguments | <-- arg_pointer
6454 | |
6455 +-------------------------------+
6456 | | <-- stack_pointer_rtx (aligned)
6457
6458 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6459 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6460 unchanged.
6461
6462 By default for stack-clash we assume the guard is at least 64KB, but this
6463 value is configurable to either 4KB or 64KB. We also force the guard size to
6464 be the same as the probing interval and both values are kept in sync.
6465
6466 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6467 on the guard size) of stack space without probing.
6468
6469 When probing is needed, we emit a probe at the start of the prologue
6470 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6471
6472 We have to track how much space has been allocated, and the only stores
6473 to the stack that we track as implicit probes are the FP/LR stores.
6474
6475 For outgoing arguments we probe if the size is larger than 1KB, such that
6476 the ABI specified buffer is maintained for the next callee.
6477
6478 The following registers are reserved during frame layout and should not be
6479 used for any other purpose:
6480
6481 - r11: Used by stack clash protection when SVE is enabled.
6482 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6483 - r14 and r15: Used for speculation tracking.
6484 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6485 - r30(LR), r29(FP): Used by standard frame layout.
6486
6487 These registers must be avoided in frame layout related code unless the
6488 explicit intention is to interact with one of the features listed above. */
6489
6490 /* Generate the prologue instructions for entry into a function.
6491 Establish the stack frame by decreasing the stack pointer with a
6492 properly calculated size and, if necessary, create a frame record
6493 filled with the values of LR and previous frame pointer. The
6494 current FP is also set up if it is in use. */
6495
6496 void
6497 aarch64_expand_prologue (void)
6498 {
6499 poly_int64 frame_size = cfun->machine->frame.frame_size;
6500 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6501 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6502 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6503 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6504 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6505 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6506 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6507 rtx_insn *insn;
6508
6509 /* Sign return address for functions. */
6510 if (aarch64_return_address_signing_enabled ())
6511 {
6512 switch (aarch64_ra_sign_key)
6513 {
6514 case AARCH64_KEY_A:
6515 insn = emit_insn (gen_paciasp ());
6516 break;
6517 case AARCH64_KEY_B:
6518 insn = emit_insn (gen_pacibsp ());
6519 break;
6520 default:
6521 gcc_unreachable ();
6522 }
6523 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6524 RTX_FRAME_RELATED_P (insn) = 1;
6525 }
6526
6527 if (flag_stack_usage_info)
6528 current_function_static_stack_size = constant_lower_bound (frame_size);
6529
6530 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6531 {
6532 if (crtl->is_leaf && !cfun->calls_alloca)
6533 {
6534 if (maybe_gt (frame_size, PROBE_INTERVAL)
6535 && maybe_gt (frame_size, get_stack_check_protect ()))
6536 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6537 (frame_size
6538 - get_stack_check_protect ()));
6539 }
6540 else if (maybe_gt (frame_size, 0))
6541 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6542 }
6543
6544 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6545 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6546
6547 /* In theory we should never have both an initial adjustment
6548 and a callee save adjustment. Verify that is the case since the
6549 code below does not handle it for -fstack-clash-protection. */
6550 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6551
6552 /* Will only probe if the initial adjustment is larger than the guard
6553 less the amount of the guard reserved for use by the caller's
6554 outgoing args. */
6555 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6556 true, false);
6557
6558 if (callee_adjust != 0)
6559 aarch64_push_regs (reg1, reg2, callee_adjust);
6560
6561 if (emit_frame_chain)
6562 {
6563 poly_int64 reg_offset = callee_adjust;
6564 if (callee_adjust == 0)
6565 {
6566 reg1 = R29_REGNUM;
6567 reg2 = R30_REGNUM;
6568 reg_offset = callee_offset;
6569 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6570 }
6571 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6572 stack_pointer_rtx, callee_offset,
6573 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6574 if (frame_pointer_needed && !frame_size.is_constant ())
6575 {
6576 /* Variable-sized frames need to describe the save slot
6577 address using DW_CFA_expression rather than DW_CFA_offset.
6578 This means that, without taking further action, the
6579 locations of the registers that we've already saved would
6580 remain based on the stack pointer even after we redefine
6581 the CFA based on the frame pointer. We therefore need new
6582 DW_CFA_expressions to re-express the save slots with addresses
6583 based on the frame pointer. */
6584 rtx_insn *insn = get_last_insn ();
6585 gcc_assert (RTX_FRAME_RELATED_P (insn));
6586
6587 /* Add an explicit CFA definition if this was previously
6588 implicit. */
6589 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6590 {
6591 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6592 callee_offset);
6593 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6594 gen_rtx_SET (hard_frame_pointer_rtx, src));
6595 }
6596
6597 /* Change the save slot expressions for the registers that
6598 we've already saved. */
6599 reg_offset -= callee_offset;
6600 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6601 reg_offset + UNITS_PER_WORD);
6602 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6603 reg_offset);
6604 }
6605 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6606 }
6607
6608 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6609 callee_adjust != 0 || emit_frame_chain);
6610 if (crtl->abi->id () == ARM_PCS_SIMD)
6611 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6612 callee_adjust != 0 || emit_frame_chain);
6613 else
6614 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6615 callee_adjust != 0 || emit_frame_chain);
6616
6617 /* We may need to probe the final adjustment if it is larger than the guard
6618 that is assumed by the callee. */
6619 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6620 !frame_pointer_needed, true);
6621 }
6622
6623 /* Return TRUE if we can use a simple_return insn.
6624
6625 This function checks whether the callee saved stack is empty, which
6626 means no restore actions are needed. The pro_and_epilogue will use
6627 this to check whether shrink-wrapping opt is feasible. */
6628
6629 bool
6630 aarch64_use_return_insn_p (void)
6631 {
6632 if (!reload_completed)
6633 return false;
6634
6635 if (crtl->profile)
6636 return false;
6637
6638 return known_eq (cfun->machine->frame.frame_size, 0);
6639 }
6640
6641 /* Generate the epilogue instructions for returning from a function.
6642 This is almost exactly the reverse of the prolog sequence, except
6643 that we need to insert barriers to avoid scheduling loads that read
6644 from a deallocated stack, and we optimize the unwind records by
6645 emitting them all together if possible. */
6646 void
6647 aarch64_expand_epilogue (bool for_sibcall)
6648 {
6649 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6650 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6651 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6652 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6653 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6654 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6655 rtx cfi_ops = NULL;
6656 rtx_insn *insn;
6657 /* A stack clash protection prologue may not have left EP0_REGNUM or
6658 EP1_REGNUM in a usable state. The same is true for allocations
6659 with an SVE component, since we then need both temporary registers
6660 for each allocation. For stack clash we are in a usable state if
6661 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6662 HOST_WIDE_INT guard_size
6663 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6664 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6665
6666 /* We can re-use the registers when the allocation amount is smaller than
6667 guard_size - guard_used_by_caller because we won't be doing any probes
6668 then. In such situations the register should remain live with the correct
6669 value. */
6670 bool can_inherit_p = (initial_adjust.is_constant ()
6671 && final_adjust.is_constant ())
6672 && (!flag_stack_clash_protection
6673 || known_lt (initial_adjust,
6674 guard_size - guard_used_by_caller));
6675
6676 /* We need to add memory barrier to prevent read from deallocated stack. */
6677 bool need_barrier_p
6678 = maybe_ne (get_frame_size ()
6679 + cfun->machine->frame.saved_varargs_size, 0);
6680
6681 /* Emit a barrier to prevent loads from a deallocated stack. */
6682 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6683 || cfun->calls_alloca
6684 || crtl->calls_eh_return)
6685 {
6686 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6687 need_barrier_p = false;
6688 }
6689
6690 /* Restore the stack pointer from the frame pointer if it may not
6691 be the same as the stack pointer. */
6692 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6693 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6694 if (frame_pointer_needed
6695 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6696 /* If writeback is used when restoring callee-saves, the CFA
6697 is restored on the instruction doing the writeback. */
6698 aarch64_add_offset (Pmode, stack_pointer_rtx,
6699 hard_frame_pointer_rtx, -callee_offset,
6700 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6701 else
6702 /* The case where we need to re-use the register here is very rare, so
6703 avoid the complicated condition and just always emit a move if the
6704 immediate doesn't fit. */
6705 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6706
6707 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6708 callee_adjust != 0, &cfi_ops);
6709 if (crtl->abi->id () == ARM_PCS_SIMD)
6710 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6711 callee_adjust != 0, &cfi_ops);
6712 else
6713 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6714 callee_adjust != 0, &cfi_ops);
6715
6716 if (need_barrier_p)
6717 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6718
6719 if (callee_adjust != 0)
6720 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6721
6722 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6723 {
6724 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6725 insn = get_last_insn ();
6726 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6727 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6728 RTX_FRAME_RELATED_P (insn) = 1;
6729 cfi_ops = NULL;
6730 }
6731
6732 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6733 restrict the emit_move optimization to leaf functions. */
6734 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6735 (!can_inherit_p || !crtl->is_leaf
6736 || df_regs_ever_live_p (EP0_REGNUM)));
6737
6738 if (cfi_ops)
6739 {
6740 /* Emit delayed restores and reset the CFA to be SP. */
6741 insn = get_last_insn ();
6742 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6743 REG_NOTES (insn) = cfi_ops;
6744 RTX_FRAME_RELATED_P (insn) = 1;
6745 }
6746
6747 /* We prefer to emit the combined return/authenticate instruction RETAA,
6748 however there are three cases in which we must instead emit an explicit
6749 authentication instruction.
6750
6751 1) Sibcalls don't return in a normal way, so if we're about to call one
6752 we must authenticate.
6753
6754 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6755 generating code for !TARGET_ARMV8_3 we can't use it and must
6756 explicitly authenticate.
6757
6758 3) On an eh_return path we make extra stack adjustments to update the
6759 canonical frame address to be the exception handler's CFA. We want
6760 to authenticate using the CFA of the function which calls eh_return.
6761 */
6762 if (aarch64_return_address_signing_enabled ()
6763 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6764 {
6765 switch (aarch64_ra_sign_key)
6766 {
6767 case AARCH64_KEY_A:
6768 insn = emit_insn (gen_autiasp ());
6769 break;
6770 case AARCH64_KEY_B:
6771 insn = emit_insn (gen_autibsp ());
6772 break;
6773 default:
6774 gcc_unreachable ();
6775 }
6776 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6777 RTX_FRAME_RELATED_P (insn) = 1;
6778 }
6779
6780 /* Stack adjustment for exception handler. */
6781 if (crtl->calls_eh_return && !for_sibcall)
6782 {
6783 /* We need to unwind the stack by the offset computed by
6784 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6785 to be SP; letting the CFA move during this adjustment
6786 is just as correct as retaining the CFA from the body
6787 of the function. Therefore, do nothing special. */
6788 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6789 }
6790
6791 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6792 if (!for_sibcall)
6793 emit_jump_insn (ret_rtx);
6794 }
6795
6796 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6797 normally or return to a previous frame after unwinding.
6798
6799 An EH return uses a single shared return sequence. The epilogue is
6800 exactly like a normal epilogue except that it has an extra input
6801 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6802 that must be applied after the frame has been destroyed. An extra label
6803 is inserted before the epilogue which initializes this register to zero,
6804 and this is the entry point for a normal return.
6805
6806 An actual EH return updates the return address, initializes the stack
6807 adjustment and jumps directly into the epilogue (bypassing the zeroing
6808 of the adjustment). Since the return address is typically saved on the
6809 stack when a function makes a call, the saved LR must be updated outside
6810 the epilogue.
6811
6812 This poses problems as the store is generated well before the epilogue,
6813 so the offset of LR is not known yet. Also optimizations will remove the
6814 store as it appears dead, even after the epilogue is generated (as the
6815 base or offset for loading LR is different in many cases).
6816
6817 To avoid these problems, this implementation forces the frame pointer
6818 in eh_return functions so that the location of LR is fixed and known early.
6819 It also marks the store volatile, so no optimization is permitted to
6820 remove the store. */
6821 rtx
6822 aarch64_eh_return_handler_rtx (void)
6823 {
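/* Descriptive note (added): with the frame pointer forced as described
   above, the FP/LR pair is normally saved at the hard frame pointer,
   so the MEM built below is the LR save slot at [x29, #8]. */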
6824 rtx tmp = gen_frame_mem (Pmode,
6825 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6826
6827 /* Mark the store volatile, so no optimization is permitted to remove it. */
6828 MEM_VOLATILE_P (tmp) = true;
6829 return tmp;
6830 }
6831
6832 /* Output code to add DELTA to the first argument, and then jump
6833 to FUNCTION. Used for C++ multiple inheritance. */
6834 static void
6835 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6836 HOST_WIDE_INT delta,
6837 HOST_WIDE_INT vcall_offset,
6838 tree function)
6839 {
6840 /* The this pointer is always in x0. Note that this differs from
6841 Arm where the this pointer may be bumped to r1 if r0 is required
6842 to return a pointer to an aggregate. On AArch64 a result value
6843 pointer will be in x8. */
6844 int this_regno = R0_REGNUM;
6845 rtx this_rtx, temp0, temp1, addr, funexp;
6846 rtx_insn *insn;
6847 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6848
6849 if (aarch64_bti_enabled ())
6850 emit_insn (gen_bti_c());
6851
6852 reload_completed = 1;
6853 emit_note (NOTE_INSN_PROLOGUE_END);
6854
6855 this_rtx = gen_rtx_REG (Pmode, this_regno);
6856 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6857 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6858
6859 if (vcall_offset == 0)
6860 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6861 else
6862 {
6863 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6864
6865 addr = this_rtx;
6866 if (delta != 0)
6867 {
6868 if (delta >= -256 && delta < 256)
6869 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6870 plus_constant (Pmode, this_rtx, delta));
6871 else
6872 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6873 temp1, temp0, false);
6874 }
6875
6876 if (Pmode == ptr_mode)
6877 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6878 else
6879 aarch64_emit_move (temp0,
6880 gen_rtx_ZERO_EXTEND (Pmode,
6881 gen_rtx_MEM (ptr_mode, addr)));
6882
6883 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6884 addr = plus_constant (Pmode, temp0, vcall_offset);
6885 else
6886 {
6887 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6888 Pmode);
6889 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6890 }
6891
6892 if (Pmode == ptr_mode)
6893 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6894 else
6895 aarch64_emit_move (temp1,
6896 gen_rtx_SIGN_EXTEND (Pmode,
6897 gen_rtx_MEM (ptr_mode, addr)));
6898
6899 emit_insn (gen_add2_insn (this_rtx, temp1));
6900 }
6901
6902 /* Generate a tail call to the target function. */
6903 if (!TREE_USED (function))
6904 {
6905 assemble_external (function);
6906 TREE_USED (function) = 1;
6907 }
6908 funexp = XEXP (DECL_RTL (function), 0);
6909 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6910 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
6911 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
6912 SIBLING_CALL_P (insn) = 1;
6913
6914 insn = get_insns ();
6915 shorten_branches (insn);
6916
6917 assemble_start_function (thunk, fnname);
6918 final_start_function (insn, file, 1);
6919 final (insn, file, 1);
6920 final_end_function ();
6921 assemble_end_function (thunk, fnname);
6922
6923 /* Stop pretending to be a post-reload pass. */
6924 reload_completed = 0;
6925 }
6926
6927 static bool
6928 aarch64_tls_referenced_p (rtx x)
6929 {
6930 if (!TARGET_HAVE_TLS)
6931 return false;
6932 subrtx_iterator::array_type array;
6933 FOR_EACH_SUBRTX (iter, array, x, ALL)
6934 {
6935 const_rtx x = *iter;
6936 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6937 return true;
6938 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6939 TLS offsets, not real symbol references. */
6940 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6941 iter.skip_subrtxes ();
6942 }
6943 return false;
6944 }
6945
6946
6947 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6948 a left shift of 0 or 12 bits. */
6949 bool
6950 aarch64_uimm12_shift (HOST_WIDE_INT val)
6951 {
6952 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6953 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6954 );
6955 }
6956
6957 /* Return the largest value no greater than VAL that can be encoded as a
6958 12-bit unsigned immediate with a left shift of 0 or 12. */
6959 static HOST_WIDE_INT
6960 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6961 {
6962 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6963 handle correctly. */
6964 gcc_assert ((val & 0xffffff) == val);
6965
6966 if (((val & 0xfff) << 0) == val)
6967 return val;
6968
6969 return val & (0xfff << 12);
6970 }
6971
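/* Illustrative sketch, not part of GCC: a standalone example of how a
   24-bit constant is split into the two forms accepted by
   aarch64_uimm12_shift above. The name example_split_uimm24 and its
   interface are hypothetical. For VAL = 0x123456 the split is
   0x123000 + 0x456, i.e. "add x0, x0, #0x123, lsl #12" followed by
   "add x0, x0, #0x456". */
static unsigned long long
example_split_uimm24 (unsigned long long val, unsigned long long *low)
{
  /* VAL is assumed to fit in 24 bits. */
  unsigned long long high = val & (0xfffULL << 12);
  *low = val - high;   /* Fits in the unshifted 12-bit form. */
  return high;         /* Fits in the 12-bit form shifted left by 12. */
}
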
6972 /* Return true if val is an immediate that can be loaded into a
6973 register by a MOVZ instruction. */
6974 static bool
6975 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6976 {
6977 if (GET_MODE_SIZE (mode) > 4)
6978 {
6979 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6980 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6981 return 1;
6982 }
6983 else
6984 {
6985 /* Ignore sign extension. */
6986 val &= (HOST_WIDE_INT) 0xffffffff;
6987 }
6988 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6989 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6990 }
6991
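/* Illustrative sketch, not part of GCC: a standalone restatement of the
   DImode behaviour of aarch64_movw_imm above. MOVZ can materialize any
   value whose set bits fall inside a single aligned 16-bit chunk, e.g.
   0x12340000 is "movz w0, #0x1234, lsl #16". The name
   example_is_movz_imm64 is hypothetical. */
static int
example_is_movz_imm64 (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffULL << shift)) == 0)
      return 1;
  return 0;
}
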
6992 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6993 64-bit (DImode) integer. */
6994
6995 static unsigned HOST_WIDE_INT
6996 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6997 {
6998 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6999 while (size < 64)
7000 {
7001 val &= (HOST_WIDE_INT_1U << size) - 1;
7002 val |= val << size;
7003 size *= 2;
7004 }
7005 return val;
7006 }
7007
7008 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7009
7010 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7011 {
7012 0x0000000100000001ull,
7013 0x0001000100010001ull,
7014 0x0101010101010101ull,
7015 0x1111111111111111ull,
7016 0x5555555555555555ull,
7017 };
7018
7019
7020 /* Return true if val is a valid bitmask immediate. */
7021
7022 bool
7023 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7024 {
7025 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7026 int bits;
7027
7028 /* Check for a single sequence of one bits and return quickly if so.
7029 The special cases of all ones and all zeroes return false. */
7030 val = aarch64_replicate_bitmask_imm (val_in, mode);
7031 tmp = val + (val & -val);
7032
7033 if (tmp == (tmp & -tmp))
7034 return (val + 1) > 1;
7035
7036 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7037 if (mode == SImode)
7038 val = (val << 32) | (val & 0xffffffff);
7039
7040 /* Invert if the immediate doesn't start with a zero bit - this means we
7041 only need to search for sequences of one bits. */
7042 if (val & 1)
7043 val = ~val;
7044
7045 /* Find the first set bit and set tmp to val with the first sequence of one
7046 bits removed. Return success if there is a single sequence of ones. */
7047 first_one = val & -val;
7048 tmp = val & (val + first_one);
7049
7050 if (tmp == 0)
7051 return true;
7052
7053 /* Find the next set bit and compute the difference in bit position. */
7054 next_one = tmp & -tmp;
7055 bits = clz_hwi (first_one) - clz_hwi (next_one);
7056 mask = val ^ tmp;
7057
7058 /* Check that the bit position difference is a power of 2, and that the first
7059 sequence of one bits fits within 'bits' bits. */
7060 if ((mask >> bits) != 0 || bits != (bits & -bits))
7061 return false;
7062
7063 /* Check that the sequence of one bits is repeated 64/bits times. */
7064 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7065 }
7066
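/* Illustrative sketch, not part of GCC: a brute-force reference
   definition of the bitmask immediates recognized above, assuming the
   architectural encoding (a run of ones rotated within an element of
   2/4/8/16/32/64 bits and replicated to fill 64 bits; all-zeros and
   all-ones are excluded). It expects the already-replicated 64-bit
   value and is only meant for cross-checking the faster test above on
   a host; the name example_is_bitmask_imm64 is hypothetical. */
static int
example_is_bitmask_imm64 (unsigned long long val)
{
  for (int esize = 2; esize <= 64; esize *= 2)
    for (int ones = 1; ones < esize; ones++)
      for (int rot = 0; rot < esize; rot++)
        {
          unsigned long long emask
            = esize == 64 ? ~0ULL : (1ULL << esize) - 1;
          /* A run of ONES low bits, rotated right by ROT within the
             element. */
          unsigned long long elt = (1ULL << ones) - 1;
          if (rot != 0)
            elt = ((elt >> rot) | (elt << (esize - rot))) & emask;
          /* Replicate the element to fill 64 bits. */
          unsigned long long rep = 0;
          for (int i = 0; i < 64; i += esize)
            rep |= elt << i;
          if (rep == val)
            return 1;
        }
  return 0;
}
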
7067 /* Create a mask of ones covering the range from the lowest set bit to the
7068 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
7069
7070 unsigned HOST_WIDE_INT
7071 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7072 {
7073 int lowest_bit_set = ctz_hwi (val_in);
7074 int highest_bit_set = floor_log2 (val_in);
7075 gcc_assert (val_in != 0);
7076
7077 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7078 (HOST_WIDE_INT_1U << lowest_bit_set));
7079 }
7080
7081 /* Create a constant in which all bits outside the range from the lowest
7082 set bit to the highest set bit of VAL_IN are set to 1. */
7083
7084 unsigned HOST_WIDE_INT
7085 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7086 {
7087 return val_in | ~aarch64_and_split_imm1 (val_in);
7088 }
7089
7090 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7091
7092 bool
7093 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7094 {
7095 scalar_int_mode int_mode;
7096 if (!is_a <scalar_int_mode> (mode, &int_mode))
7097 return false;
7098
7099 if (aarch64_bitmask_imm (val_in, int_mode))
7100 return false;
7101
7102 if (aarch64_move_imm (val_in, int_mode))
7103 return false;
7104
7105 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7106
7107 return aarch64_bitmask_imm (imm2, int_mode);
7108 }
7109
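/* Illustrative sketch, not part of GCC: a worked example of the split
   enabled by the three helpers above. VAL = 0x00ff00ff is neither a
   bitmask immediate nor a MOV immediate, but aarch64_and_split_imm1
   gives 0x00ffffff (ones over the set span), aarch64_and_split_imm2
   gives 0xffffffffffff00ff (bits outside the span forced to one), both
   valid bitmask immediates, and VAL == (imm1 & imm2). The name
   example_and_split is hypothetical. */
static unsigned long long
example_and_split (unsigned long long x)
{
  /* Equivalent to x & 0x00ff00ff, but doable as two AND-immediate
     instructions. */
  return (x & 0x00ffffffULL) & 0xffffffffffff00ffULL;
}
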
7110 /* Return true if val is an immediate that can be loaded into a
7111 register in a single instruction. */
7112 bool
7113 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7114 {
7115 scalar_int_mode int_mode;
7116 if (!is_a <scalar_int_mode> (mode, &int_mode))
7117 return false;
7118
7119 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7120 return 1;
7121 return aarch64_bitmask_imm (val, int_mode);
7122 }
7123
7124 static bool
7125 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7126 {
7127 rtx base, offset;
7128
7129 if (GET_CODE (x) == HIGH)
7130 return true;
7131
7132 /* There's no way to calculate VL-based values using relocations. */
7133 subrtx_iterator::array_type array;
7134 FOR_EACH_SUBRTX (iter, array, x, ALL)
7135 if (GET_CODE (*iter) == CONST_POLY_INT)
7136 return true;
7137
7138 split_const (x, &base, &offset);
7139 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7140 {
7141 if (aarch64_classify_symbol (base, INTVAL (offset))
7142 != SYMBOL_FORCE_TO_MEM)
7143 return true;
7144 else
7145 /* Avoid generating a 64-bit relocation in ILP32; leave
7146 to aarch64_expand_mov_immediate to handle it properly. */
7147 return mode != ptr_mode;
7148 }
7149
7150 return aarch64_tls_referenced_p (x);
7151 }
7152
7153 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7154 The expansion for a table switch is quite expensive due to the number
7155 of instructions, the table lookup and the hard-to-predict indirect jump.
7156 When optimizing for speed at -O3, use the per-core tuning if it is set;
7157 otherwise use tables for more than 16 cases as a tradeoff between size
7158 and performance. When optimizing for size, use the default setting. */
7159
7160 static unsigned int
7161 aarch64_case_values_threshold (void)
7162 {
7163 /* Use the specified limit for the number of cases before using jump
7164 tables at higher optimization levels. */
7165 if (optimize > 2
7166 && selected_cpu->tune->max_case_values != 0)
7167 return selected_cpu->tune->max_case_values;
7168 else
7169 return optimize_size ? default_case_values_threshold () : 17;
7170 }
7171
7172 /* Return true if register REGNO is a valid index register.
7173 STRICT_P is true if REG_OK_STRICT is in effect. */
7174
7175 bool
7176 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7177 {
7178 if (!HARD_REGISTER_NUM_P (regno))
7179 {
7180 if (!strict_p)
7181 return true;
7182
7183 if (!reg_renumber)
7184 return false;
7185
7186 regno = reg_renumber[regno];
7187 }
7188 return GP_REGNUM_P (regno);
7189 }
7190
7191 /* Return true if register REGNO is a valid base register.
7192 STRICT_P is true if REG_OK_STRICT is in effect. */
7193
7194 bool
7195 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7196 {
7197 if (!HARD_REGISTER_NUM_P (regno))
7198 {
7199 if (!strict_p)
7200 return true;
7201
7202 if (!reg_renumber)
7203 return false;
7204
7205 regno = reg_renumber[regno];
7206 }
7207
7208 /* The fake registers will be eliminated to either the stack or
7209 hard frame pointer, both of which are usually valid base registers.
7210 Reload deals with the cases where the eliminated form isn't valid. */
7211 return (GP_REGNUM_P (regno)
7212 || regno == SP_REGNUM
7213 || regno == FRAME_POINTER_REGNUM
7214 || regno == ARG_POINTER_REGNUM);
7215 }
7216
7217 /* Return true if X is a valid base register.
7218 STRICT_P is true if REG_OK_STRICT is in effect. */
7219
7220 static bool
7221 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7222 {
7223 if (!strict_p
7224 && GET_CODE (x) == SUBREG
7225 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7226 x = SUBREG_REG (x);
7227
7228 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7229 }
7230
7231 /* Return true if address offset is a valid index. If it is, fill in INFO
7232 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7233
7234 static bool
7235 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7236 machine_mode mode, bool strict_p)
7237 {
7238 enum aarch64_address_type type;
7239 rtx index;
7240 int shift;
7241
7242 /* (reg:P) */
7243 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7244 && GET_MODE (x) == Pmode)
7245 {
7246 type = ADDRESS_REG_REG;
7247 index = x;
7248 shift = 0;
7249 }
7250 /* (sign_extend:DI (reg:SI)) */
7251 else if ((GET_CODE (x) == SIGN_EXTEND
7252 || GET_CODE (x) == ZERO_EXTEND)
7253 && GET_MODE (x) == DImode
7254 && GET_MODE (XEXP (x, 0)) == SImode)
7255 {
7256 type = (GET_CODE (x) == SIGN_EXTEND)
7257 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7258 index = XEXP (x, 0);
7259 shift = 0;
7260 }
7261 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7262 else if (GET_CODE (x) == MULT
7263 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7264 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7265 && GET_MODE (XEXP (x, 0)) == DImode
7266 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7267 && CONST_INT_P (XEXP (x, 1)))
7268 {
7269 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7270 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7271 index = XEXP (XEXP (x, 0), 0);
7272 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7273 }
7274 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7275 else if (GET_CODE (x) == ASHIFT
7276 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7277 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7278 && GET_MODE (XEXP (x, 0)) == DImode
7279 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7280 && CONST_INT_P (XEXP (x, 1)))
7281 {
7282 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7283 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7284 index = XEXP (XEXP (x, 0), 0);
7285 shift = INTVAL (XEXP (x, 1));
7286 }
7287 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7288 else if ((GET_CODE (x) == SIGN_EXTRACT
7289 || GET_CODE (x) == ZERO_EXTRACT)
7290 && GET_MODE (x) == DImode
7291 && GET_CODE (XEXP (x, 0)) == MULT
7292 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7293 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7294 {
7295 type = (GET_CODE (x) == SIGN_EXTRACT)
7296 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7297 index = XEXP (XEXP (x, 0), 0);
7298 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7299 if (INTVAL (XEXP (x, 1)) != 32 + shift
7300 || INTVAL (XEXP (x, 2)) != 0)
7301 shift = -1;
7302 }
7303 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7304 (const_int 0xffffffff<<shift)) */
7305 else if (GET_CODE (x) == AND
7306 && GET_MODE (x) == DImode
7307 && GET_CODE (XEXP (x, 0)) == MULT
7308 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7309 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7310 && CONST_INT_P (XEXP (x, 1)))
7311 {
7312 type = ADDRESS_REG_UXTW;
7313 index = XEXP (XEXP (x, 0), 0);
7314 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7315 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7316 shift = -1;
7317 }
7318 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7319 else if ((GET_CODE (x) == SIGN_EXTRACT
7320 || GET_CODE (x) == ZERO_EXTRACT)
7321 && GET_MODE (x) == DImode
7322 && GET_CODE (XEXP (x, 0)) == ASHIFT
7323 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7324 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7325 {
7326 type = (GET_CODE (x) == SIGN_EXTRACT)
7327 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7328 index = XEXP (XEXP (x, 0), 0);
7329 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7330 if (INTVAL (XEXP (x, 1)) != 32 + shift
7331 || INTVAL (XEXP (x, 2)) != 0)
7332 shift = -1;
7333 }
7334 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7335 (const_int 0xffffffff<<shift)) */
7336 else if (GET_CODE (x) == AND
7337 && GET_MODE (x) == DImode
7338 && GET_CODE (XEXP (x, 0)) == ASHIFT
7339 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7340 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7341 && CONST_INT_P (XEXP (x, 1)))
7342 {
7343 type = ADDRESS_REG_UXTW;
7344 index = XEXP (XEXP (x, 0), 0);
7345 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7346 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7347 shift = -1;
7348 }
7349 /* (mult:P (reg:P) (const_int scale)) */
7350 else if (GET_CODE (x) == MULT
7351 && GET_MODE (x) == Pmode
7352 && GET_MODE (XEXP (x, 0)) == Pmode
7353 && CONST_INT_P (XEXP (x, 1)))
7354 {
7355 type = ADDRESS_REG_REG;
7356 index = XEXP (x, 0);
7357 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7358 }
7359 /* (ashift:P (reg:P) (const_int shift)) */
7360 else if (GET_CODE (x) == ASHIFT
7361 && GET_MODE (x) == Pmode
7362 && GET_MODE (XEXP (x, 0)) == Pmode
7363 && CONST_INT_P (XEXP (x, 1)))
7364 {
7365 type = ADDRESS_REG_REG;
7366 index = XEXP (x, 0);
7367 shift = INTVAL (XEXP (x, 1));
7368 }
7369 else
7370 return false;
7371
7372 if (!strict_p
7373 && GET_CODE (index) == SUBREG
7374 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7375 index = SUBREG_REG (index);
7376
7377 if (aarch64_sve_data_mode_p (mode))
7378 {
7379 if (type != ADDRESS_REG_REG
7380 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7381 return false;
7382 }
7383 else
7384 {
7385 if (shift != 0
7386 && !(IN_RANGE (shift, 1, 3)
7387 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7388 return false;
7389 }
7390
7391 if (REG_P (index)
7392 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7393 {
7394 info->type = type;
7395 info->offset = index;
7396 info->shift = shift;
7397 return true;
7398 }
7399
7400 return false;
7401 }
7402
7403 /* Return true if MODE is one of the modes for which we
7404 support LDP/STP operations. */
7405
7406 static bool
7407 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7408 {
7409 return mode == SImode || mode == DImode
7410 || mode == SFmode || mode == DFmode
7411 || (aarch64_vector_mode_supported_p (mode)
7412 && (known_eq (GET_MODE_SIZE (mode), 8)
7413 || (known_eq (GET_MODE_SIZE (mode), 16)
7414 && (aarch64_tune_params.extra_tuning_flags
7415 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7416 }
7417
7418 /* Return true if REGNO is a virtual pointer register, or an eliminable
7419 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7420 include stack_pointer or hard_frame_pointer. */
7421 static bool
7422 virt_or_elim_regno_p (unsigned regno)
7423 {
7424 return ((regno >= FIRST_VIRTUAL_REGISTER
7425 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7426 || regno == FRAME_POINTER_REGNUM
7427 || regno == ARG_POINTER_REGNUM);
7428 }
7429
7430 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7431 If it is, fill in INFO appropriately. STRICT_P is true if
7432 REG_OK_STRICT is in effect. */
7433
7434 bool
7435 aarch64_classify_address (struct aarch64_address_info *info,
7436 rtx x, machine_mode mode, bool strict_p,
7437 aarch64_addr_query_type type)
7438 {
7439 enum rtx_code code = GET_CODE (x);
7440 rtx op0, op1;
7441 poly_int64 offset;
7442
7443 HOST_WIDE_INT const_size;
7444
7445 /* On BE, we use load/store pair for all large int mode load/stores.
7446 TI/TFmode may also use a load/store pair. */
7447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7448 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7449 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7450 || type == ADDR_QUERY_LDP_STP_N
7451 || mode == TImode
7452 || mode == TFmode
7453 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7454
7455 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
7456 size of the memory being loaded/stored, while the mode used for the
7457 address calculation is half of that. */
7458 if (type == ADDR_QUERY_LDP_STP_N
7459 && known_eq (GET_MODE_SIZE (mode), 16))
7460 mode = DFmode;
7461
7462 bool allow_reg_index_p = (!load_store_pair_p
7463 && (known_lt (GET_MODE_SIZE (mode), 16)
7464 || vec_flags == VEC_ADVSIMD
7465 || vec_flags & VEC_SVE_DATA));
7466
7467 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7468 [Rn, #offset, MUL VL]. */
7469 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7470 && (code != REG && code != PLUS))
7471 return false;
7472
7473 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7474 REG addressing. */
7475 if (advsimd_struct_p
7476 && !BYTES_BIG_ENDIAN
7477 && (code != POST_INC && code != REG))
7478 return false;
7479
7480 gcc_checking_assert (GET_MODE (x) == VOIDmode
7481 || SCALAR_INT_MODE_P (GET_MODE (x)));
7482
7483 switch (code)
7484 {
7485 case REG:
7486 case SUBREG:
7487 info->type = ADDRESS_REG_IMM;
7488 info->base = x;
7489 info->offset = const0_rtx;
7490 info->const_offset = 0;
7491 return aarch64_base_register_rtx_p (x, strict_p);
7492
7493 case PLUS:
7494 op0 = XEXP (x, 0);
7495 op1 = XEXP (x, 1);
7496
7497 if (! strict_p
7498 && REG_P (op0)
7499 && virt_or_elim_regno_p (REGNO (op0))
7500 && poly_int_rtx_p (op1, &offset))
7501 {
7502 info->type = ADDRESS_REG_IMM;
7503 info->base = op0;
7504 info->offset = op1;
7505 info->const_offset = offset;
7506
7507 return true;
7508 }
7509
7510 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7511 && aarch64_base_register_rtx_p (op0, strict_p)
7512 && poly_int_rtx_p (op1, &offset))
7513 {
7514 info->type = ADDRESS_REG_IMM;
7515 info->base = op0;
7516 info->offset = op1;
7517 info->const_offset = offset;
7518
7519 /* TImode and TFmode values are allowed in both pairs of X
7520 registers and individual Q registers. The available
7521 address modes are:
7522 X,X: 7-bit signed scaled offset
7523 Q: 9-bit signed offset
7524 We conservatively require an offset representable in either mode.
7525 When performing the check for pairs of X registers i.e. LDP/STP
7526 pass down DImode since that is the natural size of the LDP/STP
7527 instruction memory accesses. */
7528 if (mode == TImode || mode == TFmode)
7529 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7530 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7531 || offset_12bit_unsigned_scaled_p (mode, offset)));
7532
7533 /* A 7-bit offset check because OImode will emit an ldp/stp
7534 instruction (only big endian will get here).
7535 For ldp/stp instructions, the offset is scaled for the size of a
7536 single element of the pair. */
7537 if (mode == OImode)
7538 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7539
7540 /* Three 9/12-bit offset checks because CImode will emit three
7541 ldr/str instructions (only big endian will get here). */
7542 if (mode == CImode)
7543 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7544 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7545 offset + 32)
7546 || offset_12bit_unsigned_scaled_p (V16QImode,
7547 offset + 32)));
7548
7549 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7550 instructions (only big endian will get here). */
7551 if (mode == XImode)
7552 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7553 && aarch64_offset_7bit_signed_scaled_p (TImode,
7554 offset + 32));
7555
7556 /* Make "m" use the LD1 offset range for SVE data modes, so
7557 that pre-RTL optimizers like ivopts will work to that
7558 instead of the wider LDR/STR range. */
7559 if (vec_flags == VEC_SVE_DATA)
7560 return (type == ADDR_QUERY_M
7561 ? offset_4bit_signed_scaled_p (mode, offset)
7562 : offset_9bit_signed_scaled_p (mode, offset));
7563
7564 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7565 {
7566 poly_int64 end_offset = (offset
7567 + GET_MODE_SIZE (mode)
7568 - BYTES_PER_SVE_VECTOR);
7569 return (type == ADDR_QUERY_M
7570 ? offset_4bit_signed_scaled_p (mode, offset)
7571 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7572 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7573 end_offset)));
7574 }
7575
7576 if (vec_flags == VEC_SVE_PRED)
7577 return offset_9bit_signed_scaled_p (mode, offset);
7578
7579 if (load_store_pair_p)
7580 return ((known_eq (GET_MODE_SIZE (mode), 4)
7581 || known_eq (GET_MODE_SIZE (mode), 8)
7582 || known_eq (GET_MODE_SIZE (mode), 16))
7583 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7584 else
7585 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7586 || offset_12bit_unsigned_scaled_p (mode, offset));
7587 }
7588
7589 if (allow_reg_index_p)
7590 {
7591 /* Look for base + (scaled/extended) index register. */
7592 if (aarch64_base_register_rtx_p (op0, strict_p)
7593 && aarch64_classify_index (info, op1, mode, strict_p))
7594 {
7595 info->base = op0;
7596 return true;
7597 }
7598 if (aarch64_base_register_rtx_p (op1, strict_p)
7599 && aarch64_classify_index (info, op0, mode, strict_p))
7600 {
7601 info->base = op1;
7602 return true;
7603 }
7604 }
7605
7606 return false;
7607
7608 case POST_INC:
7609 case POST_DEC:
7610 case PRE_INC:
7611 case PRE_DEC:
7612 info->type = ADDRESS_REG_WB;
7613 info->base = XEXP (x, 0);
7614 info->offset = NULL_RTX;
7615 return aarch64_base_register_rtx_p (info->base, strict_p);
7616
7617 case POST_MODIFY:
7618 case PRE_MODIFY:
7619 info->type = ADDRESS_REG_WB;
7620 info->base = XEXP (x, 0);
7621 if (GET_CODE (XEXP (x, 1)) == PLUS
7622 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7623 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7624 && aarch64_base_register_rtx_p (info->base, strict_p))
7625 {
7626 info->offset = XEXP (XEXP (x, 1), 1);
7627 info->const_offset = offset;
7628
7629 /* TImode and TFmode values are allowed in both pairs of X
7630 registers and individual Q registers. The available
7631 address modes are:
7632 X,X: 7-bit signed scaled offset
7633 Q: 9-bit signed offset
7634 We conservatively require an offset representable in either mode.
7635 */
7636 if (mode == TImode || mode == TFmode)
7637 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7638 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7639
7640 if (load_store_pair_p)
7641 return ((known_eq (GET_MODE_SIZE (mode), 4)
7642 || known_eq (GET_MODE_SIZE (mode), 8)
7643 || known_eq (GET_MODE_SIZE (mode), 16))
7644 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7645 else
7646 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7647 }
7648 return false;
7649
7650 case CONST:
7651 case SYMBOL_REF:
7652 case LABEL_REF:
7653 /* Load literal: PC-relative constant pool entry. Only supported
7654 for SImode or larger. */
7655 info->type = ADDRESS_SYMBOLIC;
7656
7657 if (!load_store_pair_p
7658 && GET_MODE_SIZE (mode).is_constant (&const_size)
7659 && const_size >= 4)
7660 {
7661 rtx sym, addend;
7662
7663 split_const (x, &sym, &addend);
7664 return ((GET_CODE (sym) == LABEL_REF
7665 || (GET_CODE (sym) == SYMBOL_REF
7666 && CONSTANT_POOL_ADDRESS_P (sym)
7667 && aarch64_pcrelative_literal_loads)));
7668 }
7669 return false;
7670
7671 case LO_SUM:
7672 info->type = ADDRESS_LO_SUM;
7673 info->base = XEXP (x, 0);
7674 info->offset = XEXP (x, 1);
7675 if (allow_reg_index_p
7676 && aarch64_base_register_rtx_p (info->base, strict_p))
7677 {
7678 rtx sym, offs;
7679 split_const (info->offset, &sym, &offs);
7680 if (GET_CODE (sym) == SYMBOL_REF
7681 && (aarch64_classify_symbol (sym, INTVAL (offs))
7682 == SYMBOL_SMALL_ABSOLUTE))
7683 {
7684 /* The symbol and offset must be aligned to the access size. */
7685 unsigned int align;
7686
7687 if (CONSTANT_POOL_ADDRESS_P (sym))
7688 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7689 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7690 {
7691 tree exp = SYMBOL_REF_DECL (sym);
7692 align = TYPE_ALIGN (TREE_TYPE (exp));
7693 align = aarch64_constant_alignment (exp, align);
7694 }
7695 else if (SYMBOL_REF_DECL (sym))
7696 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7697 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7698 && SYMBOL_REF_BLOCK (sym) != NULL)
7699 align = SYMBOL_REF_BLOCK (sym)->alignment;
7700 else
7701 align = BITS_PER_UNIT;
7702
7703 poly_int64 ref_size = GET_MODE_SIZE (mode);
7704 if (known_eq (ref_size, 0))
7705 ref_size = GET_MODE_SIZE (DImode);
7706
7707 return (multiple_p (INTVAL (offs), ref_size)
7708 && multiple_p (align / BITS_PER_UNIT, ref_size));
7709 }
7710 }
7711 return false;
7712
7713 default:
7714 return false;
7715 }
7716 }
7717
7718 /* Return true if the address X is valid for a PRFM instruction.
7719 STRICT_P is true if we should do strict checking with
7720 aarch64_classify_address. */
7721
7722 bool
7723 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7724 {
7725 struct aarch64_address_info addr;
7726
7727 /* PRFM accepts the same addresses as DImode... */
7728 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7729 if (!res)
7730 return false;
7731
7732 /* ... except writeback forms. */
7733 return addr.type != ADDRESS_REG_WB;
7734 }
7735
7736 bool
7737 aarch64_symbolic_address_p (rtx x)
7738 {
7739 rtx offset;
7740
7741 split_const (x, &x, &offset);
7742 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7743 }
7744
7745 /* Classify the base of symbolic expression X. */
7746
7747 enum aarch64_symbol_type
7748 aarch64_classify_symbolic_expression (rtx x)
7749 {
7750 rtx offset;
7751
7752 split_const (x, &x, &offset);
7753 return aarch64_classify_symbol (x, INTVAL (offset));
7754 }
7755
7756
7757 /* Return TRUE if X is a legitimate address for accessing memory in
7758 mode MODE. */
7759 static bool
7760 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7761 {
7762 struct aarch64_address_info addr;
7763
7764 return aarch64_classify_address (&addr, x, mode, strict_p);
7765 }
7766
7767 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7768 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7769 bool
7770 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7771 aarch64_addr_query_type type)
7772 {
7773 struct aarch64_address_info addr;
7774
7775 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7776 }
7777
7778 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7779
7780 static bool
7781 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7782 poly_int64 orig_offset,
7783 machine_mode mode)
7784 {
7785 HOST_WIDE_INT size;
7786 if (GET_MODE_SIZE (mode).is_constant (&size))
7787 {
7788 HOST_WIDE_INT const_offset, second_offset;
7789
7790 /* A general SVE offset is A * VQ + B. Remove the A component from
7791 coefficient 0 in order to get the constant B. */
7792 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7793
7794 /* Split an out-of-range address displacement into a base and
7795 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7796 range otherwise to increase opportunities for sharing the base
7797 address between accesses of different sizes. Unaligned accesses use
7798 the signed 9-bit range; TImode/TFmode use the intersection of the
7799 signed scaled 7-bit and signed 9-bit offset ranges. */
7800 if (mode == TImode || mode == TFmode)
7801 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7802 else if ((const_offset & (size - 1)) != 0)
7803 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7804 else
7805 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7806
7807 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7808 return false;
7809
7810 /* Split the offset into second_offset and the rest. */
7811 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7812 *offset2 = gen_int_mode (second_offset, Pmode);
7813 return true;
7814 }
7815 else
7816 {
7817 /* Get the mode we should use as the basis of the range. For structure
7818 modes this is the mode of one vector. */
7819 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7820 machine_mode step_mode
7821 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7822
7823 /* Get the "mul vl" multiplier we'd like to use. */
7824 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7825 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7826 if (vec_flags & VEC_SVE_DATA)
7827 /* LDR supports a 9-bit range, but the move patterns for
7828 structure modes require all vectors to be in range of the
7829 same base. The simplest way of accomodating that while still
7830 promoting reuse of anchor points between different modes is
7831 to use an 8-bit range unconditionally. */
7832 vnum = ((vnum + 128) & 255) - 128;
7833 else
7834 /* Predicates are only handled singly, so we might as well use
7835 the full range. */
7836 vnum = ((vnum + 256) & 511) - 256;
7837 if (vnum == 0)
7838 return false;
7839
7840 /* Convert the "mul vl" multiplier into a byte offset. */
7841 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7842 if (known_eq (second_offset, orig_offset))
7843 return false;
7844
7845 /* Split the offset into second_offset and the rest. */
7846 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7847 *offset2 = gen_int_mode (second_offset, Pmode);
7848 return true;
7849 }
7850 }
7851
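/* Illustrative sketch, not part of GCC: the constant-size path above,
   specialized to an aligned 4-byte access with a non-negative offset.
   For ORIG_OFFSET = 0x12344 it produces SECOND_OFFSET = 0x2344 (within
   the scaled 16KB range) and an anchor of 0x10000 that is added to the
   base address separately. The name example_split_word_offset is
   hypothetical. */
static void
example_split_word_offset (long long orig_offset,
                           long long *anchor, long long *second)
{
  *second = orig_offset & 0x3ffc;   /* In-range, word-aligned part. */
  *anchor = orig_offset - *second;  /* Folded into the base address. */
}
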
7852 /* Return the binary representation of floating point constant VALUE in INTVAL.
7853 If the value cannot be converted, return false without setting INTVAL.
7854 The conversion is done in the mode of VALUE. */
7855 bool
7856 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7857 {
7858
7859 /* We make a general exception for 0. */
7860 if (aarch64_float_const_zero_rtx_p (value))
7861 {
7862 *intval = 0;
7863 return true;
7864 }
7865
7866 scalar_float_mode mode;
7867 if (GET_CODE (value) != CONST_DOUBLE
7868 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7869 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7870 /* Only support up to DF mode. */
7871 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7872 return false;
7873
7874 unsigned HOST_WIDE_INT ival = 0;
7875
7876 long res[2];
7877 real_to_target (res,
7878 CONST_DOUBLE_REAL_VALUE (value),
7879 REAL_MODE_FORMAT (mode));
7880
7881 if (mode == DFmode)
7882 {
7883 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7884 ival = zext_hwi (res[order], 32);
7885 ival |= (zext_hwi (res[1 - order], 32) << 32);
7886 }
7887 else
7888 ival = zext_hwi (res[0], 32);
7889
7890 *intval = ival;
7891 return true;
7892 }
7893
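/* Illustrative sketch, not part of GCC: the target-independent dance
   above amounts to viewing the constant's bits as an integer. On a
   host with 64-bit IEEE doubles the same value can be obtained with a
   simple type pun, e.g. 1.0 maps to 0x3ff0000000000000. The name
   example_double_bits is hypothetical. */
static unsigned long long
example_double_bits (double d)
{
  union { double d; unsigned long long u; } pun;
  pun.d = d;
  return pun.u;
}
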
7894 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7895 single MOV(+MOVK) followed by an FMOV. */
7896 bool
7897 aarch64_float_const_rtx_p (rtx x)
7898 {
7899 machine_mode mode = GET_MODE (x);
7900 if (mode == VOIDmode)
7901 return false;
7902
7903 /* Determine whether it's cheaper to write float constants as
7904 mov/movk pairs rather than ldr/adrp pairs. */
7905 unsigned HOST_WIDE_INT ival;
7906
7907 if (GET_CODE (x) == CONST_DOUBLE
7908 && SCALAR_FLOAT_MODE_P (mode)
7909 && aarch64_reinterpret_float_as_int (x, &ival))
7910 {
7911 scalar_int_mode imode = (mode == HFmode
7912 ? SImode
7913 : int_mode_for_mode (mode).require ());
7914 int num_instr = aarch64_internal_mov_immediate
7915 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7916 return num_instr < 3;
7917 }
7918
7919 return false;
7920 }
7921
7922 /* Return TRUE if rtx X is the immediate constant 0.0. */
7923 bool
7924 aarch64_float_const_zero_rtx_p (rtx x)
7925 {
7926 if (GET_MODE (x) == VOIDmode)
7927 return false;
7928
7929 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7930 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7931 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7932 }
7933
7934 /* Return TRUE if rtx X is an immediate constant that fits in a single
7935 MOVI immediate operation. */
7936 bool
7937 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7938 {
7939 if (!TARGET_SIMD)
7940 return false;
7941
7942 machine_mode vmode;
7943 scalar_int_mode imode;
7944 unsigned HOST_WIDE_INT ival;
7945
7946 if (GET_CODE (x) == CONST_DOUBLE
7947 && SCALAR_FLOAT_MODE_P (mode))
7948 {
7949 if (!aarch64_reinterpret_float_as_int (x, &ival))
7950 return false;
7951
7952 /* We make a general exception for 0. */
7953 if (aarch64_float_const_zero_rtx_p (x))
7954 return true;
7955
7956 imode = int_mode_for_mode (mode).require ();
7957 }
7958 else if (GET_CODE (x) == CONST_INT
7959 && is_a <scalar_int_mode> (mode, &imode))
7960 ival = INTVAL (x);
7961 else
7962 return false;
7963
7964 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7965 a 128-bit vector mode. */
7966 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7967
7968 vmode = aarch64_simd_container_mode (imode, width);
7969 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7970
7971 return aarch64_simd_valid_immediate (v_op, NULL);
7972 }
7973
7974
7975 /* Return the fixed registers used for condition codes. */
7976
7977 static bool
7978 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7979 {
7980 *p1 = CC_REGNUM;
7981 *p2 = INVALID_REGNUM;
7982 return true;
7983 }
7984
7985 /* This function is used by the call expanders of the machine description.
7986 RESULT is the register in which the result is returned. It's NULL for
7987 "call" and "sibcall".
7988 MEM is the location of the function call.
7989 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
7990 SIBCALL indicates whether this function call is a normal call or a sibling
7991 call, and a different pattern is generated accordingly. */
7992
7993 void
7994 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
7995 {
7996 rtx call, callee, tmp;
7997 rtvec vec;
7998 machine_mode mode;
7999
8000 gcc_assert (MEM_P (mem));
8001 callee = XEXP (mem, 0);
8002 mode = GET_MODE (callee);
8003 gcc_assert (mode == Pmode);
8004
8005 /* Decide if we should generate indirect calls by loading the
8006 address of the callee into a register before performing
8007 the branch-and-link. */
8008 if (SYMBOL_REF_P (callee)
8009 ? (aarch64_is_long_call_p (callee)
8010 || aarch64_is_noplt_call_p (callee))
8011 : !REG_P (callee))
8012 XEXP (mem, 0) = force_reg (mode, callee);
8013
8014 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8015
8016 if (result != NULL_RTX)
8017 call = gen_rtx_SET (result, call);
8018
8019 if (sibcall)
8020 tmp = ret_rtx;
8021 else
8022 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8023
8024 gcc_assert (CONST_INT_P (callee_abi));
8025 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8026 UNSPEC_CALLEE_ABI);
8027
8028 vec = gen_rtvec (3, call, callee_abi, tmp);
8029 call = gen_rtx_PARALLEL (VOIDmode, vec);
8030
8031 aarch64_emit_call_insn (call);
8032 }
8033
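/* Illustrative note (added): for a plain call the PARALLEL built above
   has the shape

     (parallel [(call (mem:DI ...) (const_int 0))
                (unspec:DI [(const_int <abi>)] UNSPEC_CALLEE_ABI)
                (clobber (reg:DI LR_REGNUM))])

   with the CALL wrapped in a SET of RESULT when a value is returned,
   and the clobber of LR replaced by (return) for sibling calls. */
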
8034 /* Emit call insn with PAT and do aarch64-specific handling. */
8035
8036 void
8037 aarch64_emit_call_insn (rtx pat)
8038 {
8039 rtx insn = emit_call_insn (pat);
8040
8041 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8042 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8043 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8044 }
8045
8046 machine_mode
8047 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8048 {
8049 machine_mode mode_x = GET_MODE (x);
8050 rtx_code code_x = GET_CODE (x);
8051
8052 /* Floating point compares return CCFPEmode for comparisons that must raise
8053 an exception for a quiet NaN (LT, LE, GT, GE, LTGT), and CCFPmode otherwise. */
8054 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8055 {
8056 switch (code)
8057 {
8058 case EQ:
8059 case NE:
8060 case UNORDERED:
8061 case ORDERED:
8062 case UNLT:
8063 case UNLE:
8064 case UNGT:
8065 case UNGE:
8066 case UNEQ:
8067 return CCFPmode;
8068
8069 case LT:
8070 case LE:
8071 case GT:
8072 case GE:
8073 case LTGT:
8074 return CCFPEmode;
8075
8076 default:
8077 gcc_unreachable ();
8078 }
8079 }
8080
8081 /* Equality comparisons of short modes against zero can be performed
8082 using the TST instruction with the appropriate bitmask. */
8083 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8084 && (code == EQ || code == NE)
8085 && (mode_x == HImode || mode_x == QImode))
8086 return CC_NZmode;
8087
8088 /* Similarly, comparisons of zero_extends from shorter modes can
8089 be performed using an ANDS with an immediate mask. */
8090 if (y == const0_rtx && code_x == ZERO_EXTEND
8091 && (mode_x == SImode || mode_x == DImode)
8092 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8093 && (code == EQ || code == NE))
8094 return CC_NZmode;
8095
8096 if ((mode_x == SImode || mode_x == DImode)
8097 && y == const0_rtx
8098 && (code == EQ || code == NE || code == LT || code == GE)
8099 && (code_x == PLUS || code_x == MINUS || code_x == AND
8100 || code_x == NEG
8101 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8102 && CONST_INT_P (XEXP (x, 2)))))
8103 return CC_NZmode;
8104
8105 /* A compare with a shifted operand. Because of canonicalization,
8106 the comparison will have to be swapped when we emit the assembly
8107 code. */
8108 if ((mode_x == SImode || mode_x == DImode)
8109 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8110 && (code_x == ASHIFT || code_x == ASHIFTRT
8111 || code_x == LSHIFTRT
8112 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8113 return CC_SWPmode;
8114
8115 /* Similarly for a negated operand, but we can only do this for
8116 equalities. */
8117 if ((mode_x == SImode || mode_x == DImode)
8118 && (REG_P (y) || GET_CODE (y) == SUBREG)
8119 && (code == EQ || code == NE)
8120 && code_x == NEG)
8121 return CC_Zmode;
8122
8123 /* A test for unsigned overflow from an addition. */
8124 if ((mode_x == DImode || mode_x == TImode)
8125 && (code == LTU || code == GEU)
8126 && code_x == PLUS
8127 && rtx_equal_p (XEXP (x, 0), y))
8128 return CC_Cmode;
8129
8130 /* A test for unsigned overflow from an add with carry. */
8131 if ((mode_x == DImode || mode_x == TImode)
8132 && (code == LTU || code == GEU)
8133 && code_x == PLUS
8134 && CONST_SCALAR_INT_P (y)
8135 && (rtx_mode_t (y, mode_x)
8136 == (wi::shwi (1, mode_x)
8137 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8138 return CC_ADCmode;
8139
8140 /* A test for signed overflow. */
8141 if ((mode_x == DImode || mode_x == TImode)
8142 && code == NE
8143 && code_x == PLUS
8144 && GET_CODE (y) == SIGN_EXTEND)
8145 return CC_Vmode;
8146
8147 /* For everything else, return CCmode. */
8148 return CCmode;
8149 }
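
/* Illustrative note (added): for example, comparing (plus:DI x y)
   against zero with EQ/NE/LT/GE selects CC_NZmode so that an ADDS can
   set the flags directly, while comparing a shifted operand such as
   (ashift:DI x (const_int 2)) against a register selects CC_SWPmode
   because the operands have to be swapped when the assembly is
   emitted. */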
8150
8151 static int
8152 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8153
8154 int
8155 aarch64_get_condition_code (rtx x)
8156 {
8157 machine_mode mode = GET_MODE (XEXP (x, 0));
8158 enum rtx_code comp_code = GET_CODE (x);
8159
8160 if (GET_MODE_CLASS (mode) != MODE_CC)
8161 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8162 return aarch64_get_condition_code_1 (mode, comp_code);
8163 }
8164
8165 static int
8166 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8167 {
8168 switch (mode)
8169 {
8170 case E_CCFPmode:
8171 case E_CCFPEmode:
8172 switch (comp_code)
8173 {
8174 case GE: return AARCH64_GE;
8175 case GT: return AARCH64_GT;
8176 case LE: return AARCH64_LS;
8177 case LT: return AARCH64_MI;
8178 case NE: return AARCH64_NE;
8179 case EQ: return AARCH64_EQ;
8180 case ORDERED: return AARCH64_VC;
8181 case UNORDERED: return AARCH64_VS;
8182 case UNLT: return AARCH64_LT;
8183 case UNLE: return AARCH64_LE;
8184 case UNGT: return AARCH64_HI;
8185 case UNGE: return AARCH64_PL;
8186 default: return -1;
8187 }
8188 break;
8189
8190 case E_CCmode:
8191 switch (comp_code)
8192 {
8193 case NE: return AARCH64_NE;
8194 case EQ: return AARCH64_EQ;
8195 case GE: return AARCH64_GE;
8196 case GT: return AARCH64_GT;
8197 case LE: return AARCH64_LE;
8198 case LT: return AARCH64_LT;
8199 case GEU: return AARCH64_CS;
8200 case GTU: return AARCH64_HI;
8201 case LEU: return AARCH64_LS;
8202 case LTU: return AARCH64_CC;
8203 default: return -1;
8204 }
8205 break;
8206
8207 case E_CC_SWPmode:
8208 switch (comp_code)
8209 {
8210 case NE: return AARCH64_NE;
8211 case EQ: return AARCH64_EQ;
8212 case GE: return AARCH64_LE;
8213 case GT: return AARCH64_LT;
8214 case LE: return AARCH64_GE;
8215 case LT: return AARCH64_GT;
8216 case GEU: return AARCH64_LS;
8217 case GTU: return AARCH64_CC;
8218 case LEU: return AARCH64_CS;
8219 case LTU: return AARCH64_HI;
8220 default: return -1;
8221 }
8222 break;
8223
8224 case E_CC_NZCmode:
8225 switch (comp_code)
8226 {
8227 case NE: return AARCH64_NE; /* = any */
8228 case EQ: return AARCH64_EQ; /* = none */
8229 case GE: return AARCH64_PL; /* = nfrst */
8230 case LT: return AARCH64_MI; /* = first */
8231 case GEU: return AARCH64_CS; /* = nlast */
8232 case GTU: return AARCH64_HI; /* = pmore */
8233 case LEU: return AARCH64_LS; /* = plast */
8234 case LTU: return AARCH64_CC; /* = last */
8235 default: return -1;
8236 }
8237 break;
8238
8239 case E_CC_NZmode:
8240 switch (comp_code)
8241 {
8242 case NE: return AARCH64_NE;
8243 case EQ: return AARCH64_EQ;
8244 case GE: return AARCH64_PL;
8245 case LT: return AARCH64_MI;
8246 default: return -1;
8247 }
8248 break;
8249
8250 case E_CC_Zmode:
8251 switch (comp_code)
8252 {
8253 case NE: return AARCH64_NE;
8254 case EQ: return AARCH64_EQ;
8255 default: return -1;
8256 }
8257 break;
8258
8259 case E_CC_Cmode:
8260 switch (comp_code)
8261 {
8262 case LTU: return AARCH64_CS;
8263 case GEU: return AARCH64_CC;
8264 default: return -1;
8265 }
8266 break;
8267
8268 case E_CC_ADCmode:
8269 switch (comp_code)
8270 {
8271 case GEU: return AARCH64_CS;
8272 case LTU: return AARCH64_CC;
8273 default: return -1;
8274 }
8275 break;
8276
8277 case E_CC_Vmode:
8278 switch (comp_code)
8279 {
8280 case NE: return AARCH64_VS;
8281 case EQ: return AARCH64_VC;
8282 default: return -1;
8283 }
8284 break;
8285
8286 default:
8287 return -1;
8288 }
8289
8290 return -1;
8291 }
8292
8293 bool
8294 aarch64_const_vec_all_same_in_range_p (rtx x,
8295 HOST_WIDE_INT minval,
8296 HOST_WIDE_INT maxval)
8297 {
8298 rtx elt;
8299 return (const_vec_duplicate_p (x, &elt)
8300 && CONST_INT_P (elt)
8301 && IN_RANGE (INTVAL (elt), minval, maxval));
8302 }
8303
8304 bool
8305 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8306 {
8307 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8308 }
8309
8310 /* Return true if VEC is a constant in which every element is in the range
8311 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8312
8313 static bool
8314 aarch64_const_vec_all_in_range_p (rtx vec,
8315 HOST_WIDE_INT minval,
8316 HOST_WIDE_INT maxval)
8317 {
8318 if (GET_CODE (vec) != CONST_VECTOR
8319 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8320 return false;
8321
8322 int nunits;
8323 if (!CONST_VECTOR_STEPPED_P (vec))
8324 nunits = const_vector_encoded_nelts (vec);
8325 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8326 return false;
8327
8328 for (int i = 0; i < nunits; i++)
8329 {
8330 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8331 if (!CONST_INT_P (vec_elem)
8332 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8333 return false;
8334 }
8335 return true;
8336 }
8337
8338 /* N Z C V. */
8339 #define AARCH64_CC_V 1
8340 #define AARCH64_CC_C (1 << 1)
8341 #define AARCH64_CC_Z (1 << 2)
8342 #define AARCH64_CC_N (1 << 3)
8343
8344 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8345 static const int aarch64_nzcv_codes[] =
8346 {
8347 0, /* EQ, Z == 1. */
8348 AARCH64_CC_Z, /* NE, Z == 0. */
8349 0, /* CS, C == 1. */
8350 AARCH64_CC_C, /* CC, C == 0. */
8351 0, /* MI, N == 1. */
8352 AARCH64_CC_N, /* PL, N == 0. */
8353 0, /* VS, V == 1. */
8354 AARCH64_CC_V, /* VC, V == 0. */
8355 0, /* HI, C == 1 && Z == 0. */
8356 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8357 AARCH64_CC_V, /* GE, N == V. */
8358 0, /* LT, N != V. */
8359 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8360 0, /* LE, !(Z == 0 && N == V). */
8361 0, /* AL, Any. */
8362 0 /* NV, Any. */
8363 };
8364
8365 /* Print floating-point vector immediate operand X to F, negating it
8366 first if NEGATE is true. Return true on success, false if it isn't
8367 a constant we can handle. */
8368
8369 static bool
8370 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8371 {
8372 rtx elt;
8373
8374 if (!const_vec_duplicate_p (x, &elt))
8375 return false;
8376
8377 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8378 if (negate)
8379 r = real_value_negate (&r);
8380
8381 /* Handle the SVE single-bit immediates specially, since they have a
8382 fixed form in the assembly syntax. */
8383 if (real_equal (&r, &dconst0))
8384 asm_fprintf (f, "0.0");
8385 else if (real_equal (&r, &dconst2))
8386 asm_fprintf (f, "2.0");
8387 else if (real_equal (&r, &dconst1))
8388 asm_fprintf (f, "1.0");
8389 else if (real_equal (&r, &dconsthalf))
8390 asm_fprintf (f, "0.5");
8391 else
8392 {
8393 const int buf_size = 20;
8394 char float_buf[buf_size] = {'\0'};
8395 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8396 1, GET_MODE (elt));
8397 asm_fprintf (f, "%s", float_buf);
8398 }
8399
8400 return true;
8401 }
8402
8403 /* Return the equivalent letter for size. */
8404 static char
8405 sizetochar (int size)
8406 {
8407 switch (size)
8408 {
8409 case 64: return 'd';
8410 case 32: return 's';
8411 case 16: return 'h';
8412 case 8 : return 'b';
8413 default: gcc_unreachable ();
8414 }
8415 }
8416
8417 /* Print operand X to file F in a target specific manner according to CODE.
8418 The acceptable formatting commands given by CODE are:
8419 'c': An integer or symbol address without a preceding #
8420 sign.
8421 'C': Take the duplicated element in a vector constant
8422 and print it in hex.
8423 'D': Take the duplicated element in a vector constant
8424 and print it as an unsigned integer, in decimal.
8425 'e': Print the sign/zero-extend size as a character 8->b,
8426 16->h, 32->w. Can also be used for masks:
8427 0xff->b, 0xffff->h, 0xffffffff->w.
8428 'I': If the operand is a duplicated vector constant,
8429 replace it with the duplicated scalar. If the
8430 operand is then a floating-point constant, replace
8431 it with the integer bit representation. Print the
8432 transformed constant as a signed decimal number.
8433 'p': Prints N such that 2^N == X (X must be a power of 2 and
8434 a const_int).
8435 'P': Print the number of non-zero bits in X (a const_int).
8436 'H': Print the higher numbered register of a pair (TImode)
8437 of regs.
8438 'm': Print a condition (eq, ne, etc).
8439 'M': Same as 'm', but invert condition.
8440 'N': Take the duplicated element in a vector constant
8441 and print the negative of it in decimal.
8442 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8443 'S/T/U/V': Print a FP/SIMD register name for a register list.
8444 The register printed is the FP/SIMD register name
8445 of X + 0/1/2/3 for S/T/U/V.
8446 'R': Print a scalar Integer/FP/SIMD register name + 1.
8447 'X': Print bottom 16 bits of integer constant in hex.
8448 'w/x': Print a general register name or the zero register
8449 (32-bit or 64-bit).
8450 '0': Print a normal operand; if it's a general register,
8451 then we assume DImode.
8452 'k': Print NZCV for conditional compare instructions.
8453 'A': Output address constant representing the first
8454 argument of X, specifying a relocation offset
8455 if appropriate.
8456 'L': Output constant address specified by X
8457 with a relocation offset if appropriate.
8458 'G': Prints address of X, specifying a PC relative
8459 relocation mode if appropriate.
8460 'y': Output address of LDP or STP - this is used for
8461 some LDP/STPs which don't use a PARALLEL in their
8462 pattern (so the mode needs to be adjusted).
8463 'z': Output address of a typical LDP or STP. */
8464
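/* Illustrative note (added): as examples of the codes above, "%w0"
   prints "w5" for (reg:SI x5) and "wzr" for (const_int 0), "%x0"
   prints "x5" for the same register, and "%s0" prints "s13" for
   (reg:SF v13). */
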
8465 static void
8466 aarch64_print_operand (FILE *f, rtx x, int code)
8467 {
8468 rtx elt;
8469 switch (code)
8470 {
8471 case 'c':
8472 switch (GET_CODE (x))
8473 {
8474 case CONST_INT:
8475 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8476 break;
8477
8478 case SYMBOL_REF:
8479 output_addr_const (f, x);
8480 break;
8481
8482 case CONST:
8483 if (GET_CODE (XEXP (x, 0)) == PLUS
8484 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8485 {
8486 output_addr_const (f, x);
8487 break;
8488 }
8489 /* Fall through. */
8490
8491 default:
8492 output_operand_lossage ("unsupported operand for code '%c'", code);
8493 }
8494 break;
8495
8496 case 'e':
8497 {
8498 x = unwrap_const_vec_duplicate (x);
8499 if (!CONST_INT_P (x))
8500 {
8501 output_operand_lossage ("invalid operand for '%%%c'", code);
8502 return;
8503 }
8504
8505 HOST_WIDE_INT val = INTVAL (x);
8506 if ((val & ~7) == 8 || val == 0xff)
8507 fputc ('b', f);
8508 else if ((val & ~7) == 16 || val == 0xffff)
8509 fputc ('h', f);
8510 else if ((val & ~7) == 32 || val == 0xffffffff)
8511 fputc ('w', f);
8512 else
8513 {
8514 output_operand_lossage ("invalid operand for '%%%c'", code);
8515 return;
8516 }
8517 }
8518 break;
8519
8520 case 'p':
8521 {
8522 int n;
8523
8524 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8525 {
8526 output_operand_lossage ("invalid operand for '%%%c'", code);
8527 return;
8528 }
8529
8530 asm_fprintf (f, "%d", n);
8531 }
8532 break;
8533
8534 case 'P':
8535 if (!CONST_INT_P (x))
8536 {
8537 output_operand_lossage ("invalid operand for '%%%c'", code);
8538 return;
8539 }
8540
8541 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8542 break;
8543
8544 case 'H':
8545 if (x == const0_rtx)
8546 {
8547 asm_fprintf (f, "xzr");
8548 break;
8549 }
8550
8551 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8552 {
8553 output_operand_lossage ("invalid operand for '%%%c'", code);
8554 return;
8555 }
8556
8557 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8558 break;
8559
8560 case 'I':
8561 {
8562 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8563 if (CONST_INT_P (x))
8564 asm_fprintf (f, "%wd", INTVAL (x));
8565 else
8566 {
8567 output_operand_lossage ("invalid operand for '%%%c'", code);
8568 return;
8569 }
8570 break;
8571 }
8572
8573 case 'M':
8574 case 'm':
8575 {
8576 int cond_code;
8577 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8578 if (x == const_true_rtx)
8579 {
8580 if (code == 'M')
8581 fputs ("nv", f);
8582 return;
8583 }
8584
8585 if (!COMPARISON_P (x))
8586 {
8587 output_operand_lossage ("invalid operand for '%%%c'", code);
8588 return;
8589 }
8590
8591 cond_code = aarch64_get_condition_code (x);
8592 gcc_assert (cond_code >= 0);
8593 if (code == 'M')
8594 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8595 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8596 fputs (aarch64_sve_condition_codes[cond_code], f);
8597 else
8598 fputs (aarch64_condition_codes[cond_code], f);
8599 }
8600 break;
8601
8602 case 'N':
8603 if (!const_vec_duplicate_p (x, &elt))
8604 {
8605 output_operand_lossage ("invalid vector constant");
8606 return;
8607 }
8608
8609 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8610 asm_fprintf (f, "%wd", -INTVAL (elt));
8611 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8612 && aarch64_print_vector_float_operand (f, x, true))
8613 ;
8614 else
8615 {
8616 output_operand_lossage ("invalid vector constant");
8617 return;
8618 }
8619 break;
8620
8621 case 'b':
8622 case 'h':
8623 case 's':
8624 case 'd':
8625 case 'q':
8626 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8627 {
8628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8629 return;
8630 }
8631 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8632 break;
8633
8634 case 'S':
8635 case 'T':
8636 case 'U':
8637 case 'V':
8638 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8639 {
8640 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8641 return;
8642 }
8643 asm_fprintf (f, "%c%d",
8644 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8645 REGNO (x) - V0_REGNUM + (code - 'S'));
8646 break;
8647
8648 case 'R':
8649 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
8650 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8651 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8652 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
8653 else
8654 output_operand_lossage ("incompatible register operand for '%%%c'",
8655 code);
8656 break;
8657
8658 case 'X':
8659 if (!CONST_INT_P (x))
8660 {
8661 output_operand_lossage ("invalid operand for '%%%c'", code);
8662 return;
8663 }
8664 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8665 break;
8666
8667 case 'C':
8668 {
8669 /* Print a replicated constant in hex. */
8670 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8671 {
8672 output_operand_lossage ("invalid operand for '%%%c'", code);
8673 return;
8674 }
8675 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8676 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8677 }
8678 break;
8679
8680 case 'D':
8681 {
8682 /* Print a replicated constant in decimal, treating it as
8683 unsigned. */
8684 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8685 {
8686 output_operand_lossage ("invalid operand for '%%%c'", code);
8687 return;
8688 }
8689 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8690 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8691 }
8692 break;
8693
8694 case 'w':
8695 case 'x':
8696 if (x == const0_rtx
8697 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8698 {
8699 asm_fprintf (f, "%czr", code);
8700 break;
8701 }
8702
8703 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8704 {
8705 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8706 break;
8707 }
8708
8709 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8710 {
8711 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8712 break;
8713 }
8714
8715 /* Fall through */
8716
8717 case 0:
8718 if (x == NULL)
8719 {
8720 output_operand_lossage ("missing operand");
8721 return;
8722 }
8723
8724 switch (GET_CODE (x))
8725 {
8726 case REG:
8727 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8728 {
8729 if (REG_NREGS (x) == 1)
8730 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8731 else
8732 {
8733 char suffix
8734 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8735 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8736 REGNO (x) - V0_REGNUM, suffix,
8737 END_REGNO (x) - V0_REGNUM - 1, suffix);
8738 }
8739 }
8740 else
8741 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8742 break;
8743
8744 case MEM:
8745 output_address (GET_MODE (x), XEXP (x, 0));
8746 break;
8747
8748 case LABEL_REF:
8749 case SYMBOL_REF:
8750 output_addr_const (asm_out_file, x);
8751 break;
8752
8753 case CONST_INT:
8754 asm_fprintf (f, "%wd", INTVAL (x));
8755 break;
8756
8757 case CONST:
8758 if (!VECTOR_MODE_P (GET_MODE (x)))
8759 {
8760 output_addr_const (asm_out_file, x);
8761 break;
8762 }
8763 /* fall through */
8764
8765 case CONST_VECTOR:
8766 if (!const_vec_duplicate_p (x, &elt))
8767 {
8768 output_operand_lossage ("invalid vector constant");
8769 return;
8770 }
8771
8772 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8773 asm_fprintf (f, "%wd", INTVAL (elt));
8774 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8775 && aarch64_print_vector_float_operand (f, x, false))
8776 ;
8777 else
8778 {
8779 output_operand_lossage ("invalid vector constant");
8780 return;
8781 }
8782 break;
8783
8784 case CONST_DOUBLE:
8785 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8786 be getting CONST_DOUBLEs holding integers. */
8787 gcc_assert (GET_MODE (x) != VOIDmode);
8788 if (aarch64_float_const_zero_rtx_p (x))
8789 {
8790 fputc ('0', f);
8791 break;
8792 }
8793 else if (aarch64_float_const_representable_p (x))
8794 {
8795 #define buf_size 20
8796 char float_buf[buf_size] = {'\0'};
8797 real_to_decimal_for_mode (float_buf,
8798 CONST_DOUBLE_REAL_VALUE (x),
8799 buf_size, buf_size,
8800 1, GET_MODE (x));
8801 asm_fprintf (asm_out_file, "%s", float_buf);
8802 break;
8803 #undef buf_size
8804 }
8805 output_operand_lossage ("invalid constant");
8806 return;
8807 default:
8808 output_operand_lossage ("invalid operand");
8809 return;
8810 }
8811 break;
8812
8813 case 'A':
8814 if (GET_CODE (x) == HIGH)
8815 x = XEXP (x, 0);
8816
8817 switch (aarch64_classify_symbolic_expression (x))
8818 {
8819 case SYMBOL_SMALL_GOT_4G:
8820 asm_fprintf (asm_out_file, ":got:");
8821 break;
8822
8823 case SYMBOL_SMALL_TLSGD:
8824 asm_fprintf (asm_out_file, ":tlsgd:");
8825 break;
8826
8827 case SYMBOL_SMALL_TLSDESC:
8828 asm_fprintf (asm_out_file, ":tlsdesc:");
8829 break;
8830
8831 case SYMBOL_SMALL_TLSIE:
8832 asm_fprintf (asm_out_file, ":gottprel:");
8833 break;
8834
8835 case SYMBOL_TLSLE24:
8836 asm_fprintf (asm_out_file, ":tprel:");
8837 break;
8838
8839 case SYMBOL_TINY_GOT:
8840 gcc_unreachable ();
8841 break;
8842
8843 default:
8844 break;
8845 }
8846 output_addr_const (asm_out_file, x);
8847 break;
8848
8849 case 'L':
8850 switch (aarch64_classify_symbolic_expression (x))
8851 {
8852 case SYMBOL_SMALL_GOT_4G:
8853 asm_fprintf (asm_out_file, ":lo12:");
8854 break;
8855
8856 case SYMBOL_SMALL_TLSGD:
8857 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8858 break;
8859
8860 case SYMBOL_SMALL_TLSDESC:
8861 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8862 break;
8863
8864 case SYMBOL_SMALL_TLSIE:
8865 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8866 break;
8867
8868 case SYMBOL_TLSLE12:
8869 asm_fprintf (asm_out_file, ":tprel_lo12:");
8870 break;
8871
8872 case SYMBOL_TLSLE24:
8873 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8874 break;
8875
8876 case SYMBOL_TINY_GOT:
8877 asm_fprintf (asm_out_file, ":got:");
8878 break;
8879
8880 case SYMBOL_TINY_TLSIE:
8881 asm_fprintf (asm_out_file, ":gottprel:");
8882 break;
8883
8884 default:
8885 break;
8886 }
8887 output_addr_const (asm_out_file, x);
8888 break;
8889
8890 case 'G':
8891 switch (aarch64_classify_symbolic_expression (x))
8892 {
8893 case SYMBOL_TLSLE24:
8894 asm_fprintf (asm_out_file, ":tprel_hi12:");
8895 break;
8896 default:
8897 break;
8898 }
8899 output_addr_const (asm_out_file, x);
8900 break;
8901
8902 case 'k':
8903 {
8904 HOST_WIDE_INT cond_code;
8905
8906 if (!CONST_INT_P (x))
8907 {
8908 output_operand_lossage ("invalid operand for '%%%c'", code);
8909 return;
8910 }
8911
8912 cond_code = INTVAL (x);
8913 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8914 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8915 }
8916 break;
8917
8918 case 'y':
8919 case 'z':
8920 {
8921 machine_mode mode = GET_MODE (x);
8922
8923 if (GET_CODE (x) != MEM
8924 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8925 {
8926 output_operand_lossage ("invalid operand for '%%%c'", code);
8927 return;
8928 }
8929
8930 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8931 code == 'y'
8932 ? ADDR_QUERY_LDP_STP_N
8933 : ADDR_QUERY_LDP_STP))
8934 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8935 }
8936 break;
8937
8938 default:
8939 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8940 return;
8941 }
8942 }
8943
8944 /* Print address 'x' of a memory access with mode 'mode'.
8945 'type' is the aarch64_addr_query_type context passed to
8946 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP. */
8947 static bool
8948 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8949 aarch64_addr_query_type type)
8950 {
8951 struct aarch64_address_info addr;
8952 unsigned int size;
8953
8954 /* Check all addresses are Pmode - including ILP32. */
8955 if (GET_MODE (x) != Pmode
8956 && (!CONST_INT_P (x)
8957 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8958 {
8959 output_operand_lossage ("invalid address mode");
8960 return false;
8961 }
8962
8963 if (aarch64_classify_address (&addr, x, mode, true, type))
8964 switch (addr.type)
8965 {
8966 case ADDRESS_REG_IMM:
8967 if (known_eq (addr.const_offset, 0))
8968 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8969 else if (aarch64_sve_data_mode_p (mode))
8970 {
8971 HOST_WIDE_INT vnum
8972 = exact_div (addr.const_offset,
8973 BYTES_PER_SVE_VECTOR).to_constant ();
8974 asm_fprintf (f, "[%s, #%wd, mul vl]",
8975 reg_names[REGNO (addr.base)], vnum);
8976 }
8977 else if (aarch64_sve_pred_mode_p (mode))
8978 {
8979 HOST_WIDE_INT vnum
8980 = exact_div (addr.const_offset,
8981 BYTES_PER_SVE_PRED).to_constant ();
8982 asm_fprintf (f, "[%s, #%wd, mul vl]",
8983 reg_names[REGNO (addr.base)], vnum);
8984 }
8985 else
8986 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8987 INTVAL (addr.offset));
8988 return true;
8989
8990 case ADDRESS_REG_REG:
8991 if (addr.shift == 0)
8992 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8993 reg_names [REGNO (addr.offset)]);
8994 else
8995 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8996 reg_names [REGNO (addr.offset)], addr.shift);
8997 return true;
8998
8999 case ADDRESS_REG_UXTW:
9000 if (addr.shift == 0)
9001 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9002 REGNO (addr.offset) - R0_REGNUM);
9003 else
9004 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9005 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9006 return true;
9007
9008 case ADDRESS_REG_SXTW:
9009 if (addr.shift == 0)
9010 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9011 REGNO (addr.offset) - R0_REGNUM);
9012 else
9013 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9014 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9015 return true;
9016
9017 case ADDRESS_REG_WB:
9018 /* Writeback is only supported for fixed-width modes. */
9019 size = GET_MODE_SIZE (mode).to_constant ();
9020 switch (GET_CODE (x))
9021 {
9022 case PRE_INC:
9023 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9024 return true;
9025 case POST_INC:
9026 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9027 return true;
9028 case PRE_DEC:
9029 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9030 return true;
9031 case POST_DEC:
9032 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9033 return true;
9034 case PRE_MODIFY:
9035 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9036 INTVAL (addr.offset));
9037 return true;
9038 case POST_MODIFY:
9039 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9040 INTVAL (addr.offset));
9041 return true;
9042 default:
9043 break;
9044 }
9045 break;
9046
9047 case ADDRESS_LO_SUM:
9048 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9049 output_addr_const (f, addr.offset);
9050 asm_fprintf (f, "]");
9051 return true;
9052
9053 case ADDRESS_SYMBOLIC:
9054 output_addr_const (f, x);
9055 return true;
9056 }
9057
9058 return false;
9059 }
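/* Some examples of the address syntax produced above (a sketch only;
   the register numbers are arbitrary):

     ADDRESS_REG_IMM, offset 0        ->  [x0]
     ADDRESS_REG_IMM, SVE data mode   ->  [x0, #2, mul vl]
     ADDRESS_REG_REG, shift 3         ->  [x0, x1, lsl 3]
     ADDRESS_REG_SXTW, shift 2        ->  [x0, w1, sxtw 2]
     ADDRESS_REG_WB, POST_INC         ->  [x0], 16
     ADDRESS_LO_SUM                   ->  [x0, #:lo12:sym]  */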
9060
9061 /* Print address 'x' of a memory access with mode 'mode'. */
9062 static void
9063 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9064 {
9065 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9066 output_addr_const (f, x);
9067 }
9068
9069 bool
9070 aarch64_label_mentioned_p (rtx x)
9071 {
9072 const char *fmt;
9073 int i;
9074
9075 if (GET_CODE (x) == LABEL_REF)
9076 return true;
9077
9078 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9079 referencing instruction, but they are constant offsets, not
9080 symbols. */
9081 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9082 return false;
9083
9084 fmt = GET_RTX_FORMAT (GET_CODE (x));
9085 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9086 {
9087 if (fmt[i] == 'E')
9088 {
9089 int j;
9090
9091 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9092 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9093 return 1;
9094 }
9095 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9096 return 1;
9097 }
9098
9099 return 0;
9100 }
9101
9102 /* Implement REGNO_REG_CLASS. */
9103
9104 enum reg_class
9105 aarch64_regno_regclass (unsigned regno)
9106 {
9107 if (GP_REGNUM_P (regno))
9108 return GENERAL_REGS;
9109
9110 if (regno == SP_REGNUM)
9111 return STACK_REG;
9112
9113 if (regno == FRAME_POINTER_REGNUM
9114 || regno == ARG_POINTER_REGNUM)
9115 return POINTER_REGS;
9116
9117 if (FP_REGNUM_P (regno))
9118 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9119 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9120
9121 if (PR_REGNUM_P (regno))
9122 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9123
9124 return NO_REGS;
9125 }
9126
9127 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9128 If OFFSET is out of range, return an offset of an anchor point
9129 that is in range. Return 0 otherwise. */
9130
9131 static HOST_WIDE_INT
9132 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9133 machine_mode mode)
9134 {
9135 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9136 if (size > 16)
9137 return (offset + 0x400) & ~0x7f0;
9138
9139 /* For offsets that aren't a multiple of the access size, the limit is
9140 -256...255. */
9141 if (offset & (size - 1))
9142 {
9143 /* BLKmode typically uses LDP of X-registers. */
9144 if (mode == BLKmode)
9145 return (offset + 512) & ~0x3ff;
9146 return (offset + 0x100) & ~0x1ff;
9147 }
9148
9149 /* Small negative offsets are supported. */
9150 if (IN_RANGE (offset, -256, 0))
9151 return 0;
9152
9153 if (mode == TImode || mode == TFmode)
9154 return (offset + 0x100) & ~0x1ff;
9155
9156 /* Use a 12-bit offset, scaled by the access size. */
9157 return offset & (~0xfff * size);
9158 }
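/* For example (purely illustrative), a 4-byte access at offset 0x4004
   is size-aligned and outside [-256, 0], so the final case above gives
   0x4004 & (~0xfff * 4) == 0x4000.  A section anchor at base + 0x4000
   then leaves a residual offset of 4, which fits the scaled unsigned
   12-bit range [0, 0x3ffc] of a 32-bit LDR/STR.  */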
9159
9160 static rtx
9161 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9162 {
9163 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9164 where mask is selected by alignment and size of the offset.
9165 We try to pick as large a range for the offset as possible to
9166 maximize the chance of a CSE. However, for aligned addresses
9167 we limit the range to 4k so that structures with different sized
9168 elements are likely to use the same base. We need to be careful
9169 not to split a CONST for some forms of address expression, otherwise
9170 it will generate sub-optimal code. */
9171
9172 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9173 {
9174 rtx base = XEXP (x, 0);
9175 rtx offset_rtx = XEXP (x, 1);
9176 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9177
9178 if (GET_CODE (base) == PLUS)
9179 {
9180 rtx op0 = XEXP (base, 0);
9181 rtx op1 = XEXP (base, 1);
9182
9183 /* Force any scaling into a temp for CSE. */
9184 op0 = force_reg (Pmode, op0);
9185 op1 = force_reg (Pmode, op1);
9186
9187 /* Let the pointer register be in op0. */
9188 if (REG_POINTER (op1))
9189 std::swap (op0, op1);
9190
9191 /* If the pointer is virtual or frame related, then we know that
9192 virtual register instantiation or register elimination is going
9193 to apply a second constant. We want the two constants folded
9194 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9195 if (virt_or_elim_regno_p (REGNO (op0)))
9196 {
9197 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9198 NULL_RTX, true, OPTAB_DIRECT);
9199 return gen_rtx_PLUS (Pmode, base, op1);
9200 }
9201
9202 /* Otherwise, in order to encourage CSE (and thence loop strength
9203 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9204 base = expand_binop (Pmode, add_optab, op0, op1,
9205 NULL_RTX, true, OPTAB_DIRECT);
9206 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9207 }
9208
9209 HOST_WIDE_INT size;
9210 if (GET_MODE_SIZE (mode).is_constant (&size))
9211 {
9212 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9213 mode);
9214 if (base_offset != 0)
9215 {
9216 base = plus_constant (Pmode, base, base_offset);
9217 base = force_operand (base, NULL_RTX);
9218 return plus_constant (Pmode, base, offset - base_offset);
9219 }
9220 }
9221 }
9222
9223 return x;
9224 }
9225
9226 static reg_class_t
9227 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9228 reg_class_t rclass,
9229 machine_mode mode,
9230 secondary_reload_info *sri)
9231 {
9232 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9233 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9234 comment at the head of aarch64-sve.md for more details about the
9235 big-endian handling. */
9236 if (BYTES_BIG_ENDIAN
9237 && reg_class_subset_p (rclass, FP_REGS)
9238 && !((REG_P (x) && HARD_REGISTER_P (x))
9239 || aarch64_simd_valid_immediate (x, NULL))
9240 && aarch64_sve_data_mode_p (mode))
9241 {
9242 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9243 return NO_REGS;
9244 }
9245
9246 /* If we have to disable direct literal pool loads and stores because the
9247 function is too big, then we need a scratch register. */
9248 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9249 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9250 || targetm.vector_mode_supported_p (GET_MODE (x)))
9251 && !aarch64_pcrelative_literal_loads)
9252 {
9253 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9254 return NO_REGS;
9255 }
9256
9257 /* Without the TARGET_SIMD instructions we cannot move a Q register
9258 to a Q register directly. We need a scratch. */
9259 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9260 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9261 && reg_class_subset_p (rclass, FP_REGS))
9262 {
9263 sri->icode = code_for_aarch64_reload_mov (mode);
9264 return NO_REGS;
9265 }
9266
9267 /* A TFmode or TImode memory access should be handled via FP_REGS,
9268 because AArch64 has richer addressing modes for LDR/STR instructions
9269 than for LDP/STP instructions. */
9270 if (TARGET_FLOAT && rclass == GENERAL_REGS
9271 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9272 return FP_REGS;
9273
9274 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9275 return GENERAL_REGS;
9276
9277 return NO_REGS;
9278 }
9279
9280 static bool
9281 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9282 {
9283 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9284
9285 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9286 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9287 if (frame_pointer_needed)
9288 return to == HARD_FRAME_POINTER_REGNUM;
9289 return true;
9290 }
9291
9292 poly_int64
9293 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9294 {
9295 if (to == HARD_FRAME_POINTER_REGNUM)
9296 {
9297 if (from == ARG_POINTER_REGNUM)
9298 return cfun->machine->frame.hard_fp_offset;
9299
9300 if (from == FRAME_POINTER_REGNUM)
9301 return cfun->machine->frame.hard_fp_offset
9302 - cfun->machine->frame.locals_offset;
9303 }
9304
9305 if (to == STACK_POINTER_REGNUM)
9306 {
9307 if (from == FRAME_POINTER_REGNUM)
9308 return cfun->machine->frame.frame_size
9309 - cfun->machine->frame.locals_offset;
9310 }
9311
9312 return cfun->machine->frame.frame_size;
9313 }
9314
9315 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9316 previous frame. */
9317
9318 rtx
9319 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9320 {
9321 if (count != 0)
9322 return const0_rtx;
9323 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9324 }
9325
9326
9327 static void
9328 aarch64_asm_trampoline_template (FILE *f)
9329 {
9330 int offset1 = 16;
9331 int offset2 = 20;
9332
9333 if (aarch64_bti_enabled ())
9334 {
9335 asm_fprintf (f, "\thint\t34 // bti c\n");
9336 offset1 -= 4;
9337 offset2 -= 4;
9338 }
9339
9340 if (TARGET_ILP32)
9341 {
9342 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9343 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9344 offset1);
9345 }
9346 else
9347 {
9348 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9349 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9350 offset2);
9351 }
9352 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9353
9354 /* The trampoline needs an extra padding instruction. If BTI is
9355 enabled, the padding instruction is replaced by the BTI instruction
9356 at the beginning. */
9357 if (!aarch64_bti_enabled ())
9358 assemble_aligned_integer (4, const0_rtx);
9359
9360 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9361 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9362 }
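/* For LP64 without BTI, the template above lays out roughly as follows
   (a sketch; <ip1> and <chain> stand for IP1_REGNUM and
   STATIC_CHAIN_REGNUM, and the trailing words stay zero until
   aarch64_trampoline_init fills them in):

	ldr	<ip1>, .+16		// -> function address at offset 16
	ldr	<chain>, .+20		// -> static chain at offset 24
	br	<ip1>
	<4 bytes of padding>
	<pointer-sized word: function address>
	<pointer-sized word: static chain value>

   The second ".+" offset differs from the first because that load sits
   4 bytes later; with BTI enabled the padding disappears and a
   "hint 34" (bti c) is emitted at the start instead.  */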
9363
9364 static void
9365 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9366 {
9367 rtx fnaddr, mem, a_tramp;
9368 const int tramp_code_sz = 16;
9369
9370 /* Don't need to copy the trailing D-words; we fill those in below. */
9371 emit_block_move (m_tramp, assemble_trampoline_template (),
9372 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9373 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9374 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9375 if (GET_MODE (fnaddr) != ptr_mode)
9376 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9377 emit_move_insn (mem, fnaddr);
9378
9379 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9380 emit_move_insn (mem, chain_value);
9381
9382 /* XXX We should really define a "clear_cache" pattern and use
9383 gen_clear_cache(). */
9384 a_tramp = XEXP (m_tramp, 0);
9385 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9386 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9387 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9388 ptr_mode);
9389 }
9390
9391 static unsigned char
9392 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9393 {
9394 /* ??? Logically we should only need to provide a value when
9395 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9396 can hold MODE, but at the moment we need to handle all modes.
9397 Just ignore any runtime parts for registers that can't store them. */
9398 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9399 unsigned int nregs;
9400 switch (regclass)
9401 {
9402 case TAILCALL_ADDR_REGS:
9403 case POINTER_REGS:
9404 case GENERAL_REGS:
9405 case ALL_REGS:
9406 case POINTER_AND_FP_REGS:
9407 case FP_REGS:
9408 case FP_LO_REGS:
9409 case FP_LO8_REGS:
9410 if (aarch64_sve_data_mode_p (mode)
9411 && constant_multiple_p (GET_MODE_SIZE (mode),
9412 BYTES_PER_SVE_VECTOR, &nregs))
9413 return nregs;
9414 return (aarch64_vector_data_mode_p (mode)
9415 ? CEIL (lowest_size, UNITS_PER_VREG)
9416 : CEIL (lowest_size, UNITS_PER_WORD));
9417 case STACK_REG:
9418 case PR_REGS:
9419 case PR_LO_REGS:
9420 case PR_HI_REGS:
9421 return 1;
9422
9423 case NO_REGS:
9424 return 0;
9425
9426 default:
9427 break;
9428 }
9429 gcc_unreachable ();
9430 }
9431
9432 static reg_class_t
9433 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9434 {
9435 if (regclass == POINTER_REGS)
9436 return GENERAL_REGS;
9437
9438 if (regclass == STACK_REG)
9439 {
9440 if (REG_P(x)
9441 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9442 return regclass;
9443
9444 return NO_REGS;
9445 }
9446
9447 /* Register elimination can result in a request for
9448 SP+constant->FP_REGS. We cannot support such operations, which
9449 use SP as the source and an FP_REG as the destination, so reject
9450 them outright. */
9451 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9452 {
9453 rtx lhs = XEXP (x, 0);
9454
9455 /* Look through a possible SUBREG introduced by ILP32. */
9456 if (GET_CODE (lhs) == SUBREG)
9457 lhs = SUBREG_REG (lhs);
9458
9459 gcc_assert (REG_P (lhs));
9460 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9461 POINTER_REGS));
9462 return NO_REGS;
9463 }
9464
9465 return regclass;
9466 }
9467
9468 void
9469 aarch64_asm_output_labelref (FILE* f, const char *name)
9470 {
9471 asm_fprintf (f, "%U%s", name);
9472 }
9473
9474 static void
9475 aarch64_elf_asm_constructor (rtx symbol, int priority)
9476 {
9477 if (priority == DEFAULT_INIT_PRIORITY)
9478 default_ctor_section_asm_out_constructor (symbol, priority);
9479 else
9480 {
9481 section *s;
9482 /* The priority is known to be in the range [0, 65535], so 18 bytes
9483 would be enough, but the compiler might not know that. To avoid a
9484 -Wformat-truncation false positive, use a larger size. */
9485 char buf[23];
9486 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9487 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9488 switch_to_section (s);
9489 assemble_align (POINTER_SIZE);
9490 assemble_aligned_integer (POINTER_BYTES, symbol);
9491 }
9492 }
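/* For example, a constructor with priority 101 goes into the section
   ".init_array.00101" ("%.5u" zero-pads the priority to five digits),
   keeping prioritized constructors distinct from the default
   .init_array entries so the linker can order them.  */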
9493
9494 static void
9495 aarch64_elf_asm_destructor (rtx symbol, int priority)
9496 {
9497 if (priority == DEFAULT_INIT_PRIORITY)
9498 default_dtor_section_asm_out_destructor (symbol, priority);
9499 else
9500 {
9501 section *s;
9502 /* The priority is known to be in the range [0, 65535], so 18 bytes
9503 would be enough, but the compiler might not know that. To avoid a
9504 -Wformat-truncation false positive, use a larger size. */
9505 char buf[23];
9506 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9507 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9508 switch_to_section (s);
9509 assemble_align (POINTER_SIZE);
9510 assemble_aligned_integer (POINTER_BYTES, symbol);
9511 }
9512 }
9513
9514 const char*
9515 aarch64_output_casesi (rtx *operands)
9516 {
9517 char buf[100];
9518 char label[100];
9519 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9520 int index;
9521 static const char *const patterns[4][2] =
9522 {
9523 {
9524 "ldrb\t%w3, [%0,%w1,uxtw]",
9525 "add\t%3, %4, %w3, sxtb #2"
9526 },
9527 {
9528 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9529 "add\t%3, %4, %w3, sxth #2"
9530 },
9531 {
9532 "ldr\t%w3, [%0,%w1,uxtw #2]",
9533 "add\t%3, %4, %w3, sxtw #2"
9534 },
9535 /* We assume that DImode is only generated when not optimizing and
9536 that we don't really need 64-bit address offsets. That would
9537 imply an object file with 8GB of code in a single function! */
9538 {
9539 "ldr\t%w3, [%0,%w1,uxtw #2]",
9540 "add\t%3, %4, %w3, sxtw #2"
9541 }
9542 };
9543
9544 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9545
9546 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9547 index = exact_log2 (GET_MODE_SIZE (mode));
9548
9549 gcc_assert (index >= 0 && index <= 3);
9550
9551 /* Need to implement table size reduction, by changing the code below. */
9552 output_asm_insn (patterns[index][0], operands);
9553 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9554 snprintf (buf, sizeof (buf),
9555 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9556 output_asm_insn (buf, operands);
9557 output_asm_insn (patterns[index][1], operands);
9558 output_asm_insn ("br\t%3", operands);
9559 assemble_label (asm_out_file, label);
9560 return "";
9561 }
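/* For 32-bit table entries (index 2) the code emitted above is roughly
   (a sketch; x0/w1/x3/x4 stand for operands %0, %1, %3 and %4):

	ldr	w3, [x0, w1, uxtw #2]	// fetch the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted below
	add	x3, x4, w3, sxtw #2	// entry * 4 + anchor = target
	br	x3
   .Lrtx<N>:  */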
9562
9563
9564 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9565 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9566 operator. */
9567
9568 int
9569 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9570 {
9571 if (shift >= 0 && shift <= 3)
9572 {
9573 int size;
9574 for (size = 8; size <= 32; size *= 2)
9575 {
9576 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9577 if (mask == bits << shift)
9578 return size;
9579 }
9580 }
9581 return 0;
9582 }
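/* For example, aarch64_uxt_size (3, 0x7f8) returns 8, because 0x7f8 is
   0xff << 3 and therefore matches an extended-register operand such as
   "add x0, x1, w2, uxtb #3".  Any mask that is not a shifted 0xff,
   0xffff or 0xffffffff yields 0.  */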
9583
9584 /* Constant pools are per-function only when PC-relative literal
9585 loads are enabled or we are using the large memory
9586 model. */
9587
9588 static inline bool
9589 aarch64_can_use_per_function_literal_pools_p (void)
9590 {
9591 return (aarch64_pcrelative_literal_loads
9592 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9593 }
9594
9595 static bool
9596 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9597 {
9598 /* We can't use blocks for constants when we're using a per-function
9599 constant pool. */
9600 return !aarch64_can_use_per_function_literal_pools_p ();
9601 }
9602
9603 /* Select appropriate section for constants depending
9604 on where we place literal pools. */
9605
9606 static section *
9607 aarch64_select_rtx_section (machine_mode mode,
9608 rtx x,
9609 unsigned HOST_WIDE_INT align)
9610 {
9611 if (aarch64_can_use_per_function_literal_pools_p ())
9612 return function_section (current_function_decl);
9613
9614 return default_elf_select_rtx_section (mode, x, align);
9615 }
9616
9617 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9618 void
9619 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9620 HOST_WIDE_INT offset)
9621 {
9622 /* When using per-function literal pools, we must ensure that any code
9623 section is aligned to the minimal instruction length, lest we get
9624 errors from the assembler re "unaligned instructions". */
9625 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9626 ASM_OUTPUT_ALIGN (f, 2);
9627 }
9628
9629 /* Costs. */
9630
9631 /* Helper function for rtx cost calculation. Strip a shift expression
9632 from X. Returns the inner operand if successful, or the original
9633 expression on failure. */
9634 static rtx
9635 aarch64_strip_shift (rtx x)
9636 {
9637 rtx op = x;
9638
9639 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9640 we can convert both to ROR during final output. */
9641 if ((GET_CODE (op) == ASHIFT
9642 || GET_CODE (op) == ASHIFTRT
9643 || GET_CODE (op) == LSHIFTRT
9644 || GET_CODE (op) == ROTATERT
9645 || GET_CODE (op) == ROTATE)
9646 && CONST_INT_P (XEXP (op, 1)))
9647 return XEXP (op, 0);
9648
9649 if (GET_CODE (op) == MULT
9650 && CONST_INT_P (XEXP (op, 1))
9651 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9652 return XEXP (op, 0);
9653
9654 return x;
9655 }
9656
9657 /* Helper function for rtx cost calculation. Strip an extend
9658 expression from X. Returns the inner operand if successful, or the
9659 original expression on failure. We deal with a number of possible
9660 canonicalization variations here. If STRIP_SHIFT is true, then
9661 we can strip off a shift also. */
9662 static rtx
9663 aarch64_strip_extend (rtx x, bool strip_shift)
9664 {
9665 scalar_int_mode mode;
9666 rtx op = x;
9667
9668 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9669 return op;
9670
9671 /* Zero and sign extraction of a widened value. */
9672 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9673 && XEXP (op, 2) == const0_rtx
9674 && GET_CODE (XEXP (op, 0)) == MULT
9675 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9676 XEXP (op, 1)))
9677 return XEXP (XEXP (op, 0), 0);
9678
9679 /* It can also be represented (for zero-extend) as an AND with an
9680 immediate. */
9681 if (GET_CODE (op) == AND
9682 && GET_CODE (XEXP (op, 0)) == MULT
9683 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9684 && CONST_INT_P (XEXP (op, 1))
9685 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9686 INTVAL (XEXP (op, 1))) != 0)
9687 return XEXP (XEXP (op, 0), 0);
9688
9689 /* Now handle extended register, as this may also have an optional
9690 left shift by 1..4. */
9691 if (strip_shift
9692 && GET_CODE (op) == ASHIFT
9693 && CONST_INT_P (XEXP (op, 1))
9694 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9695 op = XEXP (op, 0);
9696
9697 if (GET_CODE (op) == ZERO_EXTEND
9698 || GET_CODE (op) == SIGN_EXTEND)
9699 op = XEXP (op, 0);
9700
9701 if (op != x)
9702 return op;
9703
9704 return x;
9705 }
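/* For example, (and (mult x (const_int 4)) (const_int 0x3fc)) is a
   canonical form of a zero-extended byte scaled by 4; aarch64_uxt_size
   (2, 0x3fc) is nonzero, so x itself is returned and the caller can
   cost the extend-and-shift as part of the enclosing operation.  */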
9706
9707 /* Return true iff CODE is a shift supported in combination
9708 with arithmetic instructions. */
9709
9710 static bool
9711 aarch64_shift_p (enum rtx_code code)
9712 {
9713 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9714 }
9715
9716
9717 /* Return true iff X is a cheap shift without a sign extend. */
9718
9719 static bool
9720 aarch64_cheap_mult_shift_p (rtx x)
9721 {
9722 rtx op0, op1;
9723
9724 op0 = XEXP (x, 0);
9725 op1 = XEXP (x, 1);
9726
9727 if (!(aarch64_tune_params.extra_tuning_flags
9728 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9729 return false;
9730
9731 if (GET_CODE (op0) == SIGN_EXTEND)
9732 return false;
9733
9734 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9735 && UINTVAL (op1) <= 4)
9736 return true;
9737
9738 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9739 return false;
9740
9741 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9742
9743 if (l2 > 0 && l2 <= 4)
9744 return true;
9745
9746 return false;
9747 }
9748
9749 /* Helper function for rtx cost calculation. Calculate the cost of
9750 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9751 Return the calculated cost of the expression, recursing manually in to
9752 operands where needed. */
9753
9754 static int
9755 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9756 {
9757 rtx op0, op1;
9758 const struct cpu_cost_table *extra_cost
9759 = aarch64_tune_params.insn_extra_cost;
9760 int cost = 0;
9761 bool compound_p = (outer == PLUS || outer == MINUS);
9762 machine_mode mode = GET_MODE (x);
9763
9764 gcc_checking_assert (code == MULT);
9765
9766 op0 = XEXP (x, 0);
9767 op1 = XEXP (x, 1);
9768
9769 if (VECTOR_MODE_P (mode))
9770 mode = GET_MODE_INNER (mode);
9771
9772 /* Integer multiply/fma. */
9773 if (GET_MODE_CLASS (mode) == MODE_INT)
9774 {
9775 /* The multiply will be canonicalized as a shift, cost it as such. */
9776 if (aarch64_shift_p (GET_CODE (x))
9777 || (CONST_INT_P (op1)
9778 && exact_log2 (INTVAL (op1)) > 0))
9779 {
9780 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9781 || GET_CODE (op0) == SIGN_EXTEND;
9782 if (speed)
9783 {
9784 if (compound_p)
9785 {
9786 /* If the shift is considered cheap,
9787 then don't add any cost. */
9788 if (aarch64_cheap_mult_shift_p (x))
9789 ;
9790 else if (REG_P (op1))
9791 /* ARITH + shift-by-register. */
9792 cost += extra_cost->alu.arith_shift_reg;
9793 else if (is_extend)
9794 /* ARITH + extended register. We don't have a cost field
9795 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9796 cost += extra_cost->alu.extend_arith;
9797 else
9798 /* ARITH + shift-by-immediate. */
9799 cost += extra_cost->alu.arith_shift;
9800 }
9801 else
9802 /* LSL (immediate). */
9803 cost += extra_cost->alu.shift;
9804
9805 }
9806 /* Strip extends as we will have costed them in the case above. */
9807 if (is_extend)
9808 op0 = aarch64_strip_extend (op0, true);
9809
9810 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9811
9812 return cost;
9813 }
9814
9815 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9816 compound and let the below cases handle it. After all, MNEG is a
9817 special-case alias of MSUB. */
9818 if (GET_CODE (op0) == NEG)
9819 {
9820 op0 = XEXP (op0, 0);
9821 compound_p = true;
9822 }
9823
9824 /* Integer multiplies or FMAs have zero/sign extending variants. */
9825 if ((GET_CODE (op0) == ZERO_EXTEND
9826 && GET_CODE (op1) == ZERO_EXTEND)
9827 || (GET_CODE (op0) == SIGN_EXTEND
9828 && GET_CODE (op1) == SIGN_EXTEND))
9829 {
9830 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9831 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9832
9833 if (speed)
9834 {
9835 if (compound_p)
9836 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9837 cost += extra_cost->mult[0].extend_add;
9838 else
9839 /* MUL/SMULL/UMULL. */
9840 cost += extra_cost->mult[0].extend;
9841 }
9842
9843 return cost;
9844 }
9845
9846 /* This is either an integer multiply or a MADD. In both cases
9847 we want to recurse and cost the operands. */
9848 cost += rtx_cost (op0, mode, MULT, 0, speed);
9849 cost += rtx_cost (op1, mode, MULT, 1, speed);
9850
9851 if (speed)
9852 {
9853 if (compound_p)
9854 /* MADD/MSUB. */
9855 cost += extra_cost->mult[mode == DImode].add;
9856 else
9857 /* MUL. */
9858 cost += extra_cost->mult[mode == DImode].simple;
9859 }
9860
9861 return cost;
9862 }
9863 else
9864 {
9865 if (speed)
9866 {
9867 /* Floating-point FMA/FMUL can also support negations of the
9868 operands, unless the rounding mode is upward or downward in
9869 which case FNMUL is different than FMUL with operand negation. */
9870 bool neg0 = GET_CODE (op0) == NEG;
9871 bool neg1 = GET_CODE (op1) == NEG;
9872 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9873 {
9874 if (neg0)
9875 op0 = XEXP (op0, 0);
9876 if (neg1)
9877 op1 = XEXP (op1, 0);
9878 }
9879
9880 if (compound_p)
9881 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9882 cost += extra_cost->fp[mode == DFmode].fma;
9883 else
9884 /* FMUL/FNMUL. */
9885 cost += extra_cost->fp[mode == DFmode].mult;
9886 }
9887
9888 cost += rtx_cost (op0, mode, MULT, 0, speed);
9889 cost += rtx_cost (op1, mode, MULT, 1, speed);
9890 return cost;
9891 }
9892 }
9893
9894 static int
9895 aarch64_address_cost (rtx x,
9896 machine_mode mode,
9897 addr_space_t as ATTRIBUTE_UNUSED,
9898 bool speed)
9899 {
9900 enum rtx_code c = GET_CODE (x);
9901 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9902 struct aarch64_address_info info;
9903 int cost = 0;
9904 info.shift = 0;
9905
9906 if (!aarch64_classify_address (&info, x, mode, false))
9907 {
9908 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9909 {
9910 /* This is a CONST or SYMBOL ref which will be split
9911 in a different way depending on the code model in use.
9912 Cost it through the generic infrastructure. */
9913 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9914 /* Divide through by the cost of one instruction to
9915 bring it to the same units as the address costs. */
9916 cost_symbol_ref /= COSTS_N_INSNS (1);
9917 /* The cost is then the cost of preparing the address,
9918 followed by an immediate (possibly 0) offset. */
9919 return cost_symbol_ref + addr_cost->imm_offset;
9920 }
9921 else
9922 {
9923 /* This is most likely a jump table from a case
9924 statement. */
9925 return addr_cost->register_offset;
9926 }
9927 }
9928
9929 switch (info.type)
9930 {
9931 case ADDRESS_LO_SUM:
9932 case ADDRESS_SYMBOLIC:
9933 case ADDRESS_REG_IMM:
9934 cost += addr_cost->imm_offset;
9935 break;
9936
9937 case ADDRESS_REG_WB:
9938 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9939 cost += addr_cost->pre_modify;
9940 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9941 cost += addr_cost->post_modify;
9942 else
9943 gcc_unreachable ();
9944
9945 break;
9946
9947 case ADDRESS_REG_REG:
9948 cost += addr_cost->register_offset;
9949 break;
9950
9951 case ADDRESS_REG_SXTW:
9952 cost += addr_cost->register_sextend;
9953 break;
9954
9955 case ADDRESS_REG_UXTW:
9956 cost += addr_cost->register_zextend;
9957 break;
9958
9959 default:
9960 gcc_unreachable ();
9961 }
9962
9963
9964 if (info.shift > 0)
9965 {
9966 /* For the sake of calculating the cost of the shifted register
9967 component, we can treat same sized modes in the same way. */
9968 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9969 cost += addr_cost->addr_scale_costs.hi;
9970 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9971 cost += addr_cost->addr_scale_costs.si;
9972 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9973 cost += addr_cost->addr_scale_costs.di;
9974 else
9975 /* We can't tell, or this is a 128-bit vector. */
9976 cost += addr_cost->addr_scale_costs.ti;
9977 }
9978
9979 return cost;
9980 }
9981
9982 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9983 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9984 to be taken. */
9985
9986 int
9987 aarch64_branch_cost (bool speed_p, bool predictable_p)
9988 {
9989 /* When optimizing for speed, use the cost of unpredictable branches. */
9990 const struct cpu_branch_cost *branch_costs =
9991 aarch64_tune_params.branch_costs;
9992
9993 if (!speed_p || predictable_p)
9994 return branch_costs->predictable;
9995 else
9996 return branch_costs->unpredictable;
9997 }
9998
9999 /* Return true if the RTX X in mode MODE is a zero or sign extract
10000 usable in an ADD or SUB (extended register) instruction. */
10001 static bool
10002 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10003 {
10004 /* Catch add with a sign extract.
10005 This is add_<optab><mode>_multp2. */
10006 if (GET_CODE (x) == SIGN_EXTRACT
10007 || GET_CODE (x) == ZERO_EXTRACT)
10008 {
10009 rtx op0 = XEXP (x, 0);
10010 rtx op1 = XEXP (x, 1);
10011 rtx op2 = XEXP (x, 2);
10012
10013 if (GET_CODE (op0) == MULT
10014 && CONST_INT_P (op1)
10015 && op2 == const0_rtx
10016 && CONST_INT_P (XEXP (op0, 1))
10017 && aarch64_is_extend_from_extract (mode,
10018 XEXP (op0, 1),
10019 op1))
10020 {
10021 return true;
10022 }
10023 }
10024 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10025 No shift. */
10026 else if (GET_CODE (x) == SIGN_EXTEND
10027 || GET_CODE (x) == ZERO_EXTEND)
10028 return REG_P (XEXP (x, 0));
10029
10030 return false;
10031 }
10032
10033 static bool
10034 aarch64_frint_unspec_p (unsigned int u)
10035 {
10036 switch (u)
10037 {
10038 case UNSPEC_FRINTZ:
10039 case UNSPEC_FRINTP:
10040 case UNSPEC_FRINTM:
10041 case UNSPEC_FRINTA:
10042 case UNSPEC_FRINTN:
10043 case UNSPEC_FRINTX:
10044 case UNSPEC_FRINTI:
10045 return true;
10046
10047 default:
10048 return false;
10049 }
10050 }
10051
10052 /* Return true iff X is an rtx that will match an extr instruction
10053 i.e. as described in the *extr<mode>5_insn family of patterns.
10054 OP0 and OP1 will be set to the operands of the shifts involved
10055 on success and will be NULL_RTX otherwise. */
10056
10057 static bool
10058 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10059 {
10060 rtx op0, op1;
10061 scalar_int_mode mode;
10062 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10063 return false;
10064
10065 *res_op0 = NULL_RTX;
10066 *res_op1 = NULL_RTX;
10067
10068 if (GET_CODE (x) != IOR)
10069 return false;
10070
10071 op0 = XEXP (x, 0);
10072 op1 = XEXP (x, 1);
10073
10074 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10075 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10076 {
10077 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10078 if (GET_CODE (op1) == ASHIFT)
10079 std::swap (op0, op1);
10080
10081 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10082 return false;
10083
10084 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10085 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10086
10087 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10088 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10089 {
10090 *res_op0 = XEXP (op0, 0);
10091 *res_op1 = XEXP (op1, 0);
10092 return true;
10093 }
10094 }
10095
10096 return false;
10097 }
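/* For example, in DImode
     (ior (ashift x (const_int 48)) (lshiftrt y (const_int 16)))
   passes the check above (48 + 16 == 64), with *RES_OP0 = x and
   *RES_OP1 = y; it corresponds to "extr xd, xn, xm, #16" where xn and
   xm hold x and y.  */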
10098
10099 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10100 storing it in *COST. Result is true if the total cost of the operation
10101 has now been calculated. */
10102 static bool
10103 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10104 {
10105 rtx inner;
10106 rtx comparator;
10107 enum rtx_code cmpcode;
10108
10109 if (COMPARISON_P (op0))
10110 {
10111 inner = XEXP (op0, 0);
10112 comparator = XEXP (op0, 1);
10113 cmpcode = GET_CODE (op0);
10114 }
10115 else
10116 {
10117 inner = op0;
10118 comparator = const0_rtx;
10119 cmpcode = NE;
10120 }
10121
10122 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10123 {
10124 /* Conditional branch. */
10125 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10126 return true;
10127 else
10128 {
10129 if (cmpcode == NE || cmpcode == EQ)
10130 {
10131 if (comparator == const0_rtx)
10132 {
10133 /* TBZ/TBNZ/CBZ/CBNZ. */
10134 if (GET_CODE (inner) == ZERO_EXTRACT)
10135 /* TBZ/TBNZ. */
10136 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10137 ZERO_EXTRACT, 0, speed);
10138 else
10139 /* CBZ/CBNZ. */
10140 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10141
10142 return true;
10143 }
10144 }
10145 else if (cmpcode == LT || cmpcode == GE)
10146 {
10147 /* TBZ/TBNZ. */
10148 if (comparator == const0_rtx)
10149 return true;
10150 }
10151 }
10152 }
10153 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10154 {
10155 /* CCMP. */
10156 if (GET_CODE (op1) == COMPARE)
10157 {
10158 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10159 if (XEXP (op1, 1) == const0_rtx)
10160 *cost += 1;
10161 if (speed)
10162 {
10163 machine_mode mode = GET_MODE (XEXP (op1, 0));
10164 const struct cpu_cost_table *extra_cost
10165 = aarch64_tune_params.insn_extra_cost;
10166
10167 if (GET_MODE_CLASS (mode) == MODE_INT)
10168 *cost += extra_cost->alu.arith;
10169 else
10170 *cost += extra_cost->fp[mode == DFmode].compare;
10171 }
10172 return true;
10173 }
10174
10175 /* It's a conditional operation based on the status flags,
10176 so it must be some flavor of CSEL. */
10177
10178 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10179 if (GET_CODE (op1) == NEG
10180 || GET_CODE (op1) == NOT
10181 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10182 op1 = XEXP (op1, 0);
10183 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10184 {
10185 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10186 op1 = XEXP (op1, 0);
10187 op2 = XEXP (op2, 0);
10188 }
10189
10190 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10191 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10192 return true;
10193 }
10194
10195 /* We don't know what this is, cost all operands. */
10196 return false;
10197 }
10198
10199 /* Check whether X is a bitfield operation of the form shift + extend that
10200 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10201 operand to which the bitfield operation is applied. Otherwise return
10202 NULL_RTX. */
10203
10204 static rtx
10205 aarch64_extend_bitfield_pattern_p (rtx x)
10206 {
10207 rtx_code outer_code = GET_CODE (x);
10208 machine_mode outer_mode = GET_MODE (x);
10209
10210 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10211 && outer_mode != SImode && outer_mode != DImode)
10212 return NULL_RTX;
10213
10214 rtx inner = XEXP (x, 0);
10215 rtx_code inner_code = GET_CODE (inner);
10216 machine_mode inner_mode = GET_MODE (inner);
10217 rtx op = NULL_RTX;
10218
10219 switch (inner_code)
10220 {
10221 case ASHIFT:
10222 if (CONST_INT_P (XEXP (inner, 1))
10223 && (inner_mode == QImode || inner_mode == HImode))
10224 op = XEXP (inner, 0);
10225 break;
10226 case LSHIFTRT:
10227 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10228 && (inner_mode == QImode || inner_mode == HImode))
10229 op = XEXP (inner, 0);
10230 break;
10231 case ASHIFTRT:
10232 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10233 && (inner_mode == QImode || inner_mode == HImode))
10234 op = XEXP (inner, 0);
10235 break;
10236 default:
10237 break;
10238 }
10239
10240 return op;
10241 }
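/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   returns the inner register: the shift plus extension extracts bits
   3..15 of x, which maps to a single "ubfx wd, wn, #3, #13" rather
   than a separate shift and extend.  */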
10242
10243 /* Return true if the mask and a shift amount from an RTX of the form
10244 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10245 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10246
10247 bool
10248 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10249 rtx shft_amnt)
10250 {
10251 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10252 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10253 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10254 && (INTVAL (mask)
10255 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10256 }
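/* For example, with MODE == SImode, MASK == 0x3f0 and SHFT_AMNT == 4:
   the shift is below the mode size, (0x3f0 >> 4) + 1 == 0x40 is a
   power of two, and no mask bits lie below the shift, so this returns
   true and (x << 4) & 0x3f0 can be emitted as "ubfiz wd, wn, #4, #6".  */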
10257
10258 /* Return true if the masks and a shift amount from an RTX of the form
10259 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10260 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10261
10262 bool
10263 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10264 unsigned HOST_WIDE_INT mask1,
10265 unsigned HOST_WIDE_INT shft_amnt,
10266 unsigned HOST_WIDE_INT mask2)
10267 {
10268 unsigned HOST_WIDE_INT t;
10269
10270 /* Verify that there is no overlap in what bits are set in the two masks. */
10271 if (mask1 != ~mask2)
10272 return false;
10273
10274 /* Verify that mask2 is not all zeros or ones. */
10275 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10276 return false;
10277
10278 /* The shift amount should always be less than the mode size. */
10279 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10280
10281 /* Verify that the mask being shifted is contiguous and would be in the
10282 least significant bits after shifting by shft_amnt. */
10283 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10284 return (t == (t & -t));
10285 }
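/* For example, MASK2 == 0xff0, SHFT_AMNT == 4 and MASK1 == ~0xff0 pass
   all of the checks above (0xff0 + 0x10 == 0x1000 is a power of two),
   so ((x & ~0xff0) | ((y << 4) & 0xff0)) can be emitted as
   "bfi xd, xn, #4, #8", inserting the low 8 bits of y at bit 4.  */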
10286
10287 /* Calculate the cost of calculating X, storing it in *COST. Result
10288 is true if the total cost of the operation has now been calculated. */
10289 static bool
10290 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10291 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10292 {
10293 rtx op0, op1, op2;
10294 const struct cpu_cost_table *extra_cost
10295 = aarch64_tune_params.insn_extra_cost;
10296 int code = GET_CODE (x);
10297 scalar_int_mode int_mode;
10298
10299 /* By default, assume that everything has equivalent cost to the
10300 cheapest instruction. Any additional costs are applied as a delta
10301 above this default. */
10302 *cost = COSTS_N_INSNS (1);
10303
10304 switch (code)
10305 {
10306 case SET:
10307 /* The cost depends entirely on the operands to SET. */
10308 *cost = 0;
10309 op0 = SET_DEST (x);
10310 op1 = SET_SRC (x);
10311
10312 switch (GET_CODE (op0))
10313 {
10314 case MEM:
10315 if (speed)
10316 {
10317 rtx address = XEXP (op0, 0);
10318 if (VECTOR_MODE_P (mode))
10319 *cost += extra_cost->ldst.storev;
10320 else if (GET_MODE_CLASS (mode) == MODE_INT)
10321 *cost += extra_cost->ldst.store;
10322 else if (mode == SFmode)
10323 *cost += extra_cost->ldst.storef;
10324 else if (mode == DFmode)
10325 *cost += extra_cost->ldst.stored;
10326
10327 *cost +=
10328 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10329 0, speed));
10330 }
10331
10332 *cost += rtx_cost (op1, mode, SET, 1, speed);
10333 return true;
10334
10335 case SUBREG:
10336 if (! REG_P (SUBREG_REG (op0)))
10337 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10338
10339 /* Fall through. */
10340 case REG:
10341 /* The cost is one per vector-register copied. */
10342 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10343 {
10344 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10345 *cost = COSTS_N_INSNS (nregs);
10346 }
10347 /* const0_rtx is in general free, but we will use an
10348 instruction to set a register to 0. */
10349 else if (REG_P (op1) || op1 == const0_rtx)
10350 {
10351 /* The cost is 1 per register copied. */
10352 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10353 *cost = COSTS_N_INSNS (nregs);
10354 }
10355 else
10356 /* Cost is just the cost of the RHS of the set. */
10357 *cost += rtx_cost (op1, mode, SET, 1, speed);
10358 return true;
10359
10360 case ZERO_EXTRACT:
10361 case SIGN_EXTRACT:
10362 /* Bit-field insertion. Strip any redundant widening of
10363 the RHS to meet the width of the target. */
10364 if (GET_CODE (op1) == SUBREG)
10365 op1 = SUBREG_REG (op1);
10366 if ((GET_CODE (op1) == ZERO_EXTEND
10367 || GET_CODE (op1) == SIGN_EXTEND)
10368 && CONST_INT_P (XEXP (op0, 1))
10369 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10370 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10371 op1 = XEXP (op1, 0);
10372
10373 if (CONST_INT_P (op1))
10374 {
10375 /* MOV immediate is assumed to always be cheap. */
10376 *cost = COSTS_N_INSNS (1);
10377 }
10378 else
10379 {
10380 /* BFM. */
10381 if (speed)
10382 *cost += extra_cost->alu.bfi;
10383 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10384 }
10385
10386 return true;
10387
10388 default:
10389 /* We can't make sense of this, assume default cost. */
10390 *cost = COSTS_N_INSNS (1);
10391 return false;
10392 }
10393 return false;
10394
10395 case CONST_INT:
10396 /* If an instruction can incorporate a constant within the
10397 instruction, the instruction's expression avoids calling
10398 rtx_cost() on the constant. If rtx_cost() is called on a
10399 constant, then it is usually because the constant must be
10400 moved into a register by one or more instructions.
10401
10402 The exception is constant 0, which can be expressed
10403 as XZR/WZR and is therefore free. The exception to this is
10404 if we have (set (reg) (const0_rtx)) in which case we must cost
10405 the move. However, we can catch that when we cost the SET, so
10406 we don't need to consider that here. */
10407 if (x == const0_rtx)
10408 *cost = 0;
10409 else
10410 {
10411 /* To an approximation, building any other constant is
10412 proportionally expensive to the number of instructions
10413 required to build that constant. This is true whether we
10414 are compiling for SPEED or otherwise. */
10415 if (!is_a <scalar_int_mode> (mode, &int_mode))
10416 int_mode = word_mode;
10417 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10418 (NULL_RTX, x, false, int_mode));
10419 }
10420 return true;
10421
10422 case CONST_DOUBLE:
10423
10424 /* First determine number of instructions to do the move
10425 as an integer constant. */
10426 if (!aarch64_float_const_representable_p (x)
10427 && !aarch64_can_const_movi_rtx_p (x, mode)
10428 && aarch64_float_const_rtx_p (x))
10429 {
10430 unsigned HOST_WIDE_INT ival;
10431 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10432 gcc_assert (succeed);
10433
10434 scalar_int_mode imode = (mode == HFmode
10435 ? SImode
10436 : int_mode_for_mode (mode).require ());
10437 int ncost = aarch64_internal_mov_immediate
10438 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10439 *cost += COSTS_N_INSNS (ncost);
10440 return true;
10441 }
10442
10443 if (speed)
10444 {
10445 /* mov[df,sf]_aarch64. */
10446 if (aarch64_float_const_representable_p (x))
10447 /* FMOV (scalar immediate). */
10448 *cost += extra_cost->fp[mode == DFmode].fpconst;
10449 else if (!aarch64_float_const_zero_rtx_p (x))
10450 {
10451 /* This will be a load from memory. */
10452 if (mode == DFmode)
10453 *cost += extra_cost->ldst.loadd;
10454 else
10455 *cost += extra_cost->ldst.loadf;
10456 }
10457 else
10458 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10459 or MOV v0.s[0], wzr - neither of which are modeled by the
10460 cost tables. Just use the default cost. */
10461 {
10462 }
10463 }
10464
10465 return true;
10466
10467 case MEM:
10468 if (speed)
10469 {
10470 /* For loads we want the base cost of a load, plus an
10471 approximation for the additional cost of the addressing
10472 mode. */
10473 rtx address = XEXP (x, 0);
10474 if (VECTOR_MODE_P (mode))
10475 *cost += extra_cost->ldst.loadv;
10476 else if (GET_MODE_CLASS (mode) == MODE_INT)
10477 *cost += extra_cost->ldst.load;
10478 else if (mode == SFmode)
10479 *cost += extra_cost->ldst.loadf;
10480 else if (mode == DFmode)
10481 *cost += extra_cost->ldst.loadd;
10482
10483 *cost +=
10484 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10485 0, speed));
10486 }
10487
10488 return true;
10489
10490 case NEG:
10491 op0 = XEXP (x, 0);
10492
10493 if (VECTOR_MODE_P (mode))
10494 {
10495 if (speed)
10496 {
10497 /* FNEG. */
10498 *cost += extra_cost->vect.alu;
10499 }
10500 return false;
10501 }
10502
10503 if (GET_MODE_CLASS (mode) == MODE_INT)
10504 {
10505 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10506 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10507 {
10508 /* CSETM. */
10509 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10510 return true;
10511 }
10512
10513 /* Cost this as SUB wzr, X. */
10514 op0 = CONST0_RTX (mode);
10515 op1 = XEXP (x, 0);
10516 goto cost_minus;
10517 }
10518
10519 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10520 {
10521 /* Support (neg(fma...)) as a single instruction only if
10522 sign of zeros is unimportant. This matches the decision
10523 making in aarch64.md. */
10524 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10525 {
10526 /* FNMADD. */
10527 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10528 return true;
10529 }
10530 if (GET_CODE (op0) == MULT)
10531 {
10532 /* FNMUL. */
10533 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10534 return true;
10535 }
10536 if (speed)
10537 /* FNEG. */
10538 *cost += extra_cost->fp[mode == DFmode].neg;
10539 return false;
10540 }
10541
10542 return false;
10543
10544 case CLRSB:
10545 case CLZ:
10546 if (speed)
10547 {
10548 if (VECTOR_MODE_P (mode))
10549 *cost += extra_cost->vect.alu;
10550 else
10551 *cost += extra_cost->alu.clz;
10552 }
10553
10554 return false;
10555
10556 case COMPARE:
10557 op0 = XEXP (x, 0);
10558 op1 = XEXP (x, 1);
10559
10560 if (op1 == const0_rtx
10561 && GET_CODE (op0) == AND)
10562 {
10563 x = op0;
10564 mode = GET_MODE (op0);
10565 goto cost_logic;
10566 }
10567
10568 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10569 {
10570 /* TODO: A write to the CC flags possibly costs extra; this
10571 needs encoding in the cost tables. */
10572
10573 mode = GET_MODE (op0);
10574 /* ANDS. */
10575 if (GET_CODE (op0) == AND)
10576 {
10577 x = op0;
10578 goto cost_logic;
10579 }
10580
10581 if (GET_CODE (op0) == PLUS)
10582 {
10583 /* ADDS (and CMN alias). */
10584 x = op0;
10585 goto cost_plus;
10586 }
10587
10588 if (GET_CODE (op0) == MINUS)
10589 {
10590 /* SUBS. */
10591 x = op0;
10592 goto cost_minus;
10593 }
10594
10595 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10596 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10597 && CONST_INT_P (XEXP (op0, 2)))
10598 {
10599 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10600 Handle it here directly rather than going to cost_logic
10601 since we know the immediate generated for the TST is valid
10602 so we can avoid creating an intermediate rtx for it only
10603 for costing purposes. */
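/* Illustrative shape of the RTL handled here (WIDTH and POS stand for
   the extracted field's size and position):

     (compare (zero_extract (reg) (const_int WIDTH) (const_int POS))
              (const_int 0))

   which corresponds to a single TST reg, #imm with a contiguous mask. */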
10604 if (speed)
10605 *cost += extra_cost->alu.logical;
10606
10607 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10608 ZERO_EXTRACT, 0, speed);
10609 return true;
10610 }
10611
10612 if (GET_CODE (op1) == NEG)
10613 {
10614 /* CMN. */
10615 if (speed)
10616 *cost += extra_cost->alu.arith;
10617
10618 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10619 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10620 return true;
10621 }
10622
10623 /* CMP.
10624
10625 Compare can freely swap the order of operands, and
10626 canonicalization puts the more complex operation first.
10627 But the integer MINUS logic expects the shift/extend
10628 operation in op1. */
10629 if (! (REG_P (op0)
10630 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10631 {
10632 op0 = XEXP (x, 1);
10633 op1 = XEXP (x, 0);
10634 }
10635 goto cost_minus;
10636 }
10637
10638 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10639 {
10640 /* FCMP. */
10641 if (speed)
10642 *cost += extra_cost->fp[mode == DFmode].compare;
10643
10644 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10645 {
10646 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10647 /* FCMP supports constant 0.0 for no extra cost. */
10648 return true;
10649 }
10650 return false;
10651 }
10652
10653 if (VECTOR_MODE_P (mode))
10654 {
10655 /* Vector compare. */
10656 if (speed)
10657 *cost += extra_cost->vect.alu;
10658
10659 if (aarch64_float_const_zero_rtx_p (op1))
10660 {
10661 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10662 cost. */
10663 return true;
10664 }
10665 return false;
10666 }
10667 return false;
10668
10669 case MINUS:
10670 {
10671 op0 = XEXP (x, 0);
10672 op1 = XEXP (x, 1);
10673
10674 cost_minus:
10675 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10676
10677 /* Detect valid immediates. */
10678 if ((GET_MODE_CLASS (mode) == MODE_INT
10679 || (GET_MODE_CLASS (mode) == MODE_CC
10680 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10681 && CONST_INT_P (op1)
10682 && aarch64_uimm12_shift (INTVAL (op1)))
10683 {
10684 if (speed)
10685 /* SUB(S) (immediate). */
10686 *cost += extra_cost->alu.arith;
10687 return true;
10688 }
10689
10690 /* Look for SUB (extended register). */
10691 if (is_a <scalar_int_mode> (mode, &int_mode)
10692 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10693 {
10694 if (speed)
10695 *cost += extra_cost->alu.extend_arith;
10696
10697 op1 = aarch64_strip_extend (op1, true);
10698 *cost += rtx_cost (op1, VOIDmode,
10699 (enum rtx_code) GET_CODE (op1), 0, speed);
10700 return true;
10701 }
10702
10703 rtx new_op1 = aarch64_strip_extend (op1, false);
10704
10705 /* Cost this as an FMA-alike operation. */
10706 if ((GET_CODE (new_op1) == MULT
10707 || aarch64_shift_p (GET_CODE (new_op1)))
10708 && code != COMPARE)
10709 {
10710 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10711 (enum rtx_code) code,
10712 speed);
10713 return true;
10714 }
10715
10716 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10717
10718 if (speed)
10719 {
10720 if (VECTOR_MODE_P (mode))
10721 {
10722 /* Vector SUB. */
10723 *cost += extra_cost->vect.alu;
10724 }
10725 else if (GET_MODE_CLASS (mode) == MODE_INT)
10726 {
10727 /* SUB(S). */
10728 *cost += extra_cost->alu.arith;
10729 }
10730 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10731 {
10732 /* FSUB. */
10733 *cost += extra_cost->fp[mode == DFmode].addsub;
10734 }
10735 }
10736 return true;
10737 }
10738
10739 case PLUS:
10740 {
10741 rtx new_op0;
10742
10743 op0 = XEXP (x, 0);
10744 op1 = XEXP (x, 1);
10745
10746 cost_plus:
10747 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10748 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10749 {
10750 /* CSINC. */
10751 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10752 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10753 return true;
10754 }
10755
10756 if (GET_MODE_CLASS (mode) == MODE_INT
10757 && (aarch64_plus_immediate (op1, mode)
10758 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10759 {
10760 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10761
10762 if (speed)
10763 /* ADD (immediate). */
10764 *cost += extra_cost->alu.arith;
10765 return true;
10766 }
10767
10768 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10769
10770 /* Look for ADD (extended register). */
10771 if (is_a <scalar_int_mode> (mode, &int_mode)
10772 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10773 {
10774 if (speed)
10775 *cost += extra_cost->alu.extend_arith;
10776
10777 op0 = aarch64_strip_extend (op0, true);
10778 *cost += rtx_cost (op0, VOIDmode,
10779 (enum rtx_code) GET_CODE (op0), 0, speed);
10780 return true;
10781 }
10782
10783 /* Strip any extend, leave shifts behind as we will
10784 cost them through mult_cost. */
10785 new_op0 = aarch64_strip_extend (op0, false);
10786
10787 if (GET_CODE (new_op0) == MULT
10788 || aarch64_shift_p (GET_CODE (new_op0)))
10789 {
10790 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10791 speed);
10792 return true;
10793 }
10794
10795 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10796
10797 if (speed)
10798 {
10799 if (VECTOR_MODE_P (mode))
10800 {
10801 /* Vector ADD. */
10802 *cost += extra_cost->vect.alu;
10803 }
10804 else if (GET_MODE_CLASS (mode) == MODE_INT)
10805 {
10806 /* ADD. */
10807 *cost += extra_cost->alu.arith;
10808 }
10809 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10810 {
10811 /* FADD. */
10812 *cost += extra_cost->fp[mode == DFmode].addsub;
10813 }
10814 }
10815 return true;
10816 }
10817
10818 case BSWAP:
10819 *cost = COSTS_N_INSNS (1);
10820
10821 if (speed)
10822 {
10823 if (VECTOR_MODE_P (mode))
10824 *cost += extra_cost->vect.alu;
10825 else
10826 *cost += extra_cost->alu.rev;
10827 }
10828 return false;
10829
10830 case IOR:
10831 if (aarch_rev16_p (x))
10832 {
10833 *cost = COSTS_N_INSNS (1);
10834
10835 if (speed)
10836 {
10837 if (VECTOR_MODE_P (mode))
10838 *cost += extra_cost->vect.alu;
10839 else
10840 *cost += extra_cost->alu.rev;
10841 }
10842 return true;
10843 }
10844
10845 if (aarch64_extr_rtx_p (x, &op0, &op1))
10846 {
10847 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10848 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10849 if (speed)
10850 *cost += extra_cost->alu.shift;
10851
10852 return true;
10853 }
10854 /* Fall through. */
10855 case XOR:
10856 case AND:
10857 cost_logic:
10858 op0 = XEXP (x, 0);
10859 op1 = XEXP (x, 1);
10860
10861 if (VECTOR_MODE_P (mode))
10862 {
10863 if (speed)
10864 *cost += extra_cost->vect.alu;
10865 return true;
10866 }
10867
10868 if (code == AND
10869 && GET_CODE (op0) == MULT
10870 && CONST_INT_P (XEXP (op0, 1))
10871 && CONST_INT_P (op1)
10872 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10873 INTVAL (op1)) != 0)
10874 {
10875 /* This is a UBFM/SBFM. */
10876 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10877 if (speed)
10878 *cost += extra_cost->alu.bfx;
10879 return true;
10880 }
10881
10882 if (is_int_mode (mode, &int_mode))
10883 {
10884 if (CONST_INT_P (op1))
10885 {
10886 /* We have a mask + shift version of a UBFIZ
10887 i.e. the *andim_ashift<mode>_bfiz pattern. */
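/* For example (illustrative only), something like
     (and:SI (ashift:SI (reg) (const_int 3)) (const_int 0x7f8))
   keeps 8 bits starting at bit 3 and can be a single
   UBFIZ w0, w1, #3, #8. */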
10888 if (GET_CODE (op0) == ASHIFT
10889 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10890 XEXP (op0, 1)))
10891 {
10892 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10893 (enum rtx_code) code, 0, speed);
10894 if (speed)
10895 *cost += extra_cost->alu.bfx;
10896
10897 return true;
10898 }
10899 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10900 {
10901 /* We possibly get the immediate for free; this is not
10902 modelled. */
10903 *cost += rtx_cost (op0, int_mode,
10904 (enum rtx_code) code, 0, speed);
10905 if (speed)
10906 *cost += extra_cost->alu.logical;
10907
10908 return true;
10909 }
10910 }
10911 else
10912 {
10913 rtx new_op0 = op0;
10914
10915 /* Handle ORN, EON, or BIC. */
10916 if (GET_CODE (op0) == NOT)
10917 op0 = XEXP (op0, 0);
10918
10919 new_op0 = aarch64_strip_shift (op0);
10920
10921 /* If we had a shift on op0 then this is a logical-shift-
10922 by-register/immediate operation. Otherwise, this is just
10923 a logical operation. */
10924 if (speed)
10925 {
10926 if (new_op0 != op0)
10927 {
10928 /* Shift by immediate. */
10929 if (CONST_INT_P (XEXP (op0, 1)))
10930 *cost += extra_cost->alu.log_shift;
10931 else
10932 *cost += extra_cost->alu.log_shift_reg;
10933 }
10934 else
10935 *cost += extra_cost->alu.logical;
10936 }
10937
10938 /* In both cases we want to cost both operands. */
10939 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10940 0, speed);
10941 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10942 1, speed);
10943
10944 return true;
10945 }
10946 }
10947 return false;
10948
10949 case NOT:
10950 x = XEXP (x, 0);
10951 op0 = aarch64_strip_shift (x);
10952
10953 if (VECTOR_MODE_P (mode))
10954 {
10955 /* Vector NOT. */
10956 *cost += extra_cost->vect.alu;
10957 return false;
10958 }
10959
10960 /* MVN-shifted-reg. */
10961 if (op0 != x)
10962 {
10963 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10964
10965 if (speed)
10966 *cost += extra_cost->alu.log_shift;
10967
10968 return true;
10969 }
10970 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10971 Handle the second form here taking care that 'a' in the above can
10972 be a shift. */
10973 else if (GET_CODE (op0) == XOR)
10974 {
10975 rtx newop0 = XEXP (op0, 0);
10976 rtx newop1 = XEXP (op0, 1);
10977 rtx op0_stripped = aarch64_strip_shift (newop0);
10978
10979 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10980 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10981
10982 if (speed)
10983 {
10984 if (op0_stripped != newop0)
10985 *cost += extra_cost->alu.log_shift;
10986 else
10987 *cost += extra_cost->alu.logical;
10988 }
10989
10990 return true;
10991 }
10992 /* MVN. */
10993 if (speed)
10994 *cost += extra_cost->alu.logical;
10995
10996 return false;
10997
10998 case ZERO_EXTEND:
10999
11000 op0 = XEXP (x, 0);
11001 /* If a value is written in SI mode, then zero extended to DI
11002 mode, the operation will in general be free as a write to
11003 a 'w' register implicitly zeroes the upper bits of an 'x'
11004 register. However, if this is
11005
11006 (set (reg) (zero_extend (reg)))
11007
11008 we must cost the explicit register move. */
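/* For example, the SImode result of "add w0, w1, w2" is already
   zero-extended into x0 by the hardware, whereas a bare
   (set (reg:DI) (zero_extend:DI (reg:SI))) still needs a register
   move such as "mov w0, w1" (or similar) and must be costed. */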
11009 if (mode == DImode
11010 && GET_MODE (op0) == SImode
11011 && outer == SET)
11012 {
11013 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11014
11015 /* If OP_COST is non-zero, then the cost of the zero extend
11016 is effectively the cost of the inner operation. Otherwise
11017 we have a MOV instruction and we take the cost from the MOV
11018 itself. This is true independently of whether we are
11019 optimizing for space or time. */
11020 if (op_cost)
11021 *cost = op_cost;
11022
11023 return true;
11024 }
11025 else if (MEM_P (op0))
11026 {
11027 /* All loads can zero extend to any size for free. */
11028 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11029 return true;
11030 }
11031
11032 op0 = aarch64_extend_bitfield_pattern_p (x);
11033 if (op0)
11034 {
11035 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11036 if (speed)
11037 *cost += extra_cost->alu.bfx;
11038 return true;
11039 }
11040
11041 if (speed)
11042 {
11043 if (VECTOR_MODE_P (mode))
11044 {
11045 /* UMOV. */
11046 *cost += extra_cost->vect.alu;
11047 }
11048 else
11049 {
11050 /* We generate an AND instead of UXTB/UXTH. */
11051 *cost += extra_cost->alu.logical;
11052 }
11053 }
11054 return false;
11055
11056 case SIGN_EXTEND:
11057 if (MEM_P (XEXP (x, 0)))
11058 {
11059 /* LDRSH. */
11060 if (speed)
11061 {
11062 rtx address = XEXP (XEXP (x, 0), 0);
11063 *cost += extra_cost->ldst.load_sign_extend;
11064
11065 *cost +=
11066 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11067 0, speed));
11068 }
11069 return true;
11070 }
11071
11072 op0 = aarch64_extend_bitfield_pattern_p (x);
11073 if (op0)
11074 {
11075 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11076 if (speed)
11077 *cost += extra_cost->alu.bfx;
11078 return true;
11079 }
11080
11081 if (speed)
11082 {
11083 if (VECTOR_MODE_P (mode))
11084 *cost += extra_cost->vect.alu;
11085 else
11086 *cost += extra_cost->alu.extend;
11087 }
11088 return false;
11089
11090 case ASHIFT:
11091 op0 = XEXP (x, 0);
11092 op1 = XEXP (x, 1);
11093
11094 if (CONST_INT_P (op1))
11095 {
11096 if (speed)
11097 {
11098 if (VECTOR_MODE_P (mode))
11099 {
11100 /* Vector shift (immediate). */
11101 *cost += extra_cost->vect.alu;
11102 }
11103 else
11104 {
11105 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11106 aliases. */
11107 *cost += extra_cost->alu.shift;
11108 }
11109 }
11110
11111 /* We can incorporate zero/sign extend for free. */
11112 if (GET_CODE (op0) == ZERO_EXTEND
11113 || GET_CODE (op0) == SIGN_EXTEND)
11114 op0 = XEXP (op0, 0);
11115
11116 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11117 return true;
11118 }
11119 else
11120 {
11121 if (VECTOR_MODE_P (mode))
11122 {
11123 if (speed)
11124 /* Vector shift (register). */
11125 *cost += extra_cost->vect.alu;
11126 }
11127 else
11128 {
11129 if (speed)
11130 /* LSLV. */
11131 *cost += extra_cost->alu.shift_reg;
11132
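/* Illustration: the variable-shift instructions (LSLV and friends)
   use the shift amount modulo the register width, so a shift amount
   of the form (and X (bitsize - 1)) needs no separate AND and is
   handled for free below. */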
11133 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11134 && CONST_INT_P (XEXP (op1, 1))
11135 && known_eq (INTVAL (XEXP (op1, 1)),
11136 GET_MODE_BITSIZE (mode) - 1))
11137 {
11138 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11139 /* We already demanded XEXP (op1, 0) to be REG_P, so
11140 don't recurse into it. */
11141 return true;
11142 }
11143 }
11144 return false; /* All arguments need to be in registers. */
11145 }
11146
11147 case ROTATE:
11148 case ROTATERT:
11149 case LSHIFTRT:
11150 case ASHIFTRT:
11151 op0 = XEXP (x, 0);
11152 op1 = XEXP (x, 1);
11153
11154 if (CONST_INT_P (op1))
11155 {
11156 /* ASR (immediate) and friends. */
11157 if (speed)
11158 {
11159 if (VECTOR_MODE_P (mode))
11160 *cost += extra_cost->vect.alu;
11161 else
11162 *cost += extra_cost->alu.shift;
11163 }
11164
11165 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11166 return true;
11167 }
11168 else
11169 {
11170 if (VECTOR_MODE_P (mode))
11171 {
11172 if (speed)
11173 /* Vector shift (register). */
11174 *cost += extra_cost->vect.alu;
11175 }
11176 else
11177 {
11178 if (speed)
11179 /* ASR (register) and friends. */
11180 *cost += extra_cost->alu.shift_reg;
11181
11182 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11183 && CONST_INT_P (XEXP (op1, 1))
11184 && known_eq (INTVAL (XEXP (op1, 1)),
11185 GET_MODE_BITSIZE (mode) - 1))
11186 {
11187 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11188 /* We already demanded XEXP (op1, 0) to be REG_P, so
11189 don't recurse into it. */
11190 return true;
11191 }
11192 }
11193 return false; /* All arguments need to be in registers. */
11194 }
11195
11196 case SYMBOL_REF:
11197
11198 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11199 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11200 {
11201 /* LDR. */
11202 if (speed)
11203 *cost += extra_cost->ldst.load;
11204 }
11205 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11206 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11207 {
11208 /* ADRP, followed by ADD. */
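/* e.g.  adrp x0, sym
         add  x0, x0, :lo12:sym  */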
11209 *cost += COSTS_N_INSNS (1);
11210 if (speed)
11211 *cost += 2 * extra_cost->alu.arith;
11212 }
11213 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11214 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11215 {
11216 /* ADR. */
11217 if (speed)
11218 *cost += extra_cost->alu.arith;
11219 }
11220
11221 if (flag_pic)
11222 {
11223 /* One extra load instruction, after accessing the GOT. */
11224 *cost += COSTS_N_INSNS (1);
11225 if (speed)
11226 *cost += extra_cost->ldst.load;
11227 }
11228 return true;
11229
11230 case HIGH:
11231 case LO_SUM:
11232 /* ADRP/ADD (immediate). */
11233 if (speed)
11234 *cost += extra_cost->alu.arith;
11235 return true;
11236
11237 case ZERO_EXTRACT:
11238 case SIGN_EXTRACT:
11239 /* UBFX/SBFX. */
11240 if (speed)
11241 {
11242 if (VECTOR_MODE_P (mode))
11243 *cost += extra_cost->vect.alu;
11244 else
11245 *cost += extra_cost->alu.bfx;
11246 }
11247
11248 /* We can trust that the immediates used will be correct (there
11249 are no by-register forms), so we need only cost op0. */
11250 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11251 return true;
11252
11253 case MULT:
11254 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11255 /* aarch64_rtx_mult_cost always handles recursion to its
11256 operands. */
11257 return true;
11258
11259 case MOD:
11260 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11261 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11262 an unconditional negate. This case should only ever be reached through
11263 the set_smod_pow2_cheap check in expmed.c. */
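/* For example, x % 4 in SImode is expected to expand to roughly:

     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi

   (the exact sequence is decided by the expander); hence the baseline
   of four instructions below. */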
11264 if (CONST_INT_P (XEXP (x, 1))
11265 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11266 && (mode == SImode || mode == DImode))
11267 {
11268 /* We expand to 4 instructions. Reset the baseline. */
11269 *cost = COSTS_N_INSNS (4);
11270
11271 if (speed)
11272 *cost += 2 * extra_cost->alu.logical
11273 + 2 * extra_cost->alu.arith;
11274
11275 return true;
11276 }
11277
11278 /* Fall-through. */
11279 case UMOD:
11280 if (speed)
11281 {
11282 /* Slightly prefer UMOD over SMOD. */
11283 if (VECTOR_MODE_P (mode))
11284 *cost += extra_cost->vect.alu;
11285 else if (GET_MODE_CLASS (mode) == MODE_INT)
11286 *cost += (extra_cost->mult[mode == DImode].add
11287 + extra_cost->mult[mode == DImode].idiv
11288 + (code == MOD ? 1 : 0));
11289 }
11290 return false; /* All arguments need to be in registers. */
11291
11292 case DIV:
11293 case UDIV:
11294 case SQRT:
11295 if (speed)
11296 {
11297 if (VECTOR_MODE_P (mode))
11298 *cost += extra_cost->vect.alu;
11299 else if (GET_MODE_CLASS (mode) == MODE_INT)
11300 /* There is no integer SQRT, so only DIV and UDIV can get
11301 here. */
11302 *cost += (extra_cost->mult[mode == DImode].idiv
11303 /* Slightly prefer UDIV over SDIV. */
11304 + (code == DIV ? 1 : 0));
11305 else
11306 *cost += extra_cost->fp[mode == DFmode].div;
11307 }
11308 return false; /* All arguments need to be in registers. */
11309
11310 case IF_THEN_ELSE:
11311 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11312 XEXP (x, 2), cost, speed);
11313
11314 case EQ:
11315 case NE:
11316 case GT:
11317 case GTU:
11318 case LT:
11319 case LTU:
11320 case GE:
11321 case GEU:
11322 case LE:
11323 case LEU:
11324
11325 return false; /* All arguments must be in registers. */
11326
11327 case FMA:
11328 op0 = XEXP (x, 0);
11329 op1 = XEXP (x, 1);
11330 op2 = XEXP (x, 2);
11331
11332 if (speed)
11333 {
11334 if (VECTOR_MODE_P (mode))
11335 *cost += extra_cost->vect.alu;
11336 else
11337 *cost += extra_cost->fp[mode == DFmode].fma;
11338 }
11339
11340 /* FMSUB, FNMADD, and FNMSUB are free. */
11341 if (GET_CODE (op0) == NEG)
11342 op0 = XEXP (op0, 0);
11343
11344 if (GET_CODE (op2) == NEG)
11345 op2 = XEXP (op2, 0);
11346
11347 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11348 and the by-element operand as operand 0. */
11349 if (GET_CODE (op1) == NEG)
11350 op1 = XEXP (op1, 0);
11351
11352 /* Catch vector-by-element operations. The by-element operand can
11353 either be (vec_duplicate (vec_select (x))) or just
11354 (vec_select (x)), depending on whether we are multiplying by
11355 a vector or a scalar.
11356
11357 Canonicalization is not very good in these cases: FMA4 will put the
11358 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11359 if (GET_CODE (op0) == VEC_DUPLICATE)
11360 op0 = XEXP (op0, 0);
11361 else if (GET_CODE (op1) == VEC_DUPLICATE)
11362 op1 = XEXP (op1, 0);
11363
11364 if (GET_CODE (op0) == VEC_SELECT)
11365 op0 = XEXP (op0, 0);
11366 else if (GET_CODE (op1) == VEC_SELECT)
11367 op1 = XEXP (op1, 0);
11368
11369 /* If the remaining parameters are not registers,
11370 get the cost to put them into registers. */
11371 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11372 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11373 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11374 return true;
11375
11376 case FLOAT:
11377 case UNSIGNED_FLOAT:
11378 if (speed)
11379 *cost += extra_cost->fp[mode == DFmode].fromint;
11380 return false;
11381
11382 case FLOAT_EXTEND:
11383 if (speed)
11384 {
11385 if (VECTOR_MODE_P (mode))
11386 {
11387 /* Vector widening conversion. */
11388 *cost += extra_cost->vect.alu;
11389 }
11390 else
11391 *cost += extra_cost->fp[mode == DFmode].widen;
11392 }
11393 return false;
11394
11395 case FLOAT_TRUNCATE:
11396 if (speed)
11397 {
11398 if (VECTOR_MODE_P (mode))
11399 {
11400 /* Vector conversion. */
11401 *cost += extra_cost->vect.alu;
11402 }
11403 else
11404 *cost += extra_cost->fp[mode == DFmode].narrow;
11405 }
11406 return false;
11407
11408 case FIX:
11409 case UNSIGNED_FIX:
11410 x = XEXP (x, 0);
11411 /* Strip the rounding part. They will all be implemented
11412 by the fcvt* family of instructions anyway. */
11413 if (GET_CODE (x) == UNSPEC)
11414 {
11415 unsigned int uns_code = XINT (x, 1);
11416
11417 if (uns_code == UNSPEC_FRINTA
11418 || uns_code == UNSPEC_FRINTM
11419 || uns_code == UNSPEC_FRINTN
11420 || uns_code == UNSPEC_FRINTP
11421 || uns_code == UNSPEC_FRINTZ)
11422 x = XVECEXP (x, 0, 0);
11423 }
11424
11425 if (speed)
11426 {
11427 if (VECTOR_MODE_P (mode))
11428 *cost += extra_cost->vect.alu;
11429 else
11430 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11431 }
11432
11433 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11434 fixed-point fcvt. */
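/* For example (illustrative only), (fix:SI (mult:SF (reg:SF)
   (const_double 16.0))) can typically become a single
   "fcvtzs w0, s0, #4" instead of an FMUL followed by an FCVTZS. */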
11435 if (GET_CODE (x) == MULT
11436 && ((VECTOR_MODE_P (mode)
11437 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11438 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11439 {
11440 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11441 0, speed);
11442 return true;
11443 }
11444
11445 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11446 return true;
11447
11448 case ABS:
11449 if (VECTOR_MODE_P (mode))
11450 {
11451 /* ABS (vector). */
11452 if (speed)
11453 *cost += extra_cost->vect.alu;
11454 }
11455 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11456 {
11457 op0 = XEXP (x, 0);
11458
11459 /* FABD, which is analogous to FADD. */
11460 if (GET_CODE (op0) == MINUS)
11461 {
11462 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11463 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11464 if (speed)
11465 *cost += extra_cost->fp[mode == DFmode].addsub;
11466
11467 return true;
11468 }
11469 /* Simple FABS is analogous to FNEG. */
11470 if (speed)
11471 *cost += extra_cost->fp[mode == DFmode].neg;
11472 }
11473 else
11474 {
11475 /* Integer ABS will either be split into
11476 two arithmetic instructions, or will be an ABS
11477 (scalar), which we don't model. */
11478 *cost = COSTS_N_INSNS (2);
11479 if (speed)
11480 *cost += 2 * extra_cost->alu.arith;
11481 }
11482 return false;
11483
11484 case SMAX:
11485 case SMIN:
11486 if (speed)
11487 {
11488 if (VECTOR_MODE_P (mode))
11489 *cost += extra_cost->vect.alu;
11490 else
11491 {
11492 /* FMAXNM/FMINNM/FMAX/FMIN.
11493 TODO: This may not be accurate for all implementations, but
11494 we do not model this in the cost tables. */
11495 *cost += extra_cost->fp[mode == DFmode].addsub;
11496 }
11497 }
11498 return false;
11499
11500 case UNSPEC:
11501 /* The floating point round to integer frint* instructions. */
11502 if (aarch64_frint_unspec_p (XINT (x, 1)))
11503 {
11504 if (speed)
11505 *cost += extra_cost->fp[mode == DFmode].roundint;
11506
11507 return false;
11508 }
11509
11510 if (XINT (x, 1) == UNSPEC_RBIT)
11511 {
11512 if (speed)
11513 *cost += extra_cost->alu.rev;
11514
11515 return false;
11516 }
11517 break;
11518
11519 case TRUNCATE:
11520
11521 /* Decompose <su>muldi3_highpart. */
11522 if (/* (truncate:DI */
11523 mode == DImode
11524 /* (lshiftrt:TI */
11525 && GET_MODE (XEXP (x, 0)) == TImode
11526 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11527 /* (mult:TI */
11528 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11529 /* (ANY_EXTEND:TI (reg:DI))
11530 (ANY_EXTEND:TI (reg:DI))) */
11531 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11532 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11533 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11534 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11535 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11536 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11537 /* (const_int 64) */
11538 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11539 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11540 {
11541 /* UMULH/SMULH. */
11542 if (speed)
11543 *cost += extra_cost->mult[mode == DImode].extend;
11544 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11545 mode, MULT, 0, speed);
11546 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11547 mode, MULT, 1, speed);
11548 return true;
11549 }
11550
11551 /* Fall through. */
11552 default:
11553 break;
11554 }
11555
11556 if (dump_file
11557 && flag_aarch64_verbose_cost)
11558 fprintf (dump_file,
11559 "\nFailed to cost RTX. Assuming default cost.\n");
11560
11561 return true;
11562 }
11563
11564 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11565 calculated for X. This cost is stored in *COST. Returns true
11566 if the total cost of X was calculated. */
11567 static bool
11568 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11569 int param, int *cost, bool speed)
11570 {
11571 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11572
11573 if (dump_file
11574 && flag_aarch64_verbose_cost)
11575 {
11576 print_rtl_single (dump_file, x);
11577 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11578 speed ? "Hot" : "Cold",
11579 *cost, result ? "final" : "partial");
11580 }
11581
11582 return result;
11583 }
11584
11585 static int
11586 aarch64_register_move_cost (machine_mode mode,
11587 reg_class_t from_i, reg_class_t to_i)
11588 {
11589 enum reg_class from = (enum reg_class) from_i;
11590 enum reg_class to = (enum reg_class) to_i;
11591 const struct cpu_regmove_cost *regmove_cost
11592 = aarch64_tune_params.regmove_cost;
11593
11594 /* Tail-call address and pointer regs are treated as GENERAL_REGS. */
11595 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11596 to = GENERAL_REGS;
11597
11598 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11599 from = GENERAL_REGS;
11600
11601 /* Moving between a GPR and the stack register costs the same as GP2GP. */
11602 if ((from == GENERAL_REGS && to == STACK_REG)
11603 || (to == GENERAL_REGS && from == STACK_REG))
11604 return regmove_cost->GP2GP;
11605
11606 /* To/From the stack register, we move via the gprs. */
11607 if (to == STACK_REG || from == STACK_REG)
11608 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11609 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11610
11611 if (known_eq (GET_MODE_SIZE (mode), 16))
11612 {
11613 /* 128-bit operations on general registers require 2 instructions. */
11614 if (from == GENERAL_REGS && to == GENERAL_REGS)
11615 return regmove_cost->GP2GP * 2;
11616 else if (from == GENERAL_REGS)
11617 return regmove_cost->GP2FP * 2;
11618 else if (to == GENERAL_REGS)
11619 return regmove_cost->FP2GP * 2;
11620
11621 /* When AdvSIMD instructions are disabled it is not possible to move
11622 a 128-bit value directly between Q registers. This is handled in
11623 secondary reload. A general register is used as a scratch to move
11624 the upper DI value and the lower DI value is moved directly,
11625 hence the cost is the sum of three moves. */
11626 if (! TARGET_SIMD)
11627 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11628
11629 return regmove_cost->FP2FP;
11630 }
11631
11632 if (from == GENERAL_REGS && to == GENERAL_REGS)
11633 return regmove_cost->GP2GP;
11634 else if (from == GENERAL_REGS)
11635 return regmove_cost->GP2FP;
11636 else if (to == GENERAL_REGS)
11637 return regmove_cost->FP2GP;
11638
11639 return regmove_cost->FP2FP;
11640 }
11641
11642 static int
11643 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11644 reg_class_t rclass ATTRIBUTE_UNUSED,
11645 bool in ATTRIBUTE_UNUSED)
11646 {
11647 return aarch64_tune_params.memmov_cost;
11648 }
11649
11650 /* Implement TARGET_INIT_BUILTINS. */
11651 static void
11652 aarch64_init_builtins ()
11653 {
11654 aarch64_general_init_builtins ();
11655 }
11656
11657 /* Implement TARGET_FOLD_BUILTIN. */
11658 static tree
11659 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
11660 {
11661 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11662 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11663 tree type = TREE_TYPE (TREE_TYPE (fndecl));
11664 switch (code & AARCH64_BUILTIN_CLASS)
11665 {
11666 case AARCH64_BUILTIN_GENERAL:
11667 return aarch64_general_fold_builtin (subcode, type, nargs, args);
11668 }
11669 gcc_unreachable ();
11670 }
11671
11672 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
11673 static bool
11674 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
11675 {
11676 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
11677 tree fndecl = gimple_call_fndecl (stmt);
11678 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11679 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11680 gimple *new_stmt = NULL;
11681 switch (code & AARCH64_BUILTIN_CLASS)
11682 {
11683 case AARCH64_BUILTIN_GENERAL:
11684 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
11685 break;
11686 }
11687
11688 if (!new_stmt)
11689 return false;
11690
11691 gsi_replace (gsi, new_stmt, true);
11692 return true;
11693 }
11694
11695 /* Implement TARGET_EXPAND_BUILTIN. */
11696 static rtx
11697 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int)
11698 {
11699 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11700 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11701 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11702 switch (code & AARCH64_BUILTIN_CLASS)
11703 {
11704 case AARCH64_BUILTIN_GENERAL:
11705 return aarch64_general_expand_builtin (subcode, exp, target);
11706 }
11707 gcc_unreachable ();
11708 }
11709
11710 /* Implement TARGET_BUILTIN_DECL. */
11711 static tree
11712 aarch64_builtin_decl (unsigned int code, bool initialize_p)
11713 {
11714 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11715 switch (code & AARCH64_BUILTIN_CLASS)
11716 {
11717 case AARCH64_BUILTIN_GENERAL:
11718 return aarch64_general_builtin_decl (subcode, initialize_p);
11719 }
11720 gcc_unreachable ();
11721 }
11722
11723 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11724 to optimize 1.0/sqrt. */
11725
11726 static bool
11727 use_rsqrt_p (machine_mode mode)
11728 {
11729 return (!flag_trapping_math
11730 && flag_unsafe_math_optimizations
11731 && ((aarch64_tune_params.approx_modes->recip_sqrt
11732 & AARCH64_APPROX_MODE (mode))
11733 || flag_mrecip_low_precision_sqrt));
11734 }
11735
11736 /* Function to decide when to use the approximate reciprocal square root
11737 builtin. */
11738
11739 static tree
11740 aarch64_builtin_reciprocal (tree fndecl)
11741 {
11742 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11743
11744 if (!use_rsqrt_p (mode))
11745 return NULL_TREE;
11746 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11747 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11748 switch (code & AARCH64_BUILTIN_CLASS)
11749 {
11750 case AARCH64_BUILTIN_GENERAL:
11751 return aarch64_general_builtin_rsqrt (subcode);
11752 }
11753 gcc_unreachable ();
11754 }
11755
11756 /* Emit instruction sequence to compute either the approximate square root
11757 or its approximate reciprocal, depending on the flag RECP, and return
11758 whether the sequence was emitted or not. */
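/* Sketch of the scheme used below: FRSQRTE gives an initial estimate
   of 1/sqrt(d), and each FRSQRTS step computes (3 - d * x * x) / 2,
   so one Newton-Raphson iteration is roughly
   x_{n+1} = x_n * (3 - d * x_n^2) / 2; two iterations are used for SF
   and three for DF. */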
11759
11760 bool
11761 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11762 {
11763 machine_mode mode = GET_MODE (dst);
11764
11765 if (GET_MODE_INNER (mode) == HFmode)
11766 {
11767 gcc_assert (!recp);
11768 return false;
11769 }
11770
11771 if (!recp)
11772 {
11773 if (!(flag_mlow_precision_sqrt
11774 || (aarch64_tune_params.approx_modes->sqrt
11775 & AARCH64_APPROX_MODE (mode))))
11776 return false;
11777
11778 if (flag_finite_math_only
11779 || flag_trapping_math
11780 || !flag_unsafe_math_optimizations
11781 || optimize_function_for_size_p (cfun))
11782 return false;
11783 }
11784 else
11785 /* Caller assumes we cannot fail. */
11786 gcc_assert (use_rsqrt_p (mode));
11787
11788 machine_mode mmsk = mode_for_int_vector (mode).require ();
11789 rtx xmsk = gen_reg_rtx (mmsk);
11790 if (!recp)
11791 /* When calculating the approximate square root, compare the
11792 argument with 0.0 and create a mask. */
11793 emit_insn (gen_rtx_SET (xmsk,
11794 gen_rtx_NEG (mmsk,
11795 gen_rtx_EQ (mmsk, src,
11796 CONST0_RTX (mode)))));
11797
11798 /* Estimate the approximate reciprocal square root. */
11799 rtx xdst = gen_reg_rtx (mode);
11800 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11801
11802 /* Iterate over the series twice for SF and thrice for DF. */
11803 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11804
11805 /* Optionally iterate over the series once less for faster performance
11806 at the cost of some accuracy. */
11807 if ((recp && flag_mrecip_low_precision_sqrt)
11808 || (!recp && flag_mlow_precision_sqrt))
11809 iterations--;
11810
11811 /* Iterate over the series to calculate the approximate reciprocal square
11812 root. */
11813 rtx x1 = gen_reg_rtx (mode);
11814 while (iterations--)
11815 {
11816 rtx x2 = gen_reg_rtx (mode);
11817 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11818
11819 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11820
11821 if (iterations > 0)
11822 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11823 }
11824
11825 if (!recp)
11826 {
11827 /* Qualify the approximate reciprocal square root when the argument is
11828 0.0 by squashing the intermediary result to 0.0. */
11829 rtx xtmp = gen_reg_rtx (mmsk);
11830 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11831 gen_rtx_SUBREG (mmsk, xdst, 0)));
11832 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11833
11834 /* Calculate the approximate square root. */
11835 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11836 }
11837
11838 /* Finalize the approximation. */
11839 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11840
11841 return true;
11842 }
11843
11844 /* Emit the instruction sequence to compute the approximation for the division
11845 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
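/* Sketch of the scheme used below: FRECPE gives an initial estimate of
   1/DEN, and each FRECPS step computes (2 - d * x), giving the
   Newton-Raphson iteration x_{n+1} = x_n * (2 - d * x_n); the quotient
   is then formed as NUM * (1/DEN). */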
11846
11847 bool
11848 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11849 {
11850 machine_mode mode = GET_MODE (quo);
11851
11852 if (GET_MODE_INNER (mode) == HFmode)
11853 return false;
11854
11855 bool use_approx_division_p = (flag_mlow_precision_div
11856 || (aarch64_tune_params.approx_modes->division
11857 & AARCH64_APPROX_MODE (mode)));
11858
11859 if (!flag_finite_math_only
11860 || flag_trapping_math
11861 || !flag_unsafe_math_optimizations
11862 || optimize_function_for_size_p (cfun)
11863 || !use_approx_division_p)
11864 return false;
11865
11866 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11867 return false;
11868
11869 /* Estimate the approximate reciprocal. */
11870 rtx xrcp = gen_reg_rtx (mode);
11871 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11872
11873 /* Iterate over the series twice for SF and thrice for DF. */
11874 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11875
11876 /* Optionally iterate over the series once less for faster performance,
11877 at the cost of some accuracy. */
11878 if (flag_mlow_precision_div)
11879 iterations--;
11880
11881 /* Iterate over the series to calculate the approximate reciprocal. */
11882 rtx xtmp = gen_reg_rtx (mode);
11883 while (iterations--)
11884 {
11885 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11886
11887 if (iterations > 0)
11888 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11889 }
11890
11891 if (num != CONST1_RTX (mode))
11892 {
11893 /* As the approximate reciprocal of DEN is already calculated, only
11894 calculate the approximate division when NUM is not 1.0. */
11895 rtx xnum = force_reg (mode, num);
11896 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11897 }
11898
11899 /* Finalize the approximation. */
11900 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11901 return true;
11902 }
11903
11904 /* Return the number of instructions that can be issued per cycle. */
11905 static int
11906 aarch64_sched_issue_rate (void)
11907 {
11908 return aarch64_tune_params.issue_rate;
11909 }
11910
11911 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11912 static int
11913 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
11914 {
11915 if (DEBUG_INSN_P (insn))
11916 return more;
11917
11918 rtx_code code = GET_CODE (PATTERN (insn));
11919 if (code == USE || code == CLOBBER)
11920 return more;
11921
11922 if (get_attr_type (insn) == TYPE_NO_INSN)
11923 return more;
11924
11925 return more - 1;
11926 }
11927
11928 static int
11929 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11930 {
11931 int issue_rate = aarch64_sched_issue_rate ();
11932
11933 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11934 }
11935
11936
11937 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11938 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11939 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11940
11941 static int
11942 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11943 int ready_index)
11944 {
11945 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11946 }
11947
11948
11949 /* Vectorizer cost model target hooks. */
11950
11951 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11952 static int
11953 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11954 tree vectype,
11955 int misalign ATTRIBUTE_UNUSED)
11956 {
11957 unsigned elements;
11958 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11959 bool fp = false;
11960
11961 if (vectype != NULL)
11962 fp = FLOAT_TYPE_P (vectype);
11963
11964 switch (type_of_cost)
11965 {
11966 case scalar_stmt:
11967 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11968
11969 case scalar_load:
11970 return costs->scalar_load_cost;
11971
11972 case scalar_store:
11973 return costs->scalar_store_cost;
11974
11975 case vector_stmt:
11976 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11977
11978 case vector_load:
11979 return costs->vec_align_load_cost;
11980
11981 case vector_store:
11982 return costs->vec_store_cost;
11983
11984 case vec_to_scalar:
11985 return costs->vec_to_scalar_cost;
11986
11987 case scalar_to_vec:
11988 return costs->scalar_to_vec_cost;
11989
11990 case unaligned_load:
11991 case vector_gather_load:
11992 return costs->vec_unalign_load_cost;
11993
11994 case unaligned_store:
11995 case vector_scatter_store:
11996 return costs->vec_unalign_store_cost;
11997
11998 case cond_branch_taken:
11999 return costs->cond_taken_branch_cost;
12000
12001 case cond_branch_not_taken:
12002 return costs->cond_not_taken_branch_cost;
12003
12004 case vec_perm:
12005 return costs->vec_permute_cost;
12006
12007 case vec_promote_demote:
12008 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12009
12010 case vec_construct:
12011 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12012 return elements / 2 + 1;
12013
12014 default:
12015 gcc_unreachable ();
12016 }
12017 }
12018
12019 /* Implement targetm.vectorize.add_stmt_cost. */
12020 static unsigned
12021 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
12022 struct _stmt_vec_info *stmt_info, int misalign,
12023 enum vect_cost_model_location where)
12024 {
12025 unsigned *cost = (unsigned *) data;
12026 unsigned retval = 0;
12027
12028 if (flag_vect_cost_model)
12029 {
12030 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
12031 int stmt_cost =
12032 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
12033
12034 /* Statements in an inner loop relative to the loop being
12035 vectorized are weighted more heavily. The value here is
12036 arbitrary and could potentially be improved with analysis. */
12037 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
12038 count *= 50; /* FIXME */
12039
12040 retval = (unsigned) (count * stmt_cost);
12041 cost[where] += retval;
12042 }
12043
12044 return retval;
12045 }
12046
12047 static void initialize_aarch64_code_model (struct gcc_options *);
12048
12049 /* Parse the TO_PARSE string and put the architecture struct that it
12050 selects into RES and the architectural features into ISA_FLAGS.
12051 Return an aarch64_parse_opt_result describing the parse result.
12052 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12053 When the TO_PARSE string contains an invalid extension,
12054 a copy of the string is created and stored to INVALID_EXTENSION. */
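/* For example (illustrative only), a TO_PARSE string of
   "armv8.2-a+crypto" selects the armv8.2-a entry from
   all_architectures and then folds the "+crypto" suffix into
   ISA_FLAGS via aarch64_parse_extension. */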
12055
12056 static enum aarch64_parse_opt_result
12057 aarch64_parse_arch (const char *to_parse, const struct processor **res,
12058 uint64_t *isa_flags, std::string *invalid_extension)
12059 {
12060 const char *ext;
12061 const struct processor *arch;
12062 size_t len;
12063
12064 ext = strchr (to_parse, '+');
12065
12066 if (ext != NULL)
12067 len = ext - to_parse;
12068 else
12069 len = strlen (to_parse);
12070
12071 if (len == 0)
12072 return AARCH64_PARSE_MISSING_ARG;
12073
12074
12075 /* Loop through the list of supported ARCHes to find a match. */
12076 for (arch = all_architectures; arch->name != NULL; arch++)
12077 {
12078 if (strlen (arch->name) == len
12079 && strncmp (arch->name, to_parse, len) == 0)
12080 {
12081 uint64_t isa_temp = arch->flags;
12082
12083 if (ext != NULL)
12084 {
12085 /* TO_PARSE string contains at least one extension. */
12086 enum aarch64_parse_opt_result ext_res
12087 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12088
12089 if (ext_res != AARCH64_PARSE_OK)
12090 return ext_res;
12091 }
12092 /* Extension parsing was successful. Record the resulting
12093 arch and ISA flags. */
12094 *res = arch;
12095 *isa_flags = isa_temp;
12096 return AARCH64_PARSE_OK;
12097 }
12098 }
12099
12100 /* ARCH name not found in list. */
12101 return AARCH64_PARSE_INVALID_ARG;
12102 }
12103
12104 /* Parse the TO_PARSE string and put the result tuning in RES and the
12105 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12106 describing the parse result. If there is an error parsing, RES and
12107 ISA_FLAGS are left unchanged.
12108 When the TO_PARSE string contains an invalid extension,
12109 a copy of the string is created and stored to INVALID_EXTENSION. */
12110
12111 static enum aarch64_parse_opt_result
12112 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12113 uint64_t *isa_flags, std::string *invalid_extension)
12114 {
12115 const char *ext;
12116 const struct processor *cpu;
12117 size_t len;
12118
12119 ext = strchr (to_parse, '+');
12120
12121 if (ext != NULL)
12122 len = ext - to_parse;
12123 else
12124 len = strlen (to_parse);
12125
12126 if (len == 0)
12127 return AARCH64_PARSE_MISSING_ARG;
12128
12129
12130 /* Loop through the list of supported CPUs to find a match. */
12131 for (cpu = all_cores; cpu->name != NULL; cpu++)
12132 {
12133 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12134 {
12135 uint64_t isa_temp = cpu->flags;
12136
12137
12138 if (ext != NULL)
12139 {
12140 /* TO_PARSE string contains at least one extension. */
12141 enum aarch64_parse_opt_result ext_res
12142 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12143
12144 if (ext_res != AARCH64_PARSE_OK)
12145 return ext_res;
12146 }
12147 /* Extension parsing was successful. Record the resulting
12148 cpu and ISA flags. */
12149 *res = cpu;
12150 *isa_flags = isa_temp;
12151 return AARCH64_PARSE_OK;
12152 }
12153 }
12154
12155 /* CPU name not found in list. */
12156 return AARCH64_PARSE_INVALID_ARG;
12157 }
12158
12159 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12160 Return an aarch64_parse_opt_result describing the parse result.
12161 If the parsing fails the RES does not change. */
12162
12163 static enum aarch64_parse_opt_result
12164 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12165 {
12166 const struct processor *cpu;
12167
12168 /* Loop through the list of supported CPUs to find a match. */
12169 for (cpu = all_cores; cpu->name != NULL; cpu++)
12170 {
12171 if (strcmp (cpu->name, to_parse) == 0)
12172 {
12173 *res = cpu;
12174 return AARCH64_PARSE_OK;
12175 }
12176 }
12177
12178 /* CPU name not found in list. */
12179 return AARCH64_PARSE_INVALID_ARG;
12180 }
12181
12182 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12183 described in FLAG. If it is, return the index bit for that fusion type.
12184 If not, report an error (printing OPTION_NAME) and return zero. */
12185
12186 static unsigned int
12187 aarch64_parse_one_option_token (const char *token,
12188 size_t length,
12189 const struct aarch64_flag_desc *flag,
12190 const char *option_name)
12191 {
12192 for (; flag->name != NULL; flag++)
12193 {
12194 if (length == strlen (flag->name)
12195 && !strncmp (flag->name, token, length))
12196 return flag->flag;
12197 }
12198
12199 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12200 return 0;
12201 }
12202
12203 /* Parse OPTION which is a comma-separated list of flags to enable.
12204 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12205 default state we inherit from the CPU tuning structures. OPTION_NAME
12206 gives the top-level option we are parsing in the -moverride string,
12207 for use in error messages. */
12208
12209 static unsigned int
12210 aarch64_parse_boolean_options (const char *option,
12211 const struct aarch64_flag_desc *flags,
12212 unsigned int initial_state,
12213 const char *option_name)
12214 {
12215 const char separator = '.';
12216 const char* specs = option;
12217 const char* ntoken = option;
12218 unsigned int found_flags = initial_state;
12219
12220 while ((ntoken = strchr (specs, separator)))
12221 {
12222 size_t token_length = ntoken - specs;
12223 unsigned token_ops = aarch64_parse_one_option_token (specs,
12224 token_length,
12225 flags,
12226 option_name);
12227 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12228 in the token stream, reset the supported operations. So:
12229
12230 adrp+add.cmp+branch.none.adrp+add
12231
12232 would have the result of turning on only adrp+add fusion. */
12233 if (!token_ops)
12234 found_flags = 0;
12235
12236 found_flags |= token_ops;
12237 specs = ++ntoken;
12238 }
12239
12240 /* The string ended with a trailing separator, so it is ill-formed. */
12241 if (!(*specs))
12242 {
12243 error ("%s string ill-formed\n", option_name);
12244 return 0;
12245 }
12246
12247 /* We still have one more token to parse. */
12248 size_t token_length = strlen (specs);
12249 unsigned token_ops = aarch64_parse_one_option_token (specs,
12250 token_length,
12251 flags,
12252 option_name);
12253 if (!token_ops)
12254 found_flags = 0;
12255
12256 found_flags |= token_ops;
12257 return found_flags;
12258 }
12259
12260 /* Support for overriding instruction fusion. */
12261
12262 static void
12263 aarch64_parse_fuse_string (const char *fuse_string,
12264 struct tune_params *tune)
12265 {
12266 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12267 aarch64_fusible_pairs,
12268 tune->fusible_ops,
12269 "fuse=");
12270 }
12271
12272 /* Support for overriding other tuning flags. */
12273
12274 static void
12275 aarch64_parse_tune_string (const char *tune_string,
12276 struct tune_params *tune)
12277 {
12278 tune->extra_tuning_flags
12279 = aarch64_parse_boolean_options (tune_string,
12280 aarch64_tuning_flags,
12281 tune->extra_tuning_flags,
12282 "tune=");
12283 }
12284
12285 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
12286 Accept only the valid SVE vector widths allowed by
12287 aarch64_sve_vector_bits_enum and use the value to override sve_width
12288 in TUNE. */
12289
12290 static void
12291 aarch64_parse_sve_width_string (const char *tune_string,
12292 struct tune_params *tune)
12293 {
12294 int width = -1;
12295
12296 int n = sscanf (tune_string, "%d", &width);
12297 if (n == EOF)
12298 {
12299 error ("invalid format for sve_width");
12300 return;
12301 }
12302 switch (width)
12303 {
12304 case SVE_128:
12305 case SVE_256:
12306 case SVE_512:
12307 case SVE_1024:
12308 case SVE_2048:
12309 break;
12310 default:
12311 error ("invalid sve_width value: %d", width);
12312 }
12313 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12314 }
12315
12316 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12317 we understand. If it is, extract the option string and hand it off to
12318 the appropriate function. */
12319
12320 void
12321 aarch64_parse_one_override_token (const char* token,
12322 size_t length,
12323 struct tune_params *tune)
12324 {
12325 const struct aarch64_tuning_override_function *fn
12326 = aarch64_tuning_override_functions;
12327
12328 const char *option_part = strchr (token, '=');
12329 if (!option_part)
12330 {
12331 error ("tuning string missing in option (%s)", token);
12332 return;
12333 }
12334
12335 /* Get the length of the option name. */
12336 length = option_part - token;
12337 /* Skip the '=' to get to the option string. */
12338 option_part++;
12339
12340 for (; fn->name != NULL; fn++)
12341 {
12342 if (!strncmp (fn->name, token, length))
12343 {
12344 fn->parse_override (option_part, tune);
12345 return;
12346 }
12347 }
12348
12349 error ("unknown tuning option (%s)",token);
12350 return;
12351 }
12352
12353 /* Validate and clamp the TLS size according to the code model in OPTS. */
12354
12355 static void
12356 initialize_aarch64_tls_size (struct gcc_options *opts)
12357 {
12358 if (aarch64_tls_size == 0)
12359 aarch64_tls_size = 24;
12360
12361 switch (opts->x_aarch64_cmodel_var)
12362 {
12363 case AARCH64_CMODEL_TINY:
12364 /* Both the default and the maximum TLS size allowed under tiny are 1M,
12365 which needs two instructions to address, so we clamp the size to 24. */
12366 if (aarch64_tls_size > 24)
12367 aarch64_tls_size = 24;
12368 break;
12369 case AARCH64_CMODEL_SMALL:
12370 /* The maximum TLS size allowed under small is 4G. */
12371 if (aarch64_tls_size > 32)
12372 aarch64_tls_size = 32;
12373 break;
12374 case AARCH64_CMODEL_LARGE:
12375 /* The maximum TLS size allowed under large is 16E.
12376 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12377 if (aarch64_tls_size > 48)
12378 aarch64_tls_size = 48;
12379 break;
12380 default:
12381 gcc_unreachable ();
12382 }
12383
12384 return;
12385 }
12386
12387 /* Parse STRING looking for options in the format:
12388 string :: option:string
12389 option :: name=substring
12390 name :: {a-z}
12391 substring :: defined by option. */
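/* For example (illustrative only, using flag names that appear in the
   comments elsewhere in this file):

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   enables two fusion pairs and overrides the assumed SVE width. */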
12392
12393 static void
12394 aarch64_parse_override_string (const char* input_string,
12395 struct tune_params* tune)
12396 {
12397 const char separator = ':';
12398 size_t string_length = strlen (input_string) + 1;
12399 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12400 char *string = string_root;
12401 strncpy (string, input_string, string_length);
12402 string[string_length - 1] = '\0';
12403
12404 char* ntoken = string;
12405
12406 while ((ntoken = strchr (string, separator)))
12407 {
12408 size_t token_length = ntoken - string;
12409 /* Make this substring look like a string. */
12410 *ntoken = '\0';
12411 aarch64_parse_one_override_token (string, token_length, tune);
12412 string = ++ntoken;
12413 }
12414
12415 /* One last option to parse. */
12416 aarch64_parse_one_override_token (string, strlen (string), tune);
12417 free (string_root);
12418 }
12419
12420
12421 static void
12422 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12423 {
12424 if (accepted_branch_protection_string)
12425 {
12426 opts->x_aarch64_branch_protection_string
12427 = xstrdup (accepted_branch_protection_string);
12428 }
12429
12430 /* PR 70044: We have to be careful about being called multiple times for the
12431 same function. This means all changes should be repeatable. */
12432
12433 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12434 Disable the frame pointer flag so the mid-end will not use a frame
12435 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12436 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12437 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12438 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12439 if (opts->x_flag_omit_frame_pointer == 0)
12440 opts->x_flag_omit_frame_pointer = 2;
12441
12442 /* If not optimizing for size, set the default
12443 alignment to what the target wants. */
12444 if (!opts->x_optimize_size)
12445 {
12446 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12447 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12448 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12449 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12450 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12451 opts->x_str_align_functions = aarch64_tune_params.function_align;
12452 }
12453
12454 /* We default to no pc-relative literal loads. */
12455
12456 aarch64_pcrelative_literal_loads = false;
12457
12458 /* If -mpc-relative-literal-loads is set on the command line, this
12459 implies that the user asked for PC relative literal loads. */
12460 if (opts->x_pcrelative_literal_loads == 1)
12461 aarch64_pcrelative_literal_loads = true;
12462
12463 /* In the tiny memory model it makes no sense to disallow PC relative
12464 literal pool loads. */
12465 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12466 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12467 aarch64_pcrelative_literal_loads = true;
12468
12469 /* When enabling the lower precision Newton series for the square root, also
12470 enable it for the reciprocal square root, since the latter is an
12471 intermediary step for the former. */
12472 if (flag_mlow_precision_sqrt)
12473 flag_mrecip_low_precision_sqrt = true;
12474 }
12475
12476 /* 'Unpack' the internal tuning structs and update the options
12477 in OPTS. The caller must have set up selected_tune and selected_arch
12478 as all the other target-specific codegen decisions are
12479 derived from them. */
12480
12481 void
12482 aarch64_override_options_internal (struct gcc_options *opts)
12483 {
12484 aarch64_tune_flags = selected_tune->flags;
12485 aarch64_tune = selected_tune->sched_core;
12486 /* Make a copy of the tuning parameters attached to the core, which
12487 we may later overwrite. */
12488 aarch64_tune_params = *(selected_tune->tune);
12489 aarch64_architecture_version = selected_arch->architecture_version;
12490
12491 if (opts->x_aarch64_override_tune_string)
12492 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12493 &aarch64_tune_params);
12494
12495 /* This target defaults to strict volatile bitfields. */
12496 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12497 opts->x_flag_strict_volatile_bitfields = 1;
12498
12499 if (aarch64_stack_protector_guard == SSP_GLOBAL
12500 && opts->x_aarch64_stack_protector_guard_offset_str)
12501 {
12502 error ("incompatible options %<-mstack-protector-guard=global%> and "
12503 "%<-mstack-protector-guard-offset=%s%>",
12504 aarch64_stack_protector_guard_offset_str);
12505 }
12506
12507 if (aarch64_stack_protector_guard == SSP_SYSREG
12508 && !(opts->x_aarch64_stack_protector_guard_offset_str
12509 && opts->x_aarch64_stack_protector_guard_reg_str))
12510 {
12511 error ("both %<-mstack-protector-guard-offset%> and "
12512 "%<-mstack-protector-guard-reg%> must be used "
12513 "with %<-mstack-protector-guard=sysreg%>");
12514 }
12515
12516 if (opts->x_aarch64_stack_protector_guard_reg_str)
12517 {
12518 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12519 error ("specify a system register with a small string length");
12520 }
12521
12522 if (opts->x_aarch64_stack_protector_guard_offset_str)
12523 {
12524 char *end;
12525 const char *str = aarch64_stack_protector_guard_offset_str;
12526 errno = 0;
12527 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12528 if (!*str || *end || errno)
12529 error ("%qs is not a valid offset in %qs", str,
12530 "-mstack-protector-guard-offset=");
12531 aarch64_stack_protector_guard_offset = offs;
12532 }
12533
12534 initialize_aarch64_code_model (opts);
12535 initialize_aarch64_tls_size (opts);
12536
12537 int queue_depth = 0;
12538 switch (aarch64_tune_params.autoprefetcher_model)
12539 {
12540 case tune_params::AUTOPREFETCHER_OFF:
12541 queue_depth = -1;
12542 break;
12543 case tune_params::AUTOPREFETCHER_WEAK:
12544 queue_depth = 0;
12545 break;
12546 case tune_params::AUTOPREFETCHER_STRONG:
12547 queue_depth = max_insn_queue_index + 1;
12548 break;
12549 default:
12550 gcc_unreachable ();
12551 }
12552
12553 /* We don't mind passing in global_options_set here as we don't use
12554 the *options_set structs anyway. */
12555 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12556 queue_depth,
12557 opts->x_param_values,
12558 global_options_set.x_param_values);
12559
12560 /* Set up parameters to be used in prefetching algorithm. Do not
12561 override the defaults unless we are tuning for a core we have
12562 researched values for. */
12563 if (aarch64_tune_params.prefetch->num_slots > 0)
12564 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12565 aarch64_tune_params.prefetch->num_slots,
12566 opts->x_param_values,
12567 global_options_set.x_param_values);
12568 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12569 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12570 aarch64_tune_params.prefetch->l1_cache_size,
12571 opts->x_param_values,
12572 global_options_set.x_param_values);
12573 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12574 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12575 aarch64_tune_params.prefetch->l1_cache_line_size,
12576 opts->x_param_values,
12577 global_options_set.x_param_values);
12578 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12579 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12580 aarch64_tune_params.prefetch->l2_cache_size,
12581 opts->x_param_values,
12582 global_options_set.x_param_values);
12583 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12584 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12585 0,
12586 opts->x_param_values,
12587 global_options_set.x_param_values);
12588 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12589 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12590 aarch64_tune_params.prefetch->minimum_stride,
12591 opts->x_param_values,
12592 global_options_set.x_param_values);
12593
12594 /* Use the alternative scheduling-pressure algorithm by default. */
12595 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12596 opts->x_param_values,
12597 global_options_set.x_param_values);
12598
12599 /* If the user hasn't changed it via configure then set the default to 64 KB
12600 for the backend. */
12601 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12602 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12603 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12604 opts->x_param_values,
12605 global_options_set.x_param_values);
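  /* Both this parameter and the probing interval below are expressed as a
     power of two, i.e. the base-two logarithm of the size in bytes, so the
     default of 16 used above corresponds to 2^16 bytes = 64 KB, matching
     the comment.  */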
12606
12607 /* Validate the guard size. */
12608 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12609
12610 /* Enforce that the probing interval is the same as the guard size so
12611 that the mid-end does the right thing. */
12612 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12613 guard_size,
12614 opts->x_param_values,
12615 global_options_set.x_param_values);
12616
12617 /* The maybe_set calls won't update the value if the user has explicitly set
12618 one. This means we need to validate that the probing interval and the
12619 guard size are equal. */
12620 int probe_interval
12621 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12622 if (guard_size != probe_interval)
12623 error ("stack clash guard size %<%d%> must be equal to probing interval "
12624 "%<%d%>", guard_size, probe_interval);
12625
12626 /* Enable software prefetching at the specified optimization level for
12627 CPUs that have prefetch. Lower the optimization level threshold by 1
12628 when profiling is enabled. */
12629 if (opts->x_flag_prefetch_loop_arrays < 0
12630 && !opts->x_optimize_size
12631 && aarch64_tune_params.prefetch->default_opt_level >= 0
12632 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12633 opts->x_flag_prefetch_loop_arrays = 1;
12634
12635 if (opts->x_aarch64_arch_string == NULL)
12636 opts->x_aarch64_arch_string = selected_arch->name;
12637 if (opts->x_aarch64_cpu_string == NULL)
12638 opts->x_aarch64_cpu_string = selected_cpu->name;
12639 if (opts->x_aarch64_tune_string == NULL)
12640 opts->x_aarch64_tune_string = selected_tune->name;
12641
12642 aarch64_override_options_after_change_1 (opts);
12643 }
12644
12645 /* Print a hint with a suggestion for a core or architecture name that
12646 most closely resembles what the user passed in STR. ARCH is true if
12647 the user is asking for an architecture name. ARCH is false if the user
12648 is asking for a core name. */
12649
12650 static void
12651 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12652 {
12653 auto_vec<const char *> candidates;
12654 const struct processor *entry = arch ? all_architectures : all_cores;
12655 for (; entry->name != NULL; entry++)
12656 candidates.safe_push (entry->name);
12657
12658 #ifdef HAVE_LOCAL_CPU_DETECT
12659 /* Also add "native" as a possible value. */
12660 if (arch)
12661 candidates.safe_push ("native");
12662 #endif
12663
12664 char *s;
12665 const char *hint = candidates_list_and_hint (str, s, candidates);
12666 if (hint)
12667 inform (input_location, "valid arguments are: %s;"
12668 " did you mean %qs?", s, hint);
12669 else
12670 inform (input_location, "valid arguments are: %s", s);
12671
12672 XDELETEVEC (s);
12673 }
12674
12675 /* Print a hint with a suggestion for a core name that most closely resembles
12676 what the user passed in STR. */
12677
12678 inline static void
12679 aarch64_print_hint_for_core (const char *str)
12680 {
12681 aarch64_print_hint_for_core_or_arch (str, false);
12682 }
12683
12684 /* Print a hint with a suggestion for an architecture name that most closely
12685 resembles what the user passed in STR. */
12686
12687 inline static void
12688 aarch64_print_hint_for_arch (const char *str)
12689 {
12690 aarch64_print_hint_for_core_or_arch (str, true);
12691 }
12692
12693
12694 /* Print a hint with a suggestion for an extension name
12695 that most closely resembles what the user passed in STR. */
12696
12697 void
12698 aarch64_print_hint_for_extensions (const std::string &str)
12699 {
12700 auto_vec<const char *> candidates;
12701 aarch64_get_all_extension_candidates (&candidates);
12702 char *s;
12703 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12704 if (hint)
12705 inform (input_location, "valid arguments are: %s;"
12706 " did you mean %qs?", s, hint);
12707 else
12708 inform (input_location, "valid arguments are: %s", s);
12709
12710 XDELETEVEC (s);
12711 }
12712
12713 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12714 specified in STR and throw errors if appropriate. Put the results,
12715 if they are valid, in RES and ISA_FLAGS. Return whether the option is
12716 valid. */
12717
12718 static bool
12719 aarch64_validate_mcpu (const char *str, const struct processor **res,
12720 uint64_t *isa_flags)
12721 {
12722 std::string invalid_extension;
12723 enum aarch64_parse_opt_result parse_res
12724 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12725
12726 if (parse_res == AARCH64_PARSE_OK)
12727 return true;
12728
12729 switch (parse_res)
12730 {
12731 case AARCH64_PARSE_MISSING_ARG:
12732 error ("missing cpu name in %<-mcpu=%s%>", str);
12733 break;
12734 case AARCH64_PARSE_INVALID_ARG:
12735 error ("unknown value %qs for %<-mcpu%>", str);
12736 aarch64_print_hint_for_core (str);
12737 break;
12738 case AARCH64_PARSE_INVALID_FEATURE:
12739 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12740 invalid_extension.c_str (), str);
12741 aarch64_print_hint_for_extensions (invalid_extension);
12742 break;
12743 default:
12744 gcc_unreachable ();
12745 }
12746
12747 return false;
12748 }
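/* As a concrete example, "-mcpu=cortex-a57+nofp" names a core plus a valid
   feature modifier and parses successfully, whereas a hypothetical
   misspelling such as "-mcpu=cortex-a57+nofpu" takes the
   AARCH64_PARSE_INVALID_FEATURE path above, which reports the bad modifier
   and prints a hint listing valid extension names.  */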
12749
12750 /* Parse CONST_STR for the branch protection features specified in
12751 aarch64_branch_protect_types, and set any global variables required.
12752 Return the parsing result and copy the last processed token from
12753 CONST_STR into LAST_STR so that it can be used for error reporting. */
12754
12755 static enum
12756 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12757 char** last_str)
12758 {
12759 char *str_root = xstrdup (const_str);
12760 char* token_save = NULL;
12761 char *str = strtok_r (str_root, "+", &token_save);
12762 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12763 if (!str)
12764 res = AARCH64_PARSE_MISSING_ARG;
12765 else
12766 {
12767 char *next_str = strtok_r (NULL, "+", &token_save);
12768 /* Reset the branch protection features to their defaults. */
12769 aarch64_handle_no_branch_protection (NULL, NULL);
12770
12771 while (str && res == AARCH64_PARSE_OK)
12772 {
12773 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12774 bool found = false;
12775 /* Search for this type. */
12776 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12777 {
12778 if (strcmp (str, type->name) == 0)
12779 {
12780 found = true;
12781 res = type->handler (str, next_str);
12782 str = next_str;
12783 next_str = strtok_r (NULL, "+", &token_save);
12784 }
12785 else
12786 type++;
12787 }
12788 if (found && res == AARCH64_PARSE_OK)
12789 {
12790 bool found_subtype = true;
12791 /* Loop through each token until we find one that isn't a
12792 subtype. */
12793 while (found_subtype)
12794 {
12795 found_subtype = false;
12796 const aarch64_branch_protect_type *subtype = type->subtypes;
12797 /* Search for the subtype. */
12798 while (str && subtype && subtype->name && !found_subtype
12799 && res == AARCH64_PARSE_OK)
12800 {
12801 if (strcmp (str, subtype->name) == 0)
12802 {
12803 found_subtype = true;
12804 res = subtype->handler (str, next_str);
12805 str = next_str;
12806 next_str = strtok_r (NULL, "+", &token_save);
12807 }
12808 else
12809 subtype++;
12810 }
12811 }
12812 }
12813 else if (!found)
12814 res = AARCH64_PARSE_INVALID_ARG;
12815 }
12816 }
12817 /* Copy the last processed token into the argument to pass it back.
12818 Used by option and attribute validation to print the offending token. */
12819 if (last_str)
12820 {
12821 if (str) strcpy (*last_str, str);
12822 else *last_str = NULL;
12823 }
12824 if (res == AARCH64_PARSE_OK)
12825 {
12826 /* If needed, allocate the accepted string, then copy in const_str.
12827 Used by aarch64_override_options_after_change_1. */
12828 if (!accepted_branch_protection_string)
12829 accepted_branch_protection_string = (char *) xmalloc (
12830 BRANCH_PROTECT_STR_MAX
12831 + 1);
12832 strncpy (accepted_branch_protection_string, const_str,
12833 BRANCH_PROTECT_STR_MAX + 1);
12834 /* Forcibly null-terminate. */
12835 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12836 }
12837 return res;
12838 }
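/* For example, "standard" and "pac-ret+leaf+bti" both parse successfully:
   "pac-ret" matches a top-level type, "leaf" matches one of its subtypes
   and "bti" matches another top-level type, with each handler invoked in
   turn.  A token that matches nothing, such as the hypothetical typo
   "pac-rte", yields AARCH64_PARSE_INVALID_ARG and is passed back through
   LAST_STR for error reporting.  */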
12839
12840 static bool
12841 aarch64_validate_mbranch_protection (const char *const_str)
12842 {
12843 char *str = (char *) xmalloc (strlen (const_str) + 1);
12844 enum aarch64_parse_opt_result res =
12845 aarch64_parse_branch_protection (const_str, &str);
12846 if (res == AARCH64_PARSE_INVALID_ARG)
12847 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12848 else if (res == AARCH64_PARSE_MISSING_ARG)
12849 error ("missing argument for %<-mbranch-protection=%>");
12850 free (str);
12851 return res == AARCH64_PARSE_OK;
12852 }
12853
12854 /* Validate a command-line -march option. Parse the arch and extensions
12855 (if any) specified in STR and throw errors if appropriate. Put the
12856 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12857 option is valid. */
12858
12859 static bool
12860 aarch64_validate_march (const char *str, const struct processor **res,
12861 uint64_t *isa_flags)
12862 {
12863 std::string invalid_extension;
12864 enum aarch64_parse_opt_result parse_res
12865 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12866
12867 if (parse_res == AARCH64_PARSE_OK)
12868 return true;
12869
12870 switch (parse_res)
12871 {
12872 case AARCH64_PARSE_MISSING_ARG:
12873 error ("missing arch name in %<-march=%s%>", str);
12874 break;
12875 case AARCH64_PARSE_INVALID_ARG:
12876 error ("unknown value %qs for %<-march%>", str);
12877 aarch64_print_hint_for_arch (str);
12878 break;
12879 case AARCH64_PARSE_INVALID_FEATURE:
12880 error ("invalid feature modifier %qs in %<-march=%s%>",
12881 invalid_extension.c_str (), str);
12882 aarch64_print_hint_for_extensions (invalid_extension);
12883 break;
12884 default:
12885 gcc_unreachable ();
12886 }
12887
12888 return false;
12889 }
12890
12891 /* Validate a command-line -mtune option. Parse the cpu
12892 specified in STR and throw errors if appropriate. Put the
12893 result, if it is valid, in RES. Return whether the option is
12894 valid. */
12895
12896 static bool
12897 aarch64_validate_mtune (const char *str, const struct processor **res)
12898 {
12899 enum aarch64_parse_opt_result parse_res
12900 = aarch64_parse_tune (str, res);
12901
12902 if (parse_res == AARCH64_PARSE_OK)
12903 return true;
12904
12905 switch (parse_res)
12906 {
12907 case AARCH64_PARSE_MISSING_ARG:
12908 error ("missing cpu name in %<-mtune=%s%>", str);
12909 break;
12910 case AARCH64_PARSE_INVALID_ARG:
12911 error ("unknown value %qs for %<-mtune%>", str);
12912 aarch64_print_hint_for_core (str);
12913 break;
12914 default:
12915 gcc_unreachable ();
12916 }
12917 return false;
12918 }
12919
12920 /* Return the CPU corresponding to the enum CPU.
12921 If it doesn't specify a cpu, return the default. */
12922
12923 static const struct processor *
12924 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12925 {
12926 if (cpu != aarch64_none)
12927 return &all_cores[cpu];
12928
12929 /* The & 0x3f is to extract the bottom 6 bits that encode the
12930 default cpu as selected by the --with-cpu GCC configure option
12931 in config.gcc.
12932 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12933 flags mechanism should be reworked to make it more sane. */
12934 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12935 }
12936
12937 /* Return the architecture corresponding to the enum ARCH.
12938 If it doesn't specify a valid architecture, return the default. */
12939
12940 static const struct processor *
12941 aarch64_get_arch (enum aarch64_arch arch)
12942 {
12943 if (arch != aarch64_no_arch)
12944 return &all_architectures[arch];
12945
12946 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12947
12948 return &all_architectures[cpu->arch];
12949 }
12950
12951 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12952
12953 static poly_uint16
12954 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12955 {
12956 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12957 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12958 deciding which .md file patterns to use and when deciding whether
12959 something is a legitimate address or constant. */
12960 if (value == SVE_SCALABLE || value == SVE_128)
12961 return poly_uint16 (2, 2);
12962 else
12963 return (int) value / 64;
12964 }
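/* Worked examples of the mapping above: -msve-vector-bits=256 gives
   256 / 64 = 4, i.e. a fixed VG of four 64-bit granules (32-byte vectors),
   and -msve-vector-bits=512 gives VG 8, while both "scalable" and 128
   give the length-agnostic poly_uint16 (2, 2).  */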
12965
12966 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12967 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
12968 tuning structs. In particular it must set selected_tune and
12969 aarch64_isa_flags that define the available ISA features and tuning
12970 decisions. It must also set selected_arch as this will be used to
12971 output the .arch asm tags for each function. */
12972
12973 static void
12974 aarch64_override_options (void)
12975 {
12976 uint64_t cpu_isa = 0;
12977 uint64_t arch_isa = 0;
12978 aarch64_isa_flags = 0;
12979
12980 bool valid_cpu = true;
12981 bool valid_tune = true;
12982 bool valid_arch = true;
12983
12984 selected_cpu = NULL;
12985 selected_arch = NULL;
12986 selected_tune = NULL;
12987
12988 if (aarch64_branch_protection_string)
12989 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12990
12991 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12992 If either of -march or -mtune is given, they override their
12993 respective component of -mcpu. */
12994 if (aarch64_cpu_string)
12995 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12996 &cpu_isa);
12997
12998 if (aarch64_arch_string)
12999 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13000 &arch_isa);
13001
13002 if (aarch64_tune_string)
13003 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13004
13005 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13006 SUBTARGET_OVERRIDE_OPTIONS;
13007 #endif
13008
13009 /* If the user did not specify a processor, choose the default
13010 one for them. This will be the CPU set during configuration using
13011 --with-cpu, otherwise it is "generic". */
13012 if (!selected_cpu)
13013 {
13014 if (selected_arch)
13015 {
13016 selected_cpu = &all_cores[selected_arch->ident];
13017 aarch64_isa_flags = arch_isa;
13018 explicit_arch = selected_arch->arch;
13019 }
13020 else
13021 {
13022 /* Get default configure-time CPU. */
13023 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
13024 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
13025 }
13026
13027 if (selected_tune)
13028 explicit_tune_core = selected_tune->ident;
13029 }
13030 /* If both -mcpu and -march are specified check that they are architecturally
13031 compatible, warn if they're not and prefer the -march ISA flags. */
13032 else if (selected_arch)
13033 {
13034 if (selected_arch->arch != selected_cpu->arch)
13035 {
13036 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13037 all_architectures[selected_cpu->arch].name,
13038 selected_arch->name);
13039 }
13040 aarch64_isa_flags = arch_isa;
13041 explicit_arch = selected_arch->arch;
13042 explicit_tune_core = selected_tune ? selected_tune->ident
13043 : selected_cpu->ident;
13044 }
13045 else
13046 {
13047 /* -mcpu but no -march. */
13048 aarch64_isa_flags = cpu_isa;
13049 explicit_tune_core = selected_tune ? selected_tune->ident
13050 : selected_cpu->ident;
13051 gcc_assert (selected_cpu);
13052 selected_arch = &all_architectures[selected_cpu->arch];
13053 explicit_arch = selected_arch->arch;
13054 }
13055
13056 /* Set the arch as well, as we will need it when outputting
13057 the .arch directive in assembly. */
13058 if (!selected_arch)
13059 {
13060 gcc_assert (selected_cpu);
13061 selected_arch = &all_architectures[selected_cpu->arch];
13062 }
13063
13064 if (!selected_tune)
13065 selected_tune = selected_cpu;
13066
13067 if (aarch64_enable_bti == 2)
13068 {
13069 #ifdef TARGET_ENABLE_BTI
13070 aarch64_enable_bti = 1;
13071 #else
13072 aarch64_enable_bti = 0;
13073 #endif
13074 }
13075
13076 /* Return address signing is currently not supported for ILP32 targets. For
13077 LP64 targets use the configured option in the absence of a command-line
13078 option for -mbranch-protection. */
13079 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
13080 {
13081 #ifdef TARGET_ENABLE_PAC_RET
13082 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13083 #else
13084 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13085 #endif
13086 }
13087
13088 #ifndef HAVE_AS_MABI_OPTION
13089 /* The compiler may have been configured with 2.23.* binutils, which does
13090 not have support for ILP32. */
13091 if (TARGET_ILP32)
13092 error ("assembler does not support %<-mabi=ilp32%>");
13093 #endif
13094
13095 /* Convert -msve-vector-bits to a VG count. */
13096 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13097
13098 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13099 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13100
13101 /* Make sure we properly set up the explicit options. */
13102 if ((aarch64_cpu_string && valid_cpu)
13103 || (aarch64_tune_string && valid_tune))
13104 gcc_assert (explicit_tune_core != aarch64_none);
13105
13106 if ((aarch64_cpu_string && valid_cpu)
13107 || (aarch64_arch_string && valid_arch))
13108 gcc_assert (explicit_arch != aarch64_no_arch);
13109
13110 /* The pass to insert speculation tracking runs before
13111 shrink-wrapping and the latter does not know how to update the
13112 tracking status. So disable it in this case. */
13113 if (aarch64_track_speculation)
13114 flag_shrink_wrap = 0;
13115
13116 aarch64_override_options_internal (&global_options);
13117
13118 /* Save these options as the default ones in case we push and pop them later
13119 while processing functions with potential target attributes. */
13120 target_option_default_node = target_option_current_node
13121 = build_target_option_node (&global_options);
13122 }
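/* To make the precedence above concrete: "-mcpu=cortex-a57" on its own
   selects both Cortex-A57 tuning and its Armv8-A architecture; adding
   "-mtune=cortex-a72" keeps the -mcpu architecture but overrides the
   tuning; and combining "-mcpu=cortex-a57" with "-march=armv8.2-a" keeps
   the -march ISA flags and emits the architectural-mismatch warning
   above.  */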
13123
13124 /* Implement targetm.override_options_after_change. */
13125
13126 static void
13127 aarch64_override_options_after_change (void)
13128 {
13129 aarch64_override_options_after_change_1 (&global_options);
13130 }
13131
13132 static struct machine_function *
13133 aarch64_init_machine_status (void)
13134 {
13135 struct machine_function *machine;
13136 machine = ggc_cleared_alloc<machine_function> ();
13137 return machine;
13138 }
13139
13140 void
13141 aarch64_init_expanders (void)
13142 {
13143 init_machine_status = aarch64_init_machine_status;
13144 }
13145
13146 /* Initialize aarch64_cmodel from OPTS, checking that the requested code model is compatible with PIC. */
13147 static void
13148 initialize_aarch64_code_model (struct gcc_options *opts)
13149 {
13150 if (opts->x_flag_pic)
13151 {
13152 switch (opts->x_aarch64_cmodel_var)
13153 {
13154 case AARCH64_CMODEL_TINY:
13155 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13156 break;
13157 case AARCH64_CMODEL_SMALL:
13158 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13159 aarch64_cmodel = (flag_pic == 2
13160 ? AARCH64_CMODEL_SMALL_PIC
13161 : AARCH64_CMODEL_SMALL_SPIC);
13162 #else
13163 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13164 #endif
13165 break;
13166 case AARCH64_CMODEL_LARGE:
13167 sorry ("code model %qs with %<-f%s%>", "large",
13168 opts->x_flag_pic > 1 ? "PIC" : "pic");
13169 break;
13170 default:
13171 gcc_unreachable ();
13172 }
13173 }
13174 else
13175 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13176 }
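/* For example, with the default small code model, -fpic (flag_pic == 1)
   selects AARCH64_CMODEL_SMALL_SPIC and -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC, provided the assembler supports the small PIC
   relocations; requesting the large code model together with -fpic or
   -fPIC is rejected with the "sorry" above.  */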
13177
13178 /* Implement TARGET_OPTION_SAVE. */
13179
13180 static void
13181 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13182 {
13183 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13184 ptr->x_aarch64_branch_protection_string
13185 = opts->x_aarch64_branch_protection_string;
13186 }
13187
13188 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13189 using the information saved in PTR. */
13190
13191 static void
13192 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13193 {
13194 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13195 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13196 opts->x_explicit_arch = ptr->x_explicit_arch;
13197 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13198 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13199 opts->x_aarch64_branch_protection_string
13200 = ptr->x_aarch64_branch_protection_string;
13201 if (opts->x_aarch64_branch_protection_string)
13202 {
13203 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13204 NULL);
13205 }
13206
13207 aarch64_override_options_internal (opts);
13208 }
13209
13210 /* Implement TARGET_OPTION_PRINT. */
13211
13212 static void
13213 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13214 {
13215 const struct processor *cpu
13216 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13217 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13218 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13219 std::string extension
13220 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13221
13222 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13223 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13224 arch->name, extension.c_str ());
13225 }
13226
13227 static GTY(()) tree aarch64_previous_fndecl;
13228
13229 void
13230 aarch64_reset_previous_fndecl (void)
13231 {
13232 aarch64_previous_fndecl = NULL;
13233 }
13234
13235 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13236 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13237 make sure optab availability predicates are recomputed when necessary. */
13238
13239 void
13240 aarch64_save_restore_target_globals (tree new_tree)
13241 {
13242 if (TREE_TARGET_GLOBALS (new_tree))
13243 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13244 else if (new_tree == target_option_default_node)
13245 restore_target_globals (&default_target_globals);
13246 else
13247 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13248 }
13249
13250 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13251 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13252 of the function, if such exists. This function may be called multiple
13253 times on a single function so use aarch64_previous_fndecl to avoid
13254 setting up identical state. */
13255
13256 static void
13257 aarch64_set_current_function (tree fndecl)
13258 {
13259 if (!fndecl || fndecl == aarch64_previous_fndecl)
13260 return;
13261
13262 tree old_tree = (aarch64_previous_fndecl
13263 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13264 : NULL_TREE);
13265
13266 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13267
13268 /* If current function has no attributes but the previous one did,
13269 use the default node. */
13270 if (!new_tree && old_tree)
13271 new_tree = target_option_default_node;
13272
13273 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13274 the default have been handled by aarch64_save_restore_target_globals from
13275 aarch64_pragma_target_parse. */
13276 if (old_tree == new_tree)
13277 return;
13278
13279 aarch64_previous_fndecl = fndecl;
13280
13281 /* First set the target options. */
13282 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13283
13284 aarch64_save_restore_target_globals (new_tree);
13285 }
13286
13287 /* Enum describing the various ways we can handle attributes.
13288 In many cases we can reuse the generic option handling machinery. */
13289
13290 enum aarch64_attr_opt_type
13291 {
13292 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13293 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13294 aarch64_attr_enum, /* Attribute sets an enum variable. */
13295 aarch64_attr_custom /* Attribute requires a custom handling function. */
13296 };
13297
13298 /* All the information needed to handle a target attribute.
13299 NAME is the name of the attribute.
13300 ATTR_TYPE specifies the type of behavior of the attribute as described
13301 in the definition of enum aarch64_attr_opt_type.
13302 ALLOW_NEG is true if the attribute supports a "no-" form.
13303 HANDLER is the function that takes the attribute string as an argument.
13304 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13305 OPT_NUM is the enum specifying the option that the attribute modifies.
13306 This is needed for attributes that mirror the behavior of a command-line
13307 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13308 aarch64_attr_enum. */
13309
13310 struct aarch64_attribute_info
13311 {
13312 const char *name;
13313 enum aarch64_attr_opt_type attr_type;
13314 bool allow_neg;
13315 bool (*handler) (const char *);
13316 enum opt_code opt_num;
13317 };
13318
13319 /* Handle the ARCH_STR argument to the arch= target attribute. */
13320
13321 static bool
13322 aarch64_handle_attr_arch (const char *str)
13323 {
13324 const struct processor *tmp_arch = NULL;
13325 std::string invalid_extension;
13326 enum aarch64_parse_opt_result parse_res
13327 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13328
13329 if (parse_res == AARCH64_PARSE_OK)
13330 {
13331 gcc_assert (tmp_arch);
13332 selected_arch = tmp_arch;
13333 explicit_arch = selected_arch->arch;
13334 return true;
13335 }
13336
13337 switch (parse_res)
13338 {
13339 case AARCH64_PARSE_MISSING_ARG:
13340 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13341 break;
13342 case AARCH64_PARSE_INVALID_ARG:
13343 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13344 aarch64_print_hint_for_arch (str);
13345 break;
13346 case AARCH64_PARSE_INVALID_FEATURE:
13347 error ("invalid feature modifier %s of value (\"%s\") in "
13348 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13349 aarch64_print_hint_for_extensions (invalid_extension);
13350 break;
13351 default:
13352 gcc_unreachable ();
13353 }
13354
13355 return false;
13356 }
13357
13358 /* Handle the argument CPU_STR to the cpu= target attribute. */
13359
13360 static bool
13361 aarch64_handle_attr_cpu (const char *str)
13362 {
13363 const struct processor *tmp_cpu = NULL;
13364 std::string invalid_extension;
13365 enum aarch64_parse_opt_result parse_res
13366 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13367
13368 if (parse_res == AARCH64_PARSE_OK)
13369 {
13370 gcc_assert (tmp_cpu);
13371 selected_tune = tmp_cpu;
13372 explicit_tune_core = selected_tune->ident;
13373
13374 selected_arch = &all_architectures[tmp_cpu->arch];
13375 explicit_arch = selected_arch->arch;
13376 return true;
13377 }
13378
13379 switch (parse_res)
13380 {
13381 case AARCH64_PARSE_MISSING_ARG:
13382 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13383 break;
13384 case AARCH64_PARSE_INVALID_ARG:
13385 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13386 aarch64_print_hint_for_core (str);
13387 break;
13388 case AARCH64_PARSE_INVALID_FEATURE:
13389 error ("invalid feature modifier %s of value (\"%s\") in "
13390 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13391 aarch64_print_hint_for_extensions (invalid_extension);
13392 break;
13393 default:
13394 gcc_unreachable ();
13395 }
13396
13397 return false;
13398 }
13399
13400 /* Handle the argument STR to the branch-protection= attribute. */
13401
13402 static bool
13403 aarch64_handle_attr_branch_protection (const char* str)
13404 {
13405 char *err_str = (char *) xmalloc (strlen (str) + 1);
13406 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13407 &err_str);
13408 bool success = false;
13409 switch (res)
13410 {
13411 case AARCH64_PARSE_MISSING_ARG:
13412 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13413 " attribute");
13414 break;
13415 case AARCH64_PARSE_INVALID_ARG:
13416 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13417 "=\")%> pragma or attribute", err_str);
13418 break;
13419 case AARCH64_PARSE_OK:
13420 success = true;
13421 /* Fall through. */
13422 case AARCH64_PARSE_INVALID_FEATURE:
13423 break;
13424 default:
13425 gcc_unreachable ();
13426 }
13427 free (err_str);
13428 return success;
13429 }
13430
13431 /* Handle the argument STR to the tune= target attribute. */
13432
13433 static bool
13434 aarch64_handle_attr_tune (const char *str)
13435 {
13436 const struct processor *tmp_tune = NULL;
13437 enum aarch64_parse_opt_result parse_res
13438 = aarch64_parse_tune (str, &tmp_tune);
13439
13440 if (parse_res == AARCH64_PARSE_OK)
13441 {
13442 gcc_assert (tmp_tune);
13443 selected_tune = tmp_tune;
13444 explicit_tune_core = selected_tune->ident;
13445 return true;
13446 }
13447
13448 switch (parse_res)
13449 {
13450 case AARCH64_PARSE_INVALID_ARG:
13451 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13452 aarch64_print_hint_for_core (str);
13453 break;
13454 default:
13455 gcc_unreachable ();
13456 }
13457
13458 return false;
13459 }
13460
13461 /* Parse an architecture extensions target attribute string specified in STR.
13462 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13463 if successful. Update aarch64_isa_flags to reflect the ISA features
13464 modified. */
13465
13466 static bool
13467 aarch64_handle_attr_isa_flags (char *str)
13468 {
13469 enum aarch64_parse_opt_result parse_res;
13470 uint64_t isa_flags = aarch64_isa_flags;
13471
13472 /* We allow "+nothing" in the beginning to clear out all architectural
13473 features if the user wants to handpick specific features. */
13474 if (strncmp ("+nothing", str, 8) == 0)
13475 {
13476 isa_flags = 0;
13477 str += 8;
13478 }
13479
13480 std::string invalid_extension;
13481 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13482
13483 if (parse_res == AARCH64_PARSE_OK)
13484 {
13485 aarch64_isa_flags = isa_flags;
13486 return true;
13487 }
13488
13489 switch (parse_res)
13490 {
13491 case AARCH64_PARSE_MISSING_ARG:
13492 error ("missing value in %<target()%> pragma or attribute");
13493 break;
13494
13495 case AARCH64_PARSE_INVALID_FEATURE:
13496 error ("invalid feature modifier %s of value (\"%s\") in "
13497 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13498 break;
13499
13500 default:
13501 gcc_unreachable ();
13502 }
13503
13504 return false;
13505 }
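/* For example, the attribute string "+nothing+simd" first clears
   aarch64_isa_flags and then enables Advanced SIMD (together with the
   features it implies), whereas "+nofp" starts from the current flags and
   removes the FP extension along with anything that depends on it.  */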
13506
13507 /* The target attributes that we support. On top of these we also support just
13508 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13509 handled explicitly in aarch64_process_one_target_attr. */
13510
13511 static const struct aarch64_attribute_info aarch64_attributes[] =
13512 {
13513 { "general-regs-only", aarch64_attr_mask, false, NULL,
13514 OPT_mgeneral_regs_only },
13515 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13516 OPT_mfix_cortex_a53_835769 },
13517 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13518 OPT_mfix_cortex_a53_843419 },
13519 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13520 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13521 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13522 OPT_momit_leaf_frame_pointer },
13523 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13524 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13525 OPT_march_ },
13526 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13527 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13528 OPT_mtune_ },
13529 { "branch-protection", aarch64_attr_custom, false,
13530 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13531 { "sign-return-address", aarch64_attr_enum, false, NULL,
13532 OPT_msign_return_address_ },
13533 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13534 };
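/* For instance, the table above accepts uses such as
   __attribute__ ((target ("arch=armv8.2-a+crc"))) through the custom
   "arch" handler, __attribute__ ((target ("no-strict-align"))) through the
   aarch64_attr_mask entry whose allow_neg field is set, and
   __attribute__ ((target ("cmodel=tiny"))) through the aarch64_attr_enum
   machinery.  */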
13535
13536 /* Parse ARG_STR which contains the definition of one target attribute.
13537 Show appropriate errors if any or return true if the attribute is valid. */
13538
13539 static bool
13540 aarch64_process_one_target_attr (char *arg_str)
13541 {
13542 bool invert = false;
13543
13544 size_t len = strlen (arg_str);
13545
13546 if (len == 0)
13547 {
13548 error ("malformed %<target()%> pragma or attribute");
13549 return false;
13550 }
13551
13552 char *str_to_check = (char *) alloca (len + 1);
13553 strcpy (str_to_check, arg_str);
13554
13555 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13556 It is easier to detect and handle it explicitly here rather than going
13557 through the machinery for the rest of the target attributes in this
13558 function. */
13559 if (*str_to_check == '+')
13560 return aarch64_handle_attr_isa_flags (str_to_check);
13561
13562 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13563 {
13564 invert = true;
13565 str_to_check += 3;
13566 }
13567 char *arg = strchr (str_to_check, '=');
13568
13569 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13570 and point ARG to "foo". */
13571 if (arg)
13572 {
13573 *arg = '\0';
13574 arg++;
13575 }
13576 const struct aarch64_attribute_info *p_attr;
13577 bool found = false;
13578 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13579 {
13580 /* If the names don't match up, or the user has given an argument
13581 to an attribute that doesn't accept one, or didn't give an argument
13582 to an attribute that expects one, fail to match. */
13583 if (strcmp (str_to_check, p_attr->name) != 0)
13584 continue;
13585
13586 found = true;
13587 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13588 || p_attr->attr_type == aarch64_attr_enum;
13589
13590 if (attr_need_arg_p ^ (arg != NULL))
13591 {
13592 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13593 return false;
13594 }
13595
13596 /* If the name matches but the attribute does not allow "no-" versions
13597 then we can't match. */
13598 if (invert && !p_attr->allow_neg)
13599 {
13600 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13601 return false;
13602 }
13603
13604 switch (p_attr->attr_type)
13605 {
13606 /* Has a custom handler registered.
13607 For example, cpu=, arch=, tune=. */
13608 case aarch64_attr_custom:
13609 gcc_assert (p_attr->handler);
13610 if (!p_attr->handler (arg))
13611 return false;
13612 break;
13613
13614 /* Either set or unset a boolean option. */
13615 case aarch64_attr_bool:
13616 {
13617 struct cl_decoded_option decoded;
13618
13619 generate_option (p_attr->opt_num, NULL, !invert,
13620 CL_TARGET, &decoded);
13621 aarch64_handle_option (&global_options, &global_options_set,
13622 &decoded, input_location);
13623 break;
13624 }
13625 /* Set or unset a bit in the target_flags. aarch64_handle_option
13626 should know what mask to apply given the option number. */
13627 case aarch64_attr_mask:
13628 {
13629 struct cl_decoded_option decoded;
13630 /* We only need to specify the option number.
13631 aarch64_handle_option will know which mask to apply. */
13632 decoded.opt_index = p_attr->opt_num;
13633 decoded.value = !invert;
13634 aarch64_handle_option (&global_options, &global_options_set,
13635 &decoded, input_location);
13636 break;
13637 }
13638 /* Use the option setting machinery to set an option to an enum. */
13639 case aarch64_attr_enum:
13640 {
13641 gcc_assert (arg);
13642 bool valid;
13643 int value;
13644 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13645 &value, CL_TARGET);
13646 if (valid)
13647 {
13648 set_option (&global_options, NULL, p_attr->opt_num, value,
13649 NULL, DK_UNSPECIFIED, input_location,
13650 global_dc);
13651 }
13652 else
13653 {
13654 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13655 }
13656 break;
13657 }
13658 default:
13659 gcc_unreachable ();
13660 }
13661 }
13662
13663 /* If we reached here we either have found an attribute and validated
13664 it or didn't match any. If we matched an attribute but its arguments
13665 were malformed we will have returned false already. */
13666 return found;
13667 }
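/* Tracing two example inputs through the function above:
   "no-omit-leaf-frame-pointer" sets INVERT, strips the "no-" prefix and
   matches the aarch64_attr_bool entry, so the option is switched off;
   "tune=cortex-a53" is split at '=' into the name "tune" and the argument
   "cortex-a53", which is handed to aarch64_handle_attr_tune.  */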
13668
13669 /* Count how many times the character C appears in
13670 NULL-terminated string STR. */
13671
13672 static unsigned int
13673 num_occurences_in_str (char c, char *str)
13674 {
13675 unsigned int res = 0;
13676 while (*str != '\0')
13677 {
13678 if (*str == c)
13679 res++;
13680
13681 str++;
13682 }
13683
13684 return res;
13685 }
13686
13687 /* Parse the tree in ARGS that contains the target attribute information
13688 and update the global target options space. */
13689
13690 bool
13691 aarch64_process_target_attr (tree args)
13692 {
13693 if (TREE_CODE (args) == TREE_LIST)
13694 {
13695 do
13696 {
13697 tree head = TREE_VALUE (args);
13698 if (head)
13699 {
13700 if (!aarch64_process_target_attr (head))
13701 return false;
13702 }
13703 args = TREE_CHAIN (args);
13704 } while (args);
13705
13706 return true;
13707 }
13708
13709 if (TREE_CODE (args) != STRING_CST)
13710 {
13711 error ("attribute %<target%> argument not a string");
13712 return false;
13713 }
13714
13715 size_t len = strlen (TREE_STRING_POINTER (args));
13716 char *str_to_check = (char *) alloca (len + 1);
13717 strcpy (str_to_check, TREE_STRING_POINTER (args));
13718
13719 if (len == 0)
13720 {
13721 error ("malformed %<target()%> pragma or attribute");
13722 return false;
13723 }
13724
13725 /* Used to catch empty spaces between commas i.e.
13726 attribute ((target ("attr1,,attr2"))). */
13727 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13728
13729 /* Handle multiple target attributes separated by ','. */
13730 char *token = strtok_r (str_to_check, ",", &str_to_check);
13731
13732 unsigned int num_attrs = 0;
13733 while (token)
13734 {
13735 num_attrs++;
13736 if (!aarch64_process_one_target_attr (token))
13737 {
13738 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13739 return false;
13740 }
13741
13742 token = strtok_r (NULL, ",", &str_to_check);
13743 }
13744
13745 if (num_attrs != num_commas + 1)
13746 {
13747 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13748 return false;
13749 }
13750
13751 return true;
13752 }
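/* For example, target ("arch=armv8-a+crc,no-strict-align") is tokenized
   into two comma-separated attributes, each handled by
   aarch64_process_one_target_attr, whereas a stray empty token as in
   "attr1,,attr2" makes the attribute count differ from NUM_COMMAS + 1 and
   is rejected as malformed.  */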
13753
13754 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13755 process attribute ((target ("..."))). */
13756
13757 static bool
13758 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13759 {
13760 struct cl_target_option cur_target;
13761 bool ret;
13762 tree old_optimize;
13763 tree new_target, new_optimize;
13764 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13765
13766 /* If what we're processing is the current pragma string then the
13767 target option node is already stored in target_option_current_node
13768 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13769 having to re-parse the string. This is especially useful to keep
13770 arm_neon.h compile times down since that header contains a lot
13771 of intrinsics enclosed in pragmas. */
13772 if (!existing_target && args == current_target_pragma)
13773 {
13774 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13775 return true;
13776 }
13777 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13778
13779 old_optimize = build_optimization_node (&global_options);
13780 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13781
13782 /* If the function changed the optimization levels as well as setting
13783 target options, start with the optimizations specified. */
13784 if (func_optimize && func_optimize != old_optimize)
13785 cl_optimization_restore (&global_options,
13786 TREE_OPTIMIZATION (func_optimize));
13787
13788 /* Save the current target options to restore at the end. */
13789 cl_target_option_save (&cur_target, &global_options);
13790
13791 /* If fndecl already has some target attributes applied to it, unpack
13792 them so that we add this attribute on top of them, rather than
13793 overwriting them. */
13794 if (existing_target)
13795 {
13796 struct cl_target_option *existing_options
13797 = TREE_TARGET_OPTION (existing_target);
13798
13799 if (existing_options)
13800 cl_target_option_restore (&global_options, existing_options);
13801 }
13802 else
13803 cl_target_option_restore (&global_options,
13804 TREE_TARGET_OPTION (target_option_current_node));
13805
13806 ret = aarch64_process_target_attr (args);
13807
13808 /* Set up any additional state. */
13809 if (ret)
13810 {
13811 aarch64_override_options_internal (&global_options);
13812 /* Initialize SIMD builtins if we haven't already.
13813 Set current_target_pragma to NULL for the duration so that
13814 the builtin initialization code doesn't try to tag the functions
13815 being built with the attributes specified by any current pragma, thus
13816 going into an infinite recursion. */
13817 if (TARGET_SIMD)
13818 {
13819 tree saved_current_target_pragma = current_target_pragma;
13820 current_target_pragma = NULL;
13821 aarch64_init_simd_builtins ();
13822 current_target_pragma = saved_current_target_pragma;
13823 }
13824 new_target = build_target_option_node (&global_options);
13825 }
13826 else
13827 new_target = NULL;
13828
13829 new_optimize = build_optimization_node (&global_options);
13830
13831 if (fndecl && ret)
13832 {
13833 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13834
13835 if (old_optimize != new_optimize)
13836 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13837 }
13838
13839 cl_target_option_restore (&global_options, &cur_target);
13840
13841 if (old_optimize != new_optimize)
13842 cl_optimization_restore (&global_options,
13843 TREE_OPTIMIZATION (old_optimize));
13844 return ret;
13845 }
13846
13847 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13848 tri-bool options (yes, no, don't care) and the default value is
13849 DEF, determine whether to reject inlining. */
13850
13851 static bool
13852 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13853 int dont_care, int def)
13854 {
13855 /* If the callee doesn't care, always allow inlining. */
13856 if (callee == dont_care)
13857 return true;
13858
13859 /* If the caller doesn't care, always allow inlining. */
13860 if (caller == dont_care)
13861 return true;
13862
13863 /* Otherwise, allow inlining if either the callee and caller values
13864 agree, or if the callee is using the default value. */
13865 return (callee == caller || callee == def);
13866 }
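/* Two concrete rows of the resulting truth table, with "don't care"
   encoded as 2: caller == 1, callee == 2 allows inlining because the
   callee doesn't care; caller == 0, callee == 1 with DEF == 0 rejects it,
   since the callee explicitly requests the non-default setting that the
   caller does not use.  */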
13867
13868 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13869 to inline CALLEE into CALLER based on target-specific info.
13870 Make sure that the caller and callee have compatible architectural
13871 features. Then go through the other possible target attributes
13872 and see if they can block inlining. Try not to reject always_inline
13873 callees unless they are incompatible architecturally. */
13874
13875 static bool
13876 aarch64_can_inline_p (tree caller, tree callee)
13877 {
13878 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13879 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13880
13881 struct cl_target_option *caller_opts
13882 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13883 : target_option_default_node);
13884
13885 struct cl_target_option *callee_opts
13886 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13887 : target_option_default_node);
13888
13889 /* Callee's ISA flags should be a subset of the caller's. */
13890 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13891 != callee_opts->x_aarch64_isa_flags)
13892 return false;
13893
13894 /* Allow non-strict aligned functions inlining into strict
13895 aligned ones. */
13896 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13897 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13898 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13899 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13900 return false;
13901
13902 bool always_inline = lookup_attribute ("always_inline",
13903 DECL_ATTRIBUTES (callee));
13904
13905 /* If the architectural features match up and the callee is always_inline
13906 then the other attributes don't matter. */
13907 if (always_inline)
13908 return true;
13909
13910 if (caller_opts->x_aarch64_cmodel_var
13911 != callee_opts->x_aarch64_cmodel_var)
13912 return false;
13913
13914 if (caller_opts->x_aarch64_tls_dialect
13915 != callee_opts->x_aarch64_tls_dialect)
13916 return false;
13917
13918 /* Honour explicit requests to work around errata. */
13919 if (!aarch64_tribools_ok_for_inlining_p (
13920 caller_opts->x_aarch64_fix_a53_err835769,
13921 callee_opts->x_aarch64_fix_a53_err835769,
13922 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13923 return false;
13924
13925 if (!aarch64_tribools_ok_for_inlining_p (
13926 caller_opts->x_aarch64_fix_a53_err843419,
13927 callee_opts->x_aarch64_fix_a53_err843419,
13928 2, TARGET_FIX_ERR_A53_843419))
13929 return false;
13930
13931 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13932 caller and callee and they don't match up, reject inlining. */
13933 if (!aarch64_tribools_ok_for_inlining_p (
13934 caller_opts->x_flag_omit_leaf_frame_pointer,
13935 callee_opts->x_flag_omit_leaf_frame_pointer,
13936 2, 1))
13937 return false;
13938
13939 /* If the callee has specific tuning overrides, respect them. */
13940 if (callee_opts->x_aarch64_override_tune_string != NULL
13941 && caller_opts->x_aarch64_override_tune_string == NULL)
13942 return false;
13943
13944 /* If the user specified tuning override strings for the
13945 caller and callee and they don't match up, reject inlining.
13946 We just do a string compare here, we don't analyze the meaning
13947 of the string, as it would be too costly for little gain. */
13948 if (callee_opts->x_aarch64_override_tune_string
13949 && caller_opts->x_aarch64_override_tune_string
13950 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13951 caller_opts->x_aarch64_override_tune_string) != 0))
13952 return false;
13953
13954 return true;
13955 }
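/* As an illustration of the ISA-subset rule above: a callee carrying
   target ("+sve") cannot be inlined into a caller whose ISA flags lack
   SVE, while a callee with no extra ISA requirements can be inlined into
   an SVE-enabled caller as far as the ISA check is concerned (the later
   checks may still reject it).  */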
13956
13957 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
13958 hasn't been initialized already. */
13959
13960 unsigned int
13961 aarch64_tlsdesc_abi_id ()
13962 {
13963 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
13964 if (!tlsdesc_abi.initialized_p ())
13965 {
13966 HARD_REG_SET full_reg_clobbers;
13967 CLEAR_HARD_REG_SET (full_reg_clobbers);
13968 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
13969 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
13970 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
13971 SET_HARD_REG_BIT (full_reg_clobbers, regno);
13972 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
13973 }
13974 return tlsdesc_abi.id ();
13975 }
13976
13977 /* Return true if SYMBOL_REF X binds locally. */
13978
13979 static bool
13980 aarch64_symbol_binds_local_p (const_rtx x)
13981 {
13982 return (SYMBOL_REF_DECL (x)
13983 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13984 : SYMBOL_REF_LOCAL_P (x));
13985 }
13986
13987 /* Return true if SYMBOL_REF X is thread local. */
13988 static bool
13989 aarch64_tls_symbol_p (rtx x)
13990 {
13991 if (! TARGET_HAVE_TLS)
13992 return false;
13993
13994 if (GET_CODE (x) != SYMBOL_REF)
13995 return false;
13996
13997 return SYMBOL_REF_TLS_MODEL (x) != 0;
13998 }
13999
14000 /* Classify a TLS symbol into one of the TLS kinds. */
14001 enum aarch64_symbol_type
14002 aarch64_classify_tls_symbol (rtx x)
14003 {
14004 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14005
14006 switch (tls_kind)
14007 {
14008 case TLS_MODEL_GLOBAL_DYNAMIC:
14009 case TLS_MODEL_LOCAL_DYNAMIC:
14010 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
14011
14012 case TLS_MODEL_INITIAL_EXEC:
14013 switch (aarch64_cmodel)
14014 {
14015 case AARCH64_CMODEL_TINY:
14016 case AARCH64_CMODEL_TINY_PIC:
14017 return SYMBOL_TINY_TLSIE;
14018 default:
14019 return SYMBOL_SMALL_TLSIE;
14020 }
14021
14022 case TLS_MODEL_LOCAL_EXEC:
14023 if (aarch64_tls_size == 12)
14024 return SYMBOL_TLSLE12;
14025 else if (aarch64_tls_size == 24)
14026 return SYMBOL_TLSLE24;
14027 else if (aarch64_tls_size == 32)
14028 return SYMBOL_TLSLE32;
14029 else if (aarch64_tls_size == 48)
14030 return SYMBOL_TLSLE48;
14031 else
14032 gcc_unreachable ();
14033
14034 case TLS_MODEL_EMULATED:
14035 case TLS_MODEL_NONE:
14036 return SYMBOL_FORCE_TO_MEM;
14037
14038 default:
14039 gcc_unreachable ();
14040 }
14041 }
14042
14043 /* Return the correct method for accessing X + OFFSET, where X is either
14044 a SYMBOL_REF or LABEL_REF. */
14045
14046 enum aarch64_symbol_type
14047 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
14048 {
14049 if (GET_CODE (x) == LABEL_REF)
14050 {
14051 switch (aarch64_cmodel)
14052 {
14053 case AARCH64_CMODEL_LARGE:
14054 return SYMBOL_FORCE_TO_MEM;
14055
14056 case AARCH64_CMODEL_TINY_PIC:
14057 case AARCH64_CMODEL_TINY:
14058 return SYMBOL_TINY_ABSOLUTE;
14059
14060 case AARCH64_CMODEL_SMALL_SPIC:
14061 case AARCH64_CMODEL_SMALL_PIC:
14062 case AARCH64_CMODEL_SMALL:
14063 return SYMBOL_SMALL_ABSOLUTE;
14064
14065 default:
14066 gcc_unreachable ();
14067 }
14068 }
14069
14070 if (GET_CODE (x) == SYMBOL_REF)
14071 {
14072 if (aarch64_tls_symbol_p (x))
14073 return aarch64_classify_tls_symbol (x);
14074
14075 switch (aarch64_cmodel)
14076 {
14077 case AARCH64_CMODEL_TINY:
14078 /* When we retrieve symbol + offset address, we have to make sure
14079 the offset does not cause overflow of the final address. But
14080 we have no way of knowing the address of symbol at compile time
14081 so we can't accurately say if the distance between the PC and
14082 symbol + offset is outside the addressable range of +/-1M in the
14083 TINY code model. So we rely on images not being greater than 1M,
14084 cap the offset at 1M, and require anything beyond that to be
14085 loaded using an alternative mechanism. Furthermore, if the
14086 symbol is a weak reference to something that isn't known to
14087 resolve to a symbol in this module, then force to memory. */
14088 if ((SYMBOL_REF_WEAK (x)
14089 && !aarch64_symbol_binds_local_p (x))
14090 || !IN_RANGE (offset, -1048575, 1048575))
14091 return SYMBOL_FORCE_TO_MEM;
14092 return SYMBOL_TINY_ABSOLUTE;
14093
14094 case AARCH64_CMODEL_SMALL:
14095 /* Same reasoning as the tiny code model, but the offset cap here is
14096 4G. */
14097 if ((SYMBOL_REF_WEAK (x)
14098 && !aarch64_symbol_binds_local_p (x))
14099 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
14100 HOST_WIDE_INT_C (4294967264)))
14101 return SYMBOL_FORCE_TO_MEM;
14102 return SYMBOL_SMALL_ABSOLUTE;
14103
14104 case AARCH64_CMODEL_TINY_PIC:
14105 if (!aarch64_symbol_binds_local_p (x))
14106 return SYMBOL_TINY_GOT;
14107 return SYMBOL_TINY_ABSOLUTE;
14108
14109 case AARCH64_CMODEL_SMALL_SPIC:
14110 case AARCH64_CMODEL_SMALL_PIC:
14111 if (!aarch64_symbol_binds_local_p (x))
14112 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14113 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14114 return SYMBOL_SMALL_ABSOLUTE;
14115
14116 case AARCH64_CMODEL_LARGE:
14117 /* This is alright even in PIC code as the constant
14118 pool reference is always PC relative and within
14119 the same translation unit. */
14120 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14121 return SYMBOL_SMALL_ABSOLUTE;
14122 else
14123 return SYMBOL_FORCE_TO_MEM;
14124
14125 default:
14126 gcc_unreachable ();
14127 }
14128 }
14129
14130 /* By default push everything into the constant pool. */
14131 return SYMBOL_FORCE_TO_MEM;
14132 }
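/* As an illustrative (non-normative) example of the offset capping above,
   consider a hypothetical non-PIC translation unit built with -mcmodel=tiny:

     extern int table[];
     int *p = &table[1000];	/* offset 4000: SYMBOL_TINY_ABSOLUTE.  */
     int *q = &table[400000];	/* offset 1600000 > 1M: SYMBOL_FORCE_TO_MEM.  */

   The symbol name is made up; only the offset ranges come from the
   classification above.  */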
14133
14134 bool
14135 aarch64_constant_address_p (rtx x)
14136 {
14137 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14138 }
14139
14140 bool
14141 aarch64_legitimate_pic_operand_p (rtx x)
14142 {
14143 if (GET_CODE (x) == SYMBOL_REF
14144 || (GET_CODE (x) == CONST
14145 && GET_CODE (XEXP (x, 0)) == PLUS
14146 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14147 return false;
14148
14149 return true;
14150 }
14151
14152 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14153 that should be rematerialized rather than spilled. */
14154
14155 static bool
14156 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14157 {
14158 /* Support CSE and rematerialization of common constants. */
14159 if (CONST_INT_P (x)
14160 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14161 || GET_CODE (x) == CONST_VECTOR)
14162 return true;
14163
14164 /* Do not allow vector struct mode constants for Advanced SIMD.
14165 We could support 0 and -1 easily, but they need support in
14166 aarch64-simd.md. */
14167 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14168 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14169 return false;
14170
14171 /* Only accept variable-length vector constants if they can be
14172 handled directly.
14173
14174 ??? It would be possible to handle rematerialization of other
14175 constants via secondary reloads. */
14176 if (vec_flags & VEC_ANY_SVE)
14177 return aarch64_simd_valid_immediate (x, NULL);
14178
14179 if (GET_CODE (x) == HIGH)
14180 x = XEXP (x, 0);
14181
14182 /* Accept polynomial constants that can be calculated by using the
14183 destination of a move as the sole temporary. Constants that
14184 require a second temporary cannot be rematerialized (they can't be
14185 forced to memory and also aren't legitimate constants). */
14186 poly_int64 offset;
14187 if (poly_int_rtx_p (x, &offset))
14188 return aarch64_offset_temporaries (false, offset) <= 1;
14189
14190 /* If an offset is being added to something else, we need to allow the
14191 base to be moved into the destination register, meaning that there
14192 are no free temporaries for the offset. */
14193 x = strip_offset (x, &offset);
14194 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14195 return false;
14196
14197 /* Do not allow const (plus (anchor_symbol, const_int)). */
14198 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14199 return false;
14200
14201 /* Treat symbols as constants. Avoid TLS symbols, which are complex;
14202 spilling them is better than rematerializing them. */
14203 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14204 return true;
14205
14206 /* Label references are always constant. */
14207 if (GET_CODE (x) == LABEL_REF)
14208 return true;
14209
14210 return false;
14211 }
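/* For example (an illustration, not part of the compiler): given

     extern int counter;
     __thread int tls_counter;

   the address &counter is treated as a legitimate constant and can be
   rematerialized, whereas &tls_counter has a TLS model and is spilled
   instead, per the SYMBOL_REF_TLS_MODEL check above.  */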
14212
14213 rtx
14214 aarch64_load_tp (rtx target)
14215 {
14216 if (!target
14217 || GET_MODE (target) != Pmode
14218 || !register_operand (target, Pmode))
14219 target = gen_reg_rtx (Pmode);
14220
14221 /* Can return in any reg. */
14222 emit_insn (gen_aarch64_load_tp_hard (target));
14223 return target;
14224 }
14225
14226 /* On AAPCS systems, this is the "struct __va_list". */
14227 static GTY(()) tree va_list_type;
14228
14229 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14230 Return the type to use as __builtin_va_list.
14231
14232 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14233
14234 struct __va_list
14235 {
14236 void *__stack;
14237 void *__gr_top;
14238 void *__vr_top;
14239 int __gr_offs;
14240 int __vr_offs;
14241 }; */
14242
14243 static tree
14244 aarch64_build_builtin_va_list (void)
14245 {
14246 tree va_list_name;
14247 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14248
14249 /* Create the type. */
14250 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14251 /* Give it the required name. */
14252 va_list_name = build_decl (BUILTINS_LOCATION,
14253 TYPE_DECL,
14254 get_identifier ("__va_list"),
14255 va_list_type);
14256 DECL_ARTIFICIAL (va_list_name) = 1;
14257 TYPE_NAME (va_list_type) = va_list_name;
14258 TYPE_STUB_DECL (va_list_type) = va_list_name;
14259
14260 /* Create the fields. */
14261 f_stack = build_decl (BUILTINS_LOCATION,
14262 FIELD_DECL, get_identifier ("__stack"),
14263 ptr_type_node);
14264 f_grtop = build_decl (BUILTINS_LOCATION,
14265 FIELD_DECL, get_identifier ("__gr_top"),
14266 ptr_type_node);
14267 f_vrtop = build_decl (BUILTINS_LOCATION,
14268 FIELD_DECL, get_identifier ("__vr_top"),
14269 ptr_type_node);
14270 f_groff = build_decl (BUILTINS_LOCATION,
14271 FIELD_DECL, get_identifier ("__gr_offs"),
14272 integer_type_node);
14273 f_vroff = build_decl (BUILTINS_LOCATION,
14274 FIELD_DECL, get_identifier ("__vr_offs"),
14275 integer_type_node);
14276
14277 /* Tell tree-stdarg pass about our internal offset fields.
14278 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14279 purposes, to identify whether the code updates the va_list internal
14280 offset fields in an irregular way. */
14281 va_list_gpr_counter_field = f_groff;
14282 va_list_fpr_counter_field = f_vroff;
14283
14284 DECL_ARTIFICIAL (f_stack) = 1;
14285 DECL_ARTIFICIAL (f_grtop) = 1;
14286 DECL_ARTIFICIAL (f_vrtop) = 1;
14287 DECL_ARTIFICIAL (f_groff) = 1;
14288 DECL_ARTIFICIAL (f_vroff) = 1;
14289
14290 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14291 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14292 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14293 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14294 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14295
14296 TYPE_FIELDS (va_list_type) = f_stack;
14297 DECL_CHAIN (f_stack) = f_grtop;
14298 DECL_CHAIN (f_grtop) = f_vrtop;
14299 DECL_CHAIN (f_vrtop) = f_groff;
14300 DECL_CHAIN (f_groff) = f_vroff;
14301
14302 /* Compute its layout. */
14303 layout_type (va_list_type);
14304
14305 return va_list_type;
14306 }
14307
14308 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14309 static void
14310 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14311 {
14312 const CUMULATIVE_ARGS *cum;
14313 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14314 tree stack, grtop, vrtop, groff, vroff;
14315 tree t;
14316 int gr_save_area_size = cfun->va_list_gpr_size;
14317 int vr_save_area_size = cfun->va_list_fpr_size;
14318 int vr_offset;
14319
14320 cum = &crtl->args.info;
14321 if (cfun->va_list_gpr_size)
14322 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14323 cfun->va_list_gpr_size);
14324 if (cfun->va_list_fpr_size)
14325 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14326 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14327
14328 if (!TARGET_FLOAT)
14329 {
14330 gcc_assert (cum->aapcs_nvrn == 0);
14331 vr_save_area_size = 0;
14332 }
14333
14334 f_stack = TYPE_FIELDS (va_list_type_node);
14335 f_grtop = DECL_CHAIN (f_stack);
14336 f_vrtop = DECL_CHAIN (f_grtop);
14337 f_groff = DECL_CHAIN (f_vrtop);
14338 f_vroff = DECL_CHAIN (f_groff);
14339
14340 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14341 NULL_TREE);
14342 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14343 NULL_TREE);
14344 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14345 NULL_TREE);
14346 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14347 NULL_TREE);
14348 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14349 NULL_TREE);
14350
14351 /* Emit code to initialize STACK, which points to the next varargs stack
14352 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14353 by named arguments. STACK is 8-byte aligned. */
14354 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14355 if (cum->aapcs_stack_size > 0)
14356 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14357 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14358 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14359
14360 /* Emit code to initialize GRTOP, the top of the GR save area.
14361 virtual_incoming_args_rtx should have been 16 byte aligned. */
14362 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14363 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14364 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14365
14366 /* Emit code to initialize VRTOP, the top of the VR save area.
14367 This address is gr_save_area_bytes below GRTOP, rounded
14368 down to the next 16-byte boundary. */
14369 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14370 vr_offset = ROUND_UP (gr_save_area_size,
14371 STACK_BOUNDARY / BITS_PER_UNIT);
14372
14373 if (vr_offset)
14374 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14375 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14376 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14377
14378 /* Emit code to initialize GROFF, the offset from GRTOP of the
14379 next GPR argument. */
14380 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14381 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14382 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14383
14384 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14385 of the next VR argument. */
14386 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14387 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14388 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14389 }
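/* A worked example (hypothetical, assuming TARGET_FLOAT and the default
   va_list sizes): for

     void f (int a, ...);

   one GP register (w0) is used by the named argument, so at va_start:

     __stack   = incoming argument pointer (no named stack arguments)
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - 64		(56-byte GR area rounded up to 16)
     __gr_offs = -(8 - 1) * 8  = -56
     __vr_offs = -(8 - 0) * 16 = -128  */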
14390
14391 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14392
14393 static tree
14394 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14395 gimple_seq *post_p ATTRIBUTE_UNUSED)
14396 {
14397 tree addr;
14398 bool indirect_p;
14399 bool is_ha; /* is HFA or HVA. */
14400 bool dw_align; /* double-word align. */
14401 machine_mode ag_mode = VOIDmode;
14402 int nregs;
14403 machine_mode mode;
14404
14405 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14406 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14407 HOST_WIDE_INT size, rsize, adjust, align;
14408 tree t, u, cond1, cond2;
14409
14410 indirect_p = pass_va_arg_by_reference (type);
14411 if (indirect_p)
14412 type = build_pointer_type (type);
14413
14414 mode = TYPE_MODE (type);
14415
14416 f_stack = TYPE_FIELDS (va_list_type_node);
14417 f_grtop = DECL_CHAIN (f_stack);
14418 f_vrtop = DECL_CHAIN (f_grtop);
14419 f_groff = DECL_CHAIN (f_vrtop);
14420 f_vroff = DECL_CHAIN (f_groff);
14421
14422 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14423 f_stack, NULL_TREE);
14424 size = int_size_in_bytes (type);
14425
14426 bool abi_break;
14427 align
14428 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14429
14430 dw_align = false;
14431 adjust = 0;
14432 if (aarch64_vfp_is_call_or_return_candidate (mode,
14433 type,
14434 &ag_mode,
14435 &nregs,
14436 &is_ha))
14437 {
14438 /* No frontends can create types with variable-sized modes, so we
14439 shouldn't be asked to pass or return them. */
14440 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14441
14442 /* TYPE passed in fp/simd registers. */
14443 if (!TARGET_FLOAT)
14444 aarch64_err_no_fpadvsimd (mode);
14445
14446 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14447 unshare_expr (valist), f_vrtop, NULL_TREE);
14448 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14449 unshare_expr (valist), f_vroff, NULL_TREE);
14450
14451 rsize = nregs * UNITS_PER_VREG;
14452
14453 if (is_ha)
14454 {
14455 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14456 adjust = UNITS_PER_VREG - ag_size;
14457 }
14458 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14459 && size < UNITS_PER_VREG)
14460 {
14461 adjust = UNITS_PER_VREG - size;
14462 }
14463 }
14464 else
14465 {
14466 /* TYPE passed in general registers. */
14467 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14468 unshare_expr (valist), f_grtop, NULL_TREE);
14469 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14470 unshare_expr (valist), f_groff, NULL_TREE);
14471 rsize = ROUND_UP (size, UNITS_PER_WORD);
14472 nregs = rsize / UNITS_PER_WORD;
14473
14474 if (align > 8)
14475 {
14476 if (abi_break && warn_psabi)
14477 inform (input_location, "parameter passing for argument of type "
14478 "%qT changed in GCC 9.1", type);
14479 dw_align = true;
14480 }
14481
14482 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14483 && size < UNITS_PER_WORD)
14484 {
14485 adjust = UNITS_PER_WORD - size;
14486 }
14487 }
14488
14489 /* Get a local temporary for the field value. */
14490 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14491
14492 /* Emit code to branch if off >= 0. */
14493 t = build2 (GE_EXPR, boolean_type_node, off,
14494 build_int_cst (TREE_TYPE (off), 0));
14495 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14496
14497 if (dw_align)
14498 {
14499 /* Emit: offs = (offs + 15) & -16. */
14500 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14501 build_int_cst (TREE_TYPE (off), 15));
14502 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14503 build_int_cst (TREE_TYPE (off), -16));
14504 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14505 }
14506 else
14507 roundup = NULL;
14508
14509 /* Update ap.__[g|v]r_offs */
14510 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14511 build_int_cst (TREE_TYPE (off), rsize));
14512 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14513
14514 /* String up: chain ROUNDUP (if any) before the offset update. */
14515 if (roundup)
14516 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14517
14518 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14519 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14520 build_int_cst (TREE_TYPE (f_off), 0));
14521 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14522
14523 /* String up: make sure the assignment happens before the use. */
14524 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14525 COND_EXPR_ELSE (cond1) = t;
14526
14527 /* Prepare the trees handling the argument that is passed on the stack;
14528 the top level node will be stored in ON_STACK. */
14529 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14530 if (align > 8)
14531 {
14532 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14533 t = fold_build_pointer_plus_hwi (arg, 15);
14534 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14535 build_int_cst (TREE_TYPE (t), -16));
14536 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14537 }
14538 else
14539 roundup = NULL;
14540 /* Advance ap.__stack */
14541 t = fold_build_pointer_plus_hwi (arg, size + 7);
14542 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14543 build_int_cst (TREE_TYPE (t), -8));
14544 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14545 /* String up roundup and advance. */
14546 if (roundup)
14547 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14548 /* String up with arg */
14549 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14550 /* Big-endianness related address adjustment. */
14551 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14552 && size < UNITS_PER_WORD)
14553 {
14554 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14555 size_int (UNITS_PER_WORD - size));
14556 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14557 }
14558
14559 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14560 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14561
14562 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14563 t = off;
14564 if (adjust)
14565 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14566 build_int_cst (TREE_TYPE (off), adjust));
14567
14568 t = fold_convert (sizetype, t);
14569 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14570
14571 if (is_ha)
14572 {
14573 /* type ha; // treat as "struct {ftype field[n];}"
14574 ... [computing offs]
14575 for (i = 0; i <nregs; ++i, offs += 16)
14576 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14577 return ha; */
14578 int i;
14579 tree tmp_ha, field_t, field_ptr_t;
14580
14581 /* Declare a local variable. */
14582 tmp_ha = create_tmp_var_raw (type, "ha");
14583 gimple_add_tmp_var (tmp_ha);
14584
14585 /* Establish the base type. */
14586 switch (ag_mode)
14587 {
14588 case E_SFmode:
14589 field_t = float_type_node;
14590 field_ptr_t = float_ptr_type_node;
14591 break;
14592 case E_DFmode:
14593 field_t = double_type_node;
14594 field_ptr_t = double_ptr_type_node;
14595 break;
14596 case E_TFmode:
14597 field_t = long_double_type_node;
14598 field_ptr_t = long_double_ptr_type_node;
14599 break;
14600 case E_HFmode:
14601 field_t = aarch64_fp16_type_node;
14602 field_ptr_t = aarch64_fp16_ptr_type_node;
14603 break;
14604 case E_V2SImode:
14605 case E_V4SImode:
14606 {
14607 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14608 field_t = build_vector_type_for_mode (innertype, ag_mode);
14609 field_ptr_t = build_pointer_type (field_t);
14610 }
14611 break;
14612 default:
14613 gcc_assert (0);
14614 }
14615
14616 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14617 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14618 addr = t;
14619 t = fold_convert (field_ptr_t, addr);
14620 t = build2 (MODIFY_EXPR, field_t,
14621 build1 (INDIRECT_REF, field_t, tmp_ha),
14622 build1 (INDIRECT_REF, field_t, t));
14623
14624 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14625 for (i = 1; i < nregs; ++i)
14626 {
14627 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14628 u = fold_convert (field_ptr_t, addr);
14629 u = build2 (MODIFY_EXPR, field_t,
14630 build2 (MEM_REF, field_t, tmp_ha,
14631 build_int_cst (field_ptr_t,
14632 (i *
14633 int_size_in_bytes (field_t)))),
14634 build1 (INDIRECT_REF, field_t, u));
14635 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14636 }
14637
14638 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14639 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14640 }
14641
14642 COND_EXPR_ELSE (cond2) = t;
14643 addr = fold_convert (build_pointer_type (type), cond1);
14644 addr = build_va_arg_indirect_ref (addr);
14645
14646 if (indirect_p)
14647 addr = build_va_arg_indirect_ref (addr);
14648
14649 return addr;
14650 }
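/* Roughly, the trees built above expand to the following pseudocode for a
   general-register candidate (an illustrative sketch, not the exact gimple):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;			/* register area already exhausted  */
     ap.__gr_offs = off + rsize;	/* after any 16-byte rounding of OFF  */
     if (ap.__gr_offs > 0)
       goto on_stack;			/* this argument did not fit  */
     addr = ap.__gr_top + off;		/* plus any big-endian adjustment  */
     goto done;
   on_stack:
     addr = ap.__stack;			/* plus any 16-byte rounding  */
     ap.__stack = (addr + size + 7) & -8;
   done:
     result = *(type *) addr;

   FP/SIMD candidates use __vr_top/__vr_offs instead, with the extra
   homogeneous-aggregate copy shown in the is_ha block above.  */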
14651
14652 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14653
14654 static void
14655 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14656 const function_arg_info &arg,
14657 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14658 {
14659 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14660 CUMULATIVE_ARGS local_cum;
14661 int gr_saved = cfun->va_list_gpr_size;
14662 int vr_saved = cfun->va_list_fpr_size;
14663
14664 /* The caller has advanced CUM up to, but not beyond, the last named
14665 argument. Advance a local copy of CUM past the last "real" named
14666 argument, to find out how many registers are left over. */
14667 local_cum = *cum;
14668 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14669
14670 /* Find out how many registers we need to save.
14671 Honor tree-stdarg analysis results. */
14672 if (cfun->va_list_gpr_size)
14673 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14674 cfun->va_list_gpr_size / UNITS_PER_WORD);
14675 if (cfun->va_list_fpr_size)
14676 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14677 cfun->va_list_fpr_size / UNITS_PER_VREG);
14678
14679 if (!TARGET_FLOAT)
14680 {
14681 gcc_assert (local_cum.aapcs_nvrn == 0);
14682 vr_saved = 0;
14683 }
14684
14685 if (!no_rtl)
14686 {
14687 if (gr_saved > 0)
14688 {
14689 rtx ptr, mem;
14690
14691 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14692 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14693 - gr_saved * UNITS_PER_WORD);
14694 mem = gen_frame_mem (BLKmode, ptr);
14695 set_mem_alias_set (mem, get_varargs_alias_set ());
14696
14697 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14698 mem, gr_saved);
14699 }
14700 if (vr_saved > 0)
14701 {
14702 /* We can't use move_block_from_reg, because it will use
14703 the wrong mode, storing D regs only. */
14704 machine_mode mode = TImode;
14705 int off, i, vr_start;
14706
14707 /* Set OFF to the offset from virtual_incoming_args_rtx of
14708 the first vector register. The VR save area lies below
14709 the GR one, and is aligned to 16 bytes. */
14710 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14711 STACK_BOUNDARY / BITS_PER_UNIT);
14712 off -= vr_saved * UNITS_PER_VREG;
14713
14714 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14715 for (i = 0; i < vr_saved; ++i)
14716 {
14717 rtx ptr, mem;
14718
14719 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14720 mem = gen_frame_mem (mode, ptr);
14721 set_mem_alias_set (mem, get_varargs_alias_set ());
14722 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14723 off += UNITS_PER_VREG;
14724 }
14725 }
14726 }
14727
14728 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14729 any complication of having crtl->args.pretend_args_size changed. */
14730 cfun->machine->frame.saved_varargs_size
14731 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14732 STACK_BOUNDARY / BITS_PER_UNIT)
14733 + vr_saved * UNITS_PER_VREG);
14734 }
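/* For the one-named-GP-argument example above (vector registers available),
   the register save area laid out here looks like this, relative to
   virtual_incoming_args_rtx (an illustration only):

     [ -56,   0)   x1-x7 (GR save area)
     [-192, -64)   q0-q7 (VR save area, each saved as TImode)

   with 8 bytes of alignment padding between the two areas, and
   frame.saved_varargs_size set to 64 + 128 = 192 bytes.  */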
14735
14736 static void
14737 aarch64_conditional_register_usage (void)
14738 {
14739 int i;
14740 if (!TARGET_FLOAT)
14741 {
14742 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14743 {
14744 fixed_regs[i] = 1;
14745 call_used_regs[i] = 1;
14746 }
14747 }
14748 if (!TARGET_SVE)
14749 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14750 {
14751 fixed_regs[i] = 1;
14752 call_used_regs[i] = 1;
14753 }
14754
14755 /* When tracking speculation, we need a couple of call-clobbered registers
14756 to track the speculation state. It would be nice to just use
14757 IP0 and IP1, but currently there are numerous places that just
14758 assume these registers are free for other uses (eg pointer
14759 authentication). */
14760 if (aarch64_track_speculation)
14761 {
14762 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14763 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14764 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14765 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14766 }
14767 }
14768
14769 /* Walk down the type tree of TYPE counting consecutive base elements.
14770 If *MODEP is VOIDmode, then set it to the first valid floating point
14771 type. If a non-floating point type is found, or if a floating point
14772 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14773 otherwise return the count in the sub-tree. */
14774 static int
14775 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14776 {
14777 machine_mode mode;
14778 HOST_WIDE_INT size;
14779
14780 switch (TREE_CODE (type))
14781 {
14782 case REAL_TYPE:
14783 mode = TYPE_MODE (type);
14784 if (mode != DFmode && mode != SFmode
14785 && mode != TFmode && mode != HFmode)
14786 return -1;
14787
14788 if (*modep == VOIDmode)
14789 *modep = mode;
14790
14791 if (*modep == mode)
14792 return 1;
14793
14794 break;
14795
14796 case COMPLEX_TYPE:
14797 mode = TYPE_MODE (TREE_TYPE (type));
14798 if (mode != DFmode && mode != SFmode
14799 && mode != TFmode && mode != HFmode)
14800 return -1;
14801
14802 if (*modep == VOIDmode)
14803 *modep = mode;
14804
14805 if (*modep == mode)
14806 return 2;
14807
14808 break;
14809
14810 case VECTOR_TYPE:
14811 /* Use V2SImode and V4SImode as representatives of all 64-bit
14812 and 128-bit vector types. */
14813 size = int_size_in_bytes (type);
14814 switch (size)
14815 {
14816 case 8:
14817 mode = V2SImode;
14818 break;
14819 case 16:
14820 mode = V4SImode;
14821 break;
14822 default:
14823 return -1;
14824 }
14825
14826 if (*modep == VOIDmode)
14827 *modep = mode;
14828
14829 /* Vector modes are considered to be opaque: two vectors are
14830 equivalent for the purposes of being homogeneous aggregates
14831 if they are the same size. */
14832 if (*modep == mode)
14833 return 1;
14834
14835 break;
14836
14837 case ARRAY_TYPE:
14838 {
14839 int count;
14840 tree index = TYPE_DOMAIN (type);
14841
14842 /* Can't handle incomplete types nor sizes that are not
14843 fixed. */
14844 if (!COMPLETE_TYPE_P (type)
14845 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14846 return -1;
14847
14848 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14849 if (count == -1
14850 || !index
14851 || !TYPE_MAX_VALUE (index)
14852 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14853 || !TYPE_MIN_VALUE (index)
14854 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14855 || count < 0)
14856 return -1;
14857
14858 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14859 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14860
14861 /* There must be no padding. */
14862 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14863 count * GET_MODE_BITSIZE (*modep)))
14864 return -1;
14865
14866 return count;
14867 }
14868
14869 case RECORD_TYPE:
14870 {
14871 int count = 0;
14872 int sub_count;
14873 tree field;
14874
14875 /* Can't handle incomplete types nor sizes that are not
14876 fixed. */
14877 if (!COMPLETE_TYPE_P (type)
14878 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14879 return -1;
14880
14881 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14882 {
14883 if (TREE_CODE (field) != FIELD_DECL)
14884 continue;
14885
14886 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14887 if (sub_count < 0)
14888 return -1;
14889 count += sub_count;
14890 }
14891
14892 /* There must be no padding. */
14893 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14894 count * GET_MODE_BITSIZE (*modep)))
14895 return -1;
14896
14897 return count;
14898 }
14899
14900 case UNION_TYPE:
14901 case QUAL_UNION_TYPE:
14902 {
14903 /* These aren't very interesting except in a degenerate case. */
14904 int count = 0;
14905 int sub_count;
14906 tree field;
14907
14908 /* Can't handle incomplete types nor sizes that are not
14909 fixed. */
14910 if (!COMPLETE_TYPE_P (type)
14911 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14912 return -1;
14913
14914 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14915 {
14916 if (TREE_CODE (field) != FIELD_DECL)
14917 continue;
14918
14919 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14920 if (sub_count < 0)
14921 return -1;
14922 count = count > sub_count ? count : sub_count;
14923 }
14924
14925 /* There must be no padding. */
14926 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14927 count * GET_MODE_BITSIZE (*modep)))
14928 return -1;
14929
14930 return count;
14931 }
14932
14933 default:
14934 break;
14935 }
14936
14937 return -1;
14938 }
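/* Some illustrative inputs (hypothetical types, not from this file):

     struct hfa { float a, b, c; };		returns 3, *modep == SFmode
     struct hva { int32x4_t a, b; };		returns 2, *modep == V4SImode
     struct mix { float a; double b; };		returns -1: element modes differ

   int32x4_t (from arm_neon.h) stands for any 16-byte vector type here;
   64-bit and 128-bit vectors are represented by V2SImode and V4SImode
   respectively, as noted above.  */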
14939
14940 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14941 type as described in AAPCS64 \S 4.1.2.
14942
14943 See the comment above aarch64_composite_type_p for the notes on MODE. */
14944
14945 static bool
14946 aarch64_short_vector_p (const_tree type,
14947 machine_mode mode)
14948 {
14949 poly_int64 size = -1;
14950
14951 if (type && TREE_CODE (type) == VECTOR_TYPE)
14952 size = int_size_in_bytes (type);
14953 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14954 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14955 size = GET_MODE_SIZE (mode);
14956
14957 return known_eq (size, 8) || known_eq (size, 16);
14958 }
14959
14960 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14961 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14962 array types. The C99 floating-point complex types are also considered
14963 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14964 types, which are GCC extensions and out of the scope of AAPCS64, are
14965 treated as composite types here as well.
14966
14967 Note that MODE itself is not sufficient in determining whether a type
14968 is such a composite type or not. This is because
14969 stor-layout.c:compute_record_mode may have already changed the MODE
14970 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14971 structure with only one field may have its MODE set to the mode of the
14972 field. Also an integer mode whose size matches the size of the
14973 RECORD_TYPE type may be used to substitute the original mode
14974 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14975 solely relied on. */
14976
14977 static bool
14978 aarch64_composite_type_p (const_tree type,
14979 machine_mode mode)
14980 {
14981 if (aarch64_short_vector_p (type, mode))
14982 return false;
14983
14984 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14985 return true;
14986
14987 if (mode == BLKmode
14988 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14989 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14990 return true;
14991
14992 return false;
14993 }
14994
14995 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14996 shall be passed or returned in simd/fp register(s) (providing these
14997 parameter passing registers are available).
14998
14999 Upon successful return, *COUNT returns the number of needed registers,
15000 *BASE_MODE returns the mode of the individual register and, when IS_HA
15001 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
15002 floating-point aggregate or a homogeneous short-vector aggregate. */
15003
15004 static bool
15005 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
15006 const_tree type,
15007 machine_mode *base_mode,
15008 int *count,
15009 bool *is_ha)
15010 {
15011 machine_mode new_mode = VOIDmode;
15012 bool composite_p = aarch64_composite_type_p (type, mode);
15013
15014 if (is_ha != NULL) *is_ha = false;
15015
15016 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
15017 || aarch64_short_vector_p (type, mode))
15018 {
15019 *count = 1;
15020 new_mode = mode;
15021 }
15022 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
15023 {
15024 if (is_ha != NULL) *is_ha = true;
15025 *count = 2;
15026 new_mode = GET_MODE_INNER (mode);
15027 }
15028 else if (type && composite_p)
15029 {
15030 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
15031
15032 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
15033 {
15034 if (is_ha != NULL) *is_ha = true;
15035 *count = ag_count;
15036 }
15037 else
15038 return false;
15039 }
15040 else
15041 return false;
15042
15043 *base_mode = new_mode;
15044 return true;
15045 }
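/* For instance (illustrative only): a _Complex double argument takes the
   MODE_COMPLEX_FLOAT path above and yields *count = 2, *base_mode = DFmode
   and *is_ha = true, i.e. it is passed in two consecutive D registers when
   they are available.  */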
15046
15047 /* Implement TARGET_STRUCT_VALUE_RTX. */
15048
15049 static rtx
15050 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
15051 int incoming ATTRIBUTE_UNUSED)
15052 {
15053 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
15054 }
15055
15056 /* Implements target hook vector_mode_supported_p. */
15057 static bool
15058 aarch64_vector_mode_supported_p (machine_mode mode)
15059 {
15060 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15061 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
15062 }
15063
15064 /* Return the full-width SVE vector mode for element mode MODE, if one
15065 exists. */
15066 opt_machine_mode
15067 aarch64_full_sve_mode (scalar_mode mode)
15068 {
15069 switch (mode)
15070 {
15071 case E_DFmode:
15072 return VNx2DFmode;
15073 case E_SFmode:
15074 return VNx4SFmode;
15075 case E_HFmode:
15076 return VNx8HFmode;
15077 case E_DImode:
15078 return VNx2DImode;
15079 case E_SImode:
15080 return VNx4SImode;
15081 case E_HImode:
15082 return VNx8HImode;
15083 case E_QImode:
15084 return VNx16QImode;
15085 default:
15086 return opt_machine_mode ();
15087 }
15088 }
15089
15090 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15091 if it exists. */
15092 opt_machine_mode
15093 aarch64_vq_mode (scalar_mode mode)
15094 {
15095 switch (mode)
15096 {
15097 case E_DFmode:
15098 return V2DFmode;
15099 case E_SFmode:
15100 return V4SFmode;
15101 case E_HFmode:
15102 return V8HFmode;
15103 case E_SImode:
15104 return V4SImode;
15105 case E_HImode:
15106 return V8HImode;
15107 case E_QImode:
15108 return V16QImode;
15109 case E_DImode:
15110 return V2DImode;
15111 default:
15112 return opt_machine_mode ();
15113 }
15114 }
15115
15116 /* Return appropriate SIMD container
15117 for MODE within a vector of WIDTH bits. */
15118 static machine_mode
15119 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15120 {
15121 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15122 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15123
15124 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15125 if (TARGET_SIMD)
15126 {
15127 if (known_eq (width, 128))
15128 return aarch64_vq_mode (mode).else_mode (word_mode);
15129 else
15130 switch (mode)
15131 {
15132 case E_SFmode:
15133 return V2SFmode;
15134 case E_HFmode:
15135 return V4HFmode;
15136 case E_SImode:
15137 return V2SImode;
15138 case E_HImode:
15139 return V4HImode;
15140 case E_QImode:
15141 return V8QImode;
15142 default:
15143 break;
15144 }
15145 }
15146 return word_mode;
15147 }
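/* A few example mappings (illustrative): SFmode with WIDTH == 128 gives
   V4SFmode and with WIDTH == 64 gives V2SFmode; with TARGET_SVE and
   WIDTH == BITS_PER_SVE_VECTOR, SFmode gives VNx4SFmode. DImode with
   WIDTH == 64 has no 64-bit container and falls back to word_mode.  */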
15148
15149 /* Return the preferred SIMD mode for MODE: a full SVE vector mode if SVE is available, otherwise the 128-bit Advanced SIMD container. */
15150 static machine_mode
15151 aarch64_preferred_simd_mode (scalar_mode mode)
15152 {
15153 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15154 return aarch64_simd_container_mode (mode, bits);
15155 }
15156
15157 /* Return a list of possible vector sizes for the vectorizer
15158 to iterate over. */
15159 static void
15160 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15161 {
15162 if (TARGET_SVE)
15163 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15164 sizes->safe_push (16);
15165 sizes->safe_push (8);
15166 }
15167
15168 /* Implement TARGET_MANGLE_TYPE. */
15169
15170 static const char *
15171 aarch64_mangle_type (const_tree type)
15172 {
15173 /* The AArch64 ABI documents say that "__va_list" has to be
15174 mangled as if it is in the "std" namespace. */
15175 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15176 return "St9__va_list";
15177
15178 /* Half-precision float. */
15179 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15180 return "Dh";
15181
15182 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15183 builtin types. */
15184 if (TYPE_NAME (type) != NULL)
15185 return aarch64_general_mangle_builtin_type (type);
15186
15187 /* Use the default mangling. */
15188 return NULL;
15189 }
15190
15191 /* Find the first rtx_insn before insn that will generate an assembly
15192 instruction. */
15193
15194 static rtx_insn *
15195 aarch64_prev_real_insn (rtx_insn *insn)
15196 {
15197 if (!insn)
15198 return NULL;
15199
15200 do
15201 {
15202 insn = prev_real_insn (insn);
15203 }
15204 while (insn && recog_memoized (insn) < 0);
15205
15206 return insn;
15207 }
15208
15209 static bool
15210 is_madd_op (enum attr_type t1)
15211 {
15212 unsigned int i;
15213 /* A number of these may be AArch32 only. */
15214 enum attr_type mlatypes[] = {
15215 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15216 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15217 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15218 };
15219
15220 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15221 {
15222 if (t1 == mlatypes[i])
15223 return true;
15224 }
15225
15226 return false;
15227 }
15228
15229 /* Check if there is a register dependency between a load and the insn
15230 for which we hold recog_data. */
15231
15232 static bool
15233 dep_between_memop_and_curr (rtx memop)
15234 {
15235 rtx load_reg;
15236 int opno;
15237
15238 gcc_assert (GET_CODE (memop) == SET);
15239
15240 if (!REG_P (SET_DEST (memop)))
15241 return false;
15242
15243 load_reg = SET_DEST (memop);
15244 for (opno = 1; opno < recog_data.n_operands; opno++)
15245 {
15246 rtx operand = recog_data.operand[opno];
15247 if (REG_P (operand)
15248 && reg_overlap_mentioned_p (load_reg, operand))
15249 return true;
15250
15251 }
15252 return false;
15253 }
15254
15255
15256 /* When working around the Cortex-A53 erratum 835769,
15257 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15258 instruction and has a preceding memory instruction such that a NOP
15259 should be inserted between them. */
15260
15261 bool
15262 aarch64_madd_needs_nop (rtx_insn* insn)
15263 {
15264 enum attr_type attr_type;
15265 rtx_insn *prev;
15266 rtx body;
15267
15268 if (!TARGET_FIX_ERR_A53_835769)
15269 return false;
15270
15271 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15272 return false;
15273
15274 attr_type = get_attr_type (insn);
15275 if (!is_madd_op (attr_type))
15276 return false;
15277
15278 prev = aarch64_prev_real_insn (insn);
15279 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15280 Restore recog state to INSN to avoid state corruption. */
15281 extract_constrain_insn_cached (insn);
15282
15283 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15284 return false;
15285
15286 body = single_set (prev);
15287
15288 /* If the previous insn is a memory op and there is no dependency between
15289 it and the DImode madd, emit a NOP between them. If body is NULL then we
15290 have a complex memory operation, probably a load/store pair.
15291 Be conservative for now and emit a NOP. */
15292 if (GET_MODE (recog_data.operand[0]) == DImode
15293 && (!body || !dep_between_memop_and_curr (body)))
15294 return true;
15295
15296 return false;
15297
15298 }
15299
15300
15301 /* Implement FINAL_PRESCAN_INSN. */
15302
15303 void
15304 aarch64_final_prescan_insn (rtx_insn *insn)
15305 {
15306 if (aarch64_madd_needs_nop (insn))
15307 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15308 }
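/* With -mfix-cortex-a53-835769 the effect is, for a hypothetical sequence
   such as

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   to emit

	ldr	x1, [x2]
	nop	// between mem op and mult-accumulate
	madd	x0, x3, x4, x5

   since the 64-bit multiply-accumulate directly follows a memory operation
   and has no register dependency on the loaded value.  */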
15309
15310
15311 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15312 instruction. */
15313
15314 bool
15315 aarch64_sve_index_immediate_p (rtx base_or_step)
15316 {
15317 return (CONST_INT_P (base_or_step)
15318 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15319 }
15320
15321 /* Return true if X is a valid immediate for the SVE ADD and SUB
15322 instructions. Negate X first if NEGATE_P is true. */
15323
15324 bool
15325 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15326 {
15327 rtx elt;
15328
15329 if (!const_vec_duplicate_p (x, &elt)
15330 || !CONST_INT_P (elt))
15331 return false;
15332
15333 HOST_WIDE_INT val = INTVAL (elt);
15334 if (negate_p)
15335 val = -val;
15336 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15337
15338 if (val & 0xff)
15339 return IN_RANGE (val, 0, 0xff);
15340 return IN_RANGE (val, 0, 0xff00);
15341 }
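/* So, for example, a vector of 16-bit elements all equal to 7 or to 0x1100
   is accepted (encodable as #7 and #0x11, LSL #8 respectively), whereas a
   vector of 257s is rejected because 257 needs both bytes.  (Illustrative
   values only.)  */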
15342
15343 /* Return true if X is a valid immediate operand for an SVE logical
15344 instruction such as AND. */
15345
15346 bool
15347 aarch64_sve_bitmask_immediate_p (rtx x)
15348 {
15349 rtx elt;
15350
15351 return (const_vec_duplicate_p (x, &elt)
15352 && CONST_INT_P (elt)
15353 && aarch64_bitmask_imm (INTVAL (elt),
15354 GET_MODE_INNER (GET_MODE (x))));
15355 }
15356
15357 /* Return true if X is a valid immediate for the SVE DUP and CPY
15358 instructions. */
15359
15360 bool
15361 aarch64_sve_dup_immediate_p (rtx x)
15362 {
15363 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15364 if (!CONST_INT_P (x))
15365 return false;
15366
15367 HOST_WIDE_INT val = INTVAL (x);
15368 if (val & 0xff)
15369 return IN_RANGE (val, -0x80, 0x7f);
15370 return IN_RANGE (val, -0x8000, 0x7f00);
15371 }
15372
15373 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15374 SIGNED_P says whether the operand is signed rather than unsigned. */
15375
15376 bool
15377 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15378 {
15379 rtx elt;
15380
15381 return (const_vec_duplicate_p (x, &elt)
15382 && CONST_INT_P (elt)
15383 && (signed_p
15384 ? IN_RANGE (INTVAL (elt), -16, 15)
15385 : IN_RANGE (INTVAL (elt), 0, 127)));
15386 }
15387
15388 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15389 instruction. Negate X first if NEGATE_P is true. */
15390
15391 bool
15392 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15393 {
15394 rtx elt;
15395 REAL_VALUE_TYPE r;
15396
15397 if (!const_vec_duplicate_p (x, &elt)
15398 || GET_CODE (elt) != CONST_DOUBLE)
15399 return false;
15400
15401 r = *CONST_DOUBLE_REAL_VALUE (elt);
15402
15403 if (negate_p)
15404 r = real_value_negate (&r);
15405
15406 if (real_equal (&r, &dconst1))
15407 return true;
15408 if (real_equal (&r, &dconsthalf))
15409 return true;
15410 return false;
15411 }
15412
15413 /* Return true if X is a valid immediate operand for an SVE FMUL
15414 instruction. */
15415
15416 bool
15417 aarch64_sve_float_mul_immediate_p (rtx x)
15418 {
15419 rtx elt;
15420
15421 return (const_vec_duplicate_p (x, &elt)
15422 && GET_CODE (elt) == CONST_DOUBLE
15423 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15424 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15425 }
15426
15427 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15428 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15429 is nonnull, use it to describe valid immediates. */
15430 static bool
15431 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15432 simd_immediate_info *info,
15433 enum simd_immediate_check which,
15434 simd_immediate_info::insn_type insn)
15435 {
15436 /* Try a 4-byte immediate with LSL. */
15437 for (unsigned int shift = 0; shift < 32; shift += 8)
15438 if ((val32 & (0xff << shift)) == val32)
15439 {
15440 if (info)
15441 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15442 simd_immediate_info::LSL, shift);
15443 return true;
15444 }
15445
15446 /* Try a 2-byte immediate with LSL. */
15447 unsigned int imm16 = val32 & 0xffff;
15448 if (imm16 == (val32 >> 16))
15449 for (unsigned int shift = 0; shift < 16; shift += 8)
15450 if ((imm16 & (0xff << shift)) == imm16)
15451 {
15452 if (info)
15453 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15454 simd_immediate_info::LSL, shift);
15455 return true;
15456 }
15457
15458 /* Try a 4-byte immediate with MSL, except for cases that MVN
15459 can handle. */
15460 if (which == AARCH64_CHECK_MOV)
15461 for (unsigned int shift = 8; shift < 24; shift += 8)
15462 {
15463 unsigned int low = (1 << shift) - 1;
15464 if (((val32 & (0xff << shift)) | low) == val32)
15465 {
15466 if (info)
15467 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15468 simd_immediate_info::MSL, shift);
15469 return true;
15470 }
15471 }
15472
15473 return false;
15474 }
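/* Example classifications (illustrative): VAL32 == 0x0000ab00 is matched by
   the 4-byte loop as (SImode, 0xab, LSL #8); VAL32 == 0xab00ab00 is a
   replicated 16-bit value and becomes (HImode, 0xab, LSL #8); and for a MOV
   check, VAL32 == 0x0012ffff is matched by the MSL loop as
   (SImode, 0x12, MSL #16).  */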
15475
15476 /* Return true if replicating VAL64 is a valid immediate for the
15477 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15478 use it to describe valid immediates. */
15479 static bool
15480 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15481 simd_immediate_info *info,
15482 enum simd_immediate_check which)
15483 {
15484 unsigned int val32 = val64 & 0xffffffff;
15485 unsigned int val16 = val64 & 0xffff;
15486 unsigned int val8 = val64 & 0xff;
15487
15488 if (val32 == (val64 >> 32))
15489 {
15490 if ((which & AARCH64_CHECK_ORR) != 0
15491 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15492 simd_immediate_info::MOV))
15493 return true;
15494
15495 if ((which & AARCH64_CHECK_BIC) != 0
15496 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15497 simd_immediate_info::MVN))
15498 return true;
15499
15500 /* Try using a replicated byte. */
15501 if (which == AARCH64_CHECK_MOV
15502 && val16 == (val32 >> 16)
15503 && val8 == (val16 >> 8))
15504 {
15505 if (info)
15506 *info = simd_immediate_info (QImode, val8);
15507 return true;
15508 }
15509 }
15510
15511 /* Try using a bit-to-bytemask. */
15512 if (which == AARCH64_CHECK_MOV)
15513 {
15514 unsigned int i;
15515 for (i = 0; i < 64; i += 8)
15516 {
15517 unsigned char byte = (val64 >> i) & 0xff;
15518 if (byte != 0 && byte != 0xff)
15519 break;
15520 }
15521 if (i == 64)
15522 {
15523 if (info)
15524 *info = simd_immediate_info (DImode, val64);
15525 return true;
15526 }
15527 }
15528 return false;
15529 }
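/* Two more illustrative cases: VAL64 == 0x4242424242424242 is accepted as a
   replicated byte (QImode, 0x42), and VAL64 == 0xff0000ffff0000ff is
   accepted by the bit-to-bytemask check because every byte is either 0x00
   or 0xff.  */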
15530
15531 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15532 instruction. If INFO is nonnull, use it to describe valid immediates. */
15533
15534 static bool
15535 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15536 simd_immediate_info *info)
15537 {
15538 scalar_int_mode mode = DImode;
15539 unsigned int val32 = val64 & 0xffffffff;
15540 if (val32 == (val64 >> 32))
15541 {
15542 mode = SImode;
15543 unsigned int val16 = val32 & 0xffff;
15544 if (val16 == (val32 >> 16))
15545 {
15546 mode = HImode;
15547 unsigned int val8 = val16 & 0xff;
15548 if (val8 == (val16 >> 8))
15549 mode = QImode;
15550 }
15551 }
15552 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15553 if (IN_RANGE (val, -0x80, 0x7f))
15554 {
15555 /* DUP with no shift. */
15556 if (info)
15557 *info = simd_immediate_info (mode, val);
15558 return true;
15559 }
15560 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15561 {
15562 /* DUP with LSL #8. */
15563 if (info)
15564 *info = simd_immediate_info (mode, val);
15565 return true;
15566 }
15567 if (aarch64_bitmask_imm (val64, mode))
15568 {
15569 /* DUPM. */
15570 if (info)
15571 *info = simd_immediate_info (mode, val);
15572 return true;
15573 }
15574 return false;
15575 }
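/* For example (illustrative values): VAL64 == 0x2a2a2a2a2a2a2a2a narrows to
   QImode and is a DUP #42; VAL64 == 0xfe00fe00fe00fe00 narrows to HImode,
   where it is -512, a DUP #-2, LSL #8; and VAL64 == 0x00ff00ff00ff00ff
   narrows to HImode, where 0x00ff is outside DUP range but is a valid
   bitmask immediate, so it is a DUPM.  */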
15576
15577 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15578 it to describe valid immediates. */
15579
15580 static bool
15581 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15582 {
15583 if (x == CONST0_RTX (GET_MODE (x)))
15584 {
15585 if (info)
15586 *info = simd_immediate_info (DImode, 0);
15587 return true;
15588 }
15589
15590 /* Analyze the value as a VNx16BImode. This should be relatively
15591 efficient, since rtx_vector_builder has enough built-in capacity
15592 to store all VLA predicate constants without needing the heap. */
15593 rtx_vector_builder builder;
15594 if (!aarch64_get_sve_pred_bits (builder, x))
15595 return false;
15596
15597 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15598 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15599 {
15600 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15601 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15602 if (pattern != AARCH64_NUM_SVPATTERNS)
15603 {
15604 if (info)
15605 {
15606 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15607 *info = simd_immediate_info (int_mode, pattern);
15608 }
15609 return true;
15610 }
15611 }
15612 return false;
15613 }
15614
15615 /* Return true if OP is a valid SIMD immediate for the operation
15616 described by WHICH. If INFO is nonnull, use it to describe valid
15617 immediates. */
15618 bool
15619 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15620 enum simd_immediate_check which)
15621 {
15622 machine_mode mode = GET_MODE (op);
15623 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15624 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15625 return false;
15626
15627 if (vec_flags & VEC_SVE_PRED)
15628 return aarch64_sve_pred_valid_immediate (op, info);
15629
15630 scalar_mode elt_mode = GET_MODE_INNER (mode);
15631 rtx base, step;
15632 unsigned int n_elts;
15633 if (GET_CODE (op) == CONST_VECTOR
15634 && CONST_VECTOR_DUPLICATE_P (op))
15635 n_elts = CONST_VECTOR_NPATTERNS (op);
15636 else if ((vec_flags & VEC_SVE_DATA)
15637 && const_vec_series_p (op, &base, &step))
15638 {
15639 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15640 if (!aarch64_sve_index_immediate_p (base)
15641 || !aarch64_sve_index_immediate_p (step))
15642 return false;
15643
15644 if (info)
15645 *info = simd_immediate_info (elt_mode, base, step);
15646 return true;
15647 }
15648 else if (GET_CODE (op) == CONST_VECTOR
15649 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15650 /* N_ELTS set above. */;
15651 else
15652 return false;
15653
15654 scalar_float_mode elt_float_mode;
15655 if (n_elts == 1
15656 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15657 {
15658 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15659 if (aarch64_float_const_zero_rtx_p (elt)
15660 || aarch64_float_const_representable_p (elt))
15661 {
15662 if (info)
15663 *info = simd_immediate_info (elt_float_mode, elt);
15664 return true;
15665 }
15666 }
15667
15668 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15669 if (elt_size > 8)
15670 return false;
15671
15672 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15673
15674 /* Expand the vector constant out into a byte vector, with the least
15675 significant byte of the register first. */
15676 auto_vec<unsigned char, 16> bytes;
15677 bytes.reserve (n_elts * elt_size);
15678 for (unsigned int i = 0; i < n_elts; i++)
15679 {
15680 /* The vector is provided in gcc endian-neutral fashion.
15681 For aarch64_be Advanced SIMD, it must be laid out in the vector
15682 register in reverse order. */
15683 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15684 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15685
15686 if (elt_mode != elt_int_mode)
15687 elt = gen_lowpart (elt_int_mode, elt);
15688
15689 if (!CONST_INT_P (elt))
15690 return false;
15691
15692 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15693 for (unsigned int byte = 0; byte < elt_size; byte++)
15694 {
15695 bytes.quick_push (elt_val & 0xff);
15696 elt_val >>= BITS_PER_UNIT;
15697 }
15698 }
15699
15700 /* The immediate must repeat every eight bytes. */
15701 unsigned int nbytes = bytes.length ();
15702 for (unsigned i = 8; i < nbytes; ++i)
15703 if (bytes[i] != bytes[i - 8])
15704 return false;
15705
15706 /* Get the repeating 8-byte value as an integer. No endian correction
15707 is needed here because bytes is already in lsb-first order. */
15708 unsigned HOST_WIDE_INT val64 = 0;
15709 for (unsigned int i = 0; i < 8; i++)
15710 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15711 << (i * BITS_PER_UNIT));
15712
15713 if (vec_flags & VEC_SVE_DATA)
15714 return aarch64_sve_valid_immediate (val64, info);
15715 else
15716 return aarch64_advsimd_valid_immediate (val64, info, which);
15717 }
15718
15719 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15720 has a step in the range of the SVE INDEX instruction. Return the step
15721 if so, otherwise return null. */
15722 rtx
15723 aarch64_check_zero_based_sve_index_immediate (rtx x)
15724 {
15725 rtx base, step;
15726 if (const_vec_series_p (x, &base, &step)
15727 && base == const0_rtx
15728 && aarch64_sve_index_immediate_p (step))
15729 return step;
15730 return NULL_RTX;
15731 }
15732
15733 /* Check if immediate shift constants are within range. */
15734 bool
15735 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15736 {
15737 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15738 if (left)
15739 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15740 else
15741 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15742 }
15743
15744 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15745 operation of width WIDTH at bit position POS. */
15746
15747 rtx
15748 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15749 {
15750 gcc_assert (CONST_INT_P (width));
15751 gcc_assert (CONST_INT_P (pos));
15752
15753 unsigned HOST_WIDE_INT mask
15754 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15755 return GEN_INT (mask << UINTVAL (pos));
15756 }
15757
15758 bool
15759 aarch64_mov_operand_p (rtx x, machine_mode mode)
15760 {
15761 if (GET_CODE (x) == HIGH
15762 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15763 return true;
15764
15765 if (CONST_INT_P (x))
15766 return true;
15767
15768 if (VECTOR_MODE_P (GET_MODE (x)))
15769 {
15770 /* Require predicate constants to be VNx16BI before RA, so that we
15771 force everything to have a canonical form. */
15772 if (!lra_in_progress
15773 && !reload_completed
15774 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15775 && GET_MODE (x) != VNx16BImode)
15776 return false;
15777
15778 return aarch64_simd_valid_immediate (x, NULL);
15779 }
15780
15781 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15782 return true;
15783
15784 if (aarch64_sve_cnt_immediate_p (x))
15785 return true;
15786
15787 return aarch64_classify_symbolic_expression (x)
15788 == SYMBOL_TINY_ABSOLUTE;
15789 }
15790
15791 /* Return a const_int vector of VAL. */
15792 rtx
15793 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15794 {
15795 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15796 return gen_const_vec_duplicate (mode, c);
15797 }
15798
15799 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15800
15801 bool
15802 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15803 {
15804 machine_mode vmode;
15805
15806 vmode = aarch64_simd_container_mode (mode, 64);
15807 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15808 return aarch64_simd_valid_immediate (op_v, NULL);
15809 }
15810
15811 /* Construct and return a PARALLEL RTX vector with elements numbering the
15812 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15813 the vector - from the perspective of the architecture. This does not
15814 line up with GCC's perspective on lane numbers, so we end up with
15815 different masks depending on our target endian-ness. The diagram
15816 below may help. We must draw the distinction when building masks
15817 which select one half of the vector. An instruction selecting
15818 architectural low-lanes for a big-endian target, must be described using
15819 a mask selecting GCC high-lanes.
15820
15821              Big-Endian             Little-Endian
15822
15823 GCC            0   1   2   3          3   2   1   0
15824              | x | x | x | x |      | x | x | x | x |
15825 Architecture   3   2   1   0          3   2   1   0
15826
15827 Low Mask:        { 2, 3 }               { 0, 1 }
15828 High Mask:       { 0, 1 }               { 2, 3 }
15829
15830 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15831
15832 rtx
15833 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15834 {
15835 rtvec v = rtvec_alloc (nunits / 2);
15836 int high_base = nunits / 2;
15837 int low_base = 0;
15838 int base;
15839 rtx t1;
15840 int i;
15841
15842 if (BYTES_BIG_ENDIAN)
15843 base = high ? low_base : high_base;
15844 else
15845 base = high ? high_base : low_base;
15846
15847 for (i = 0; i < nunits / 2; i++)
15848 RTVEC_ELT (v, i) = GEN_INT (base + i);
15849
15850 t1 = gen_rtx_PARALLEL (mode, v);
15851 return t1;
15852 }
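/* For instance, with MODE == V4SImode and NUNITS == 4 (an illustrative
   call): HIGH == false yields (parallel [0 1]) on little-endian and
   (parallel [2 3]) on big-endian, matching the Low Mask row of the diagram
   above; HIGH == true yields the opposite pair.  */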
15853
15854 /* Check OP for validity as a PARALLEL RTX vector with elements
15855 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15856 from the perspective of the architecture. See the diagram above
15857 aarch64_simd_vect_par_cnst_half for more details. */
15858
15859 bool
15860 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15861 bool high)
15862 {
15863 int nelts;
15864 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15865 return false;
15866
15867 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15868 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15869 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15870 int i = 0;
15871
15872 if (count_op != count_ideal)
15873 return false;
15874
15875 for (i = 0; i < count_ideal; i++)
15876 {
15877 rtx elt_op = XVECEXP (op, 0, i);
15878 rtx elt_ideal = XVECEXP (ideal, 0, i);
15879
15880 if (!CONST_INT_P (elt_op)
15881 || INTVAL (elt_ideal) != INTVAL (elt_op))
15882 return false;
15883 }
15884 return true;
15885 }
15886
15887 /* Return a PARALLEL containing NELTS elements, with element I equal
15888 to BASE + I * STEP. */
15889
15890 rtx
15891 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15892 {
15893 rtvec vec = rtvec_alloc (nelts);
15894 for (unsigned int i = 0; i < nelts; ++i)
15895 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15896 return gen_rtx_PARALLEL (VOIDmode, vec);
15897 }
15898
15899 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15900 series with step STEP. */
15901
15902 bool
15903 aarch64_stepped_int_parallel_p (rtx op, int step)
15904 {
15905 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15906 return false;
15907
15908 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15909 for (int i = 1; i < XVECLEN (op, 0); ++i)
15910 if (!CONST_INT_P (XVECEXP (op, 0, i))
15911 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15912 return false;
15913
15914 return true;
15915 }
15916
15917 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15918 HIGH (exclusive). */
15919 void
15920 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15921 const_tree exp)
15922 {
15923 HOST_WIDE_INT lane;
15924 gcc_assert (CONST_INT_P (operand));
15925 lane = INTVAL (operand);
15926
15927 if (lane < low || lane >= high)
15928 {
15929 if (exp)
15930 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15931 else
15932 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15933 }
15934 }
15935
15936 /* Perform endian correction on lane number N, which indexes a vector
15937 of mode MODE, and return the result as an SImode rtx. */
15938
15939 rtx
15940 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15941 {
15942 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15943 }
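
/* For example, GCC lane 1 of a V4SImode vector stays lane 1 on a
   little-endian target but maps to architectural lane 2 on a big-endian
   target, since the lane numbering is reversed there (see the diagram
   above aarch64_simd_vect_par_cnst_half).  */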
15944
15945 /* Return TRUE if OP is a valid vector addressing mode. */
15946
15947 bool
15948 aarch64_simd_mem_operand_p (rtx op)
15949 {
15950 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15951 || REG_P (XEXP (op, 0)));
15952 }
15953
15954 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15955
15956 bool
15957 aarch64_sve_ld1r_operand_p (rtx op)
15958 {
15959 struct aarch64_address_info addr;
15960 scalar_mode mode;
15961
15962 return (MEM_P (op)
15963 && is_a <scalar_mode> (GET_MODE (op), &mode)
15964 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15965 && addr.type == ADDRESS_REG_IMM
15966 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15967 }
15968
15969 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15970 bool
15971 aarch64_sve_ld1rq_operand_p (rtx op)
15972 {
15973 struct aarch64_address_info addr;
15974 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15975 if (!MEM_P (op)
15976 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15977 return false;
15978
15979 if (addr.type == ADDRESS_REG_IMM)
15980 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15981
15982 if (addr.type == ADDRESS_REG_REG)
15983 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15984
15985 return false;
15986 }
15987
15988 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15989 The conditions for STR are the same. */
15990 bool
15991 aarch64_sve_ldr_operand_p (rtx op)
15992 {
15993 struct aarch64_address_info addr;
15994
15995 return (MEM_P (op)
15996 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15997 false, ADDR_QUERY_ANY)
15998 && addr.type == ADDRESS_REG_IMM);
15999 }
16000
16001 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
16002 We need to be able to access the individual pieces, so the range
16003 is different from LD[234] and ST[234]. */
16004 bool
16005 aarch64_sve_struct_memory_operand_p (rtx op)
16006 {
16007 if (!MEM_P (op))
16008 return false;
16009
16010 machine_mode mode = GET_MODE (op);
16011 struct aarch64_address_info addr;
16012 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
16013 ADDR_QUERY_ANY)
16014 || addr.type != ADDRESS_REG_IMM)
16015 return false;
16016
16017 poly_int64 first = addr.const_offset;
16018 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
16019 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
16020 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
16021 }
16022
16023 /* Emit a register copy from operand to operand, taking care not to
16024 early-clobber source registers in the process.
16025
16026 COUNT is the number of components into which the copy needs to be
16027 decomposed. */
16028 void
16029 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
16030 unsigned int count)
16031 {
16032 unsigned int i;
16033 int rdest = REGNO (operands[0]);
16034 int rsrc = REGNO (operands[1]);
16035
16036 if (!reg_overlap_mentioned_p (operands[0], operands[1])
16037 || rdest < rsrc)
16038 for (i = 0; i < count; i++)
16039 emit_move_insn (gen_rtx_REG (mode, rdest + i),
16040 gen_rtx_REG (mode, rsrc + i));
16041 else
16042 for (i = 0; i < count; i++)
16043 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
16044 gen_rtx_REG (mode, rsrc + count - i - 1));
16045 }
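
/* For example, when the destination block starts at V0 and the overlapping
   source block starts at V1, the forward loop (V0 <- V1, then V1 <- V2)
   reads each source register before it is clobbered; when the destination
   starts at the higher register number, the backward loop is used for the
   same reason.  */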
16046
16047 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16048 one of the VSTRUCT modes: OI, CI, or XI. */
16049 int
16050 aarch64_simd_attr_length_rglist (machine_mode mode)
16051 {
16052 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16053 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16054 }
16055
16056 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16057 alignment of a vector to 128 bits. SVE predicates have an alignment of
16058 16 bits. */
16059 static HOST_WIDE_INT
16060 aarch64_simd_vector_alignment (const_tree type)
16061 {
16062 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16063 be set for non-predicate vectors of booleans. Modes are the most
16064 direct way we have of identifying real SVE predicate types. */
16065 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
16066 return 16;
16067 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16068 return 128;
16069 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
16070 }
16071
16072 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16073 static poly_uint64
16074 aarch64_vectorize_preferred_vector_alignment (const_tree type)
16075 {
16076 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
16077 {
16078 /* If the length of the vector is fixed, try to align to that length,
16079 otherwise don't try to align at all. */
16080 HOST_WIDE_INT result;
16081 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
16082 result = TYPE_ALIGN (TREE_TYPE (type));
16083 return result;
16084 }
16085 return TYPE_ALIGN (type);
16086 }
16087
16088 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16089 static bool
16090 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
16091 {
16092 if (is_packed)
16093 return false;
16094
16095 /* For fixed-length vectors, check that the vectorizer will aim for
16096 full-vector alignment. This isn't true for generic GCC vectors
16097 that are wider than the ABI maximum of 128 bits. */
16098 poly_uint64 preferred_alignment =
16099 aarch64_vectorize_preferred_vector_alignment (type);
16100 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16101 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16102 preferred_alignment))
16103 return false;
16104
16105 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16106 return true;
16107 }
16108
16109 /* Return true if the vector misalignment factor is supported by the
16110 target. */
16111 static bool
16112 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16113 const_tree type, int misalignment,
16114 bool is_packed)
16115 {
16116 if (TARGET_SIMD && STRICT_ALIGNMENT)
16117 {
16118 /* Return if movmisalign pattern is not supported for this mode. */
16119 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16120 return false;
16121
16122 /* Misalignment factor is unknown at compile time. */
16123 if (misalignment == -1)
16124 return false;
16125 }
16126 return default_builtin_support_vector_misalignment (mode, type, misalignment,
16127 is_packed);
16128 }
16129
16130 /* If VALS is a vector constant that can be loaded into a register
16131 using DUP, generate instructions to do so and return an RTX to
16132 assign to the register. Otherwise return NULL_RTX. */
16133 static rtx
16134 aarch64_simd_dup_constant (rtx vals)
16135 {
16136 machine_mode mode = GET_MODE (vals);
16137 machine_mode inner_mode = GET_MODE_INNER (mode);
16138 rtx x;
16139
16140 if (!const_vec_duplicate_p (vals, &x))
16141 return NULL_RTX;
16142
16143 /* We can load this constant by using DUP and a constant in a
16144 single ARM register. This will be cheaper than a vector
16145 load. */
16146 x = copy_to_mode_reg (inner_mode, x);
16147 return gen_vec_duplicate (mode, x);
16148 }
16149
16150
16151 /* Generate code to load VALS, which is a PARALLEL containing only
16152 constants (for vec_init) or CONST_VECTOR, efficiently into a
16153 register. Returns an RTX to copy into the register, or NULL_RTX
16154 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16155 static rtx
16156 aarch64_simd_make_constant (rtx vals)
16157 {
16158 machine_mode mode = GET_MODE (vals);
16159 rtx const_dup;
16160 rtx const_vec = NULL_RTX;
16161 int n_const = 0;
16162 int i;
16163
16164 if (GET_CODE (vals) == CONST_VECTOR)
16165 const_vec = vals;
16166 else if (GET_CODE (vals) == PARALLEL)
16167 {
16168 /* A CONST_VECTOR must contain only CONST_INTs and
16169 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16170 Only store valid constants in a CONST_VECTOR. */
16171 int n_elts = XVECLEN (vals, 0);
16172 for (i = 0; i < n_elts; ++i)
16173 {
16174 rtx x = XVECEXP (vals, 0, i);
16175 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16176 n_const++;
16177 }
16178 if (n_const == n_elts)
16179 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16180 }
16181 else
16182 gcc_unreachable ();
16183
16184 if (const_vec != NULL_RTX
16185 && aarch64_simd_valid_immediate (const_vec, NULL))
16186 /* Load using MOVI/MVNI. */
16187 return const_vec;
16188 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16189 /* Loaded using DUP. */
16190 return const_dup;
16191 else if (const_vec != NULL_RTX)
16192 /* Load from constant pool. We cannot take advantage of single-cycle
16193 LD1 because we need a PC-relative addressing mode. */
16194 return const_vec;
16195 else
16196 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16197 We cannot construct an initializer. */
16198 return NULL_RTX;
16199 }
16200
16201 /* Expand a vector initialisation sequence, such that TARGET is
16202 initialised to contain VALS. */
16203
16204 void
16205 aarch64_expand_vector_init (rtx target, rtx vals)
16206 {
16207 machine_mode mode = GET_MODE (target);
16208 scalar_mode inner_mode = GET_MODE_INNER (mode);
16209 /* The number of vector elements. */
16210 int n_elts = XVECLEN (vals, 0);
16211 /* The number of vector elements which are not constant. */
16212 int n_var = 0;
16213 rtx any_const = NULL_RTX;
16214 /* The first element of vals. */
16215 rtx v0 = XVECEXP (vals, 0, 0);
16216 bool all_same = true;
16217
16218 /* This is a special vec_init<M><N> where N is not an element mode but a
16219 vector mode with half the elements of M. We expect to find two entries
16220 of mode N in VALS and we must put their concatenation into TARGET. */
16221 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16222 {
16223 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16224 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16225 rtx lo = XVECEXP (vals, 0, 0);
16226 rtx hi = XVECEXP (vals, 0, 1);
16227 machine_mode narrow_mode = GET_MODE (lo);
16228 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16229 gcc_assert (narrow_mode == GET_MODE (hi));
16230
16231 /* When we want to concatenate a half-width vector with zeroes we can
16232 use the aarch64_combinez[_be] patterns. Just make sure that the
16233 zeroes are in the right half. */
16234 if (BYTES_BIG_ENDIAN
16235 && aarch64_simd_imm_zero (lo, narrow_mode)
16236 && general_operand (hi, narrow_mode))
16237 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16238 else if (!BYTES_BIG_ENDIAN
16239 && aarch64_simd_imm_zero (hi, narrow_mode)
16240 && general_operand (lo, narrow_mode))
16241 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16242 else
16243 {
16244 /* Else create the two half-width registers and combine them. */
16245 if (!REG_P (lo))
16246 lo = force_reg (GET_MODE (lo), lo);
16247 if (!REG_P (hi))
16248 hi = force_reg (GET_MODE (hi), hi);
16249
16250 if (BYTES_BIG_ENDIAN)
16251 std::swap (lo, hi);
16252 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16253 }
16254 return;
16255 }
16256
16257 /* Count the number of variable elements to initialise. */
16258 for (int i = 0; i < n_elts; ++i)
16259 {
16260 rtx x = XVECEXP (vals, 0, i);
16261 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16262 ++n_var;
16263 else
16264 any_const = x;
16265
16266 all_same &= rtx_equal_p (x, v0);
16267 }
16268
16269 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16270 how best to handle this. */
16271 if (n_var == 0)
16272 {
16273 rtx constant = aarch64_simd_make_constant (vals);
16274 if (constant != NULL_RTX)
16275 {
16276 emit_move_insn (target, constant);
16277 return;
16278 }
16279 }
16280
16281 /* Splat a single non-constant element if we can. */
16282 if (all_same)
16283 {
16284 rtx x = copy_to_mode_reg (inner_mode, v0);
16285 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16286 return;
16287 }
16288
16289 enum insn_code icode = optab_handler (vec_set_optab, mode);
16290 gcc_assert (icode != CODE_FOR_nothing);
16291
16292 /* If there are only variable elements, try to optimize
16293 the insertion using dup for the most common element
16294 followed by insertions. */
16295
16296 /* The algorithm will fill matches[*][0] with the earliest matching element,
16297 and matches[X][1] with the count of duplicate elements (if X is the
16298 earliest element which has duplicates). */
16299
16300 if (n_var == n_elts && n_elts <= 16)
16301 {
16302 int matches[16][2] = {0};
16303 for (int i = 0; i < n_elts; i++)
16304 {
16305 for (int j = 0; j <= i; j++)
16306 {
16307 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16308 {
16309 matches[i][0] = j;
16310 matches[j][1]++;
16311 break;
16312 }
16313 }
16314 }
16315 int maxelement = 0;
16316 int maxv = 0;
16317 for (int i = 0; i < n_elts; i++)
16318 if (matches[i][1] > maxv)
16319 {
16320 maxelement = i;
16321 maxv = matches[i][1];
16322 }
16323
16324 /* Create a duplicate of the most common element, unless all elements
16325 are equally useless to us, in which case just immediately set the
16326 vector register using the first element. */
16327
16328 if (maxv == 1)
16329 {
16330 /* For vectors of two 64-bit elements, we can do even better. */
16331 if (n_elts == 2
16332 && (inner_mode == E_DImode
16333 || inner_mode == E_DFmode))
16334
16335 {
16336 rtx x0 = XVECEXP (vals, 0, 0);
16337 rtx x1 = XVECEXP (vals, 0, 1);
16338 /* Combine can pick up this case, but handling it directly
16339 here leaves clearer RTL.
16340
16341 This is load_pair_lanes<mode>, and also gives us a clean-up
16342 for store_pair_lanes<mode>. */
16343 if (memory_operand (x0, inner_mode)
16344 && memory_operand (x1, inner_mode)
16345 && !STRICT_ALIGNMENT
16346 && rtx_equal_p (XEXP (x1, 0),
16347 plus_constant (Pmode,
16348 XEXP (x0, 0),
16349 GET_MODE_SIZE (inner_mode))))
16350 {
16351 rtx t;
16352 if (inner_mode == DFmode)
16353 t = gen_load_pair_lanesdf (target, x0, x1);
16354 else
16355 t = gen_load_pair_lanesdi (target, x0, x1);
16356 emit_insn (t);
16357 return;
16358 }
16359 }
16360 /* The subreg-move sequence below will move into lane zero of the
16361 vector register. For big-endian we want that position to hold
16362 the last element of VALS. */
16363 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16364 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16365 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16366 }
16367 else
16368 {
16369 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16370 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16371 }
16372
16373 /* Insert the rest. */
16374 for (int i = 0; i < n_elts; i++)
16375 {
16376 rtx x = XVECEXP (vals, 0, i);
16377 if (matches[i][0] == maxelement)
16378 continue;
16379 x = copy_to_mode_reg (inner_mode, x);
16380 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16381 }
16382 return;
16383 }
16384
16385 /* Initialise a vector which is part-variable. We want to first try
16386 to build those lanes which are constant in the most efficient way we
16387 can. */
16388 if (n_var != n_elts)
16389 {
16390 rtx copy = copy_rtx (vals);
16391
16392 /* Load constant part of vector. We really don't care what goes into the
16393 parts we will overwrite, but we're more likely to be able to load the
16394 constant efficiently if it has fewer, larger, repeating parts
16395 (see aarch64_simd_valid_immediate). */
16396 for (int i = 0; i < n_elts; i++)
16397 {
16398 rtx x = XVECEXP (vals, 0, i);
16399 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16400 continue;
16401 rtx subst = any_const;
16402 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16403 {
16404 /* Look in the copied vector, as more elements are const. */
16405 rtx test = XVECEXP (copy, 0, i ^ bit);
16406 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16407 {
16408 subst = test;
16409 break;
16410 }
16411 }
16412 XVECEXP (copy, 0, i) = subst;
16413 }
16414 aarch64_expand_vector_init (target, copy);
16415 }
16416
16417 /* Insert the variable lanes directly. */
16418 for (int i = 0; i < n_elts; i++)
16419 {
16420 rtx x = XVECEXP (vals, 0, i);
16421 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16422 continue;
16423 x = copy_to_mode_reg (inner_mode, x);
16424 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16425 }
16426 }
16427
16428 /* Emit RTL corresponding to:
16429 insr TARGET, ELEM. */
16430
16431 static void
16432 emit_insr (rtx target, rtx elem)
16433 {
16434 machine_mode mode = GET_MODE (target);
16435 scalar_mode elem_mode = GET_MODE_INNER (mode);
16436 elem = force_reg (elem_mode, elem);
16437
16438 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16439 gcc_assert (icode != CODE_FOR_nothing);
16440 emit_insn (GEN_FCN (icode) (target, target, elem));
16441 }
16442
16443 /* Subroutine of aarch64_sve_expand_vector_init for handling
16444 trailing constants.
16445 This function works as follows:
16446 (a) Create a new vector consisting of trailing constants.
16447 (b) Initialize TARGET with the constant vector using emit_move_insn.
16448 (c) Insert remaining elements in TARGET using insr.
16449 NELTS is the total number of elements in the original vector, while
16450 NELTS_REQD is the number of elements that are actually
16451 significant.
16452
16453 ??? The heuristic used is to do the above only if the number of constants
16454 is at least half the total number of elements. May need fine-tuning.
16455
16456 static bool
16457 aarch64_sve_expand_vector_init_handle_trailing_constants
16458 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16459 {
16460 machine_mode mode = GET_MODE (target);
16461 scalar_mode elem_mode = GET_MODE_INNER (mode);
16462 int n_trailing_constants = 0;
16463
16464 for (int i = nelts_reqd - 1;
16465 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16466 i--)
16467 n_trailing_constants++;
16468
16469 if (n_trailing_constants >= nelts_reqd / 2)
16470 {
16471 rtx_vector_builder v (mode, 1, nelts);
16472 for (int i = 0; i < nelts; i++)
16473 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16474 rtx const_vec = v.build ();
16475 emit_move_insn (target, const_vec);
16476
16477 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16478 emit_insr (target, builder.elt (i));
16479
16480 return true;
16481 }
16482
16483 return false;
16484 }
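
/* For example, with NELTS_REQD == 4 and BUILDER == { x, y, 1, 2 } (where x
   and y are non-constant), there are two trailing constants, so TARGET is
   first set to a constant vector whose leading lanes are { 1, 2, ... } and
   then
     insr TARGET, y
     insr TARGET, x
   shift those constants right, leaving the significant lanes of TARGET
   equal to { x, y, 1, 2 }.  */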
16485
16486 /* Subroutine of aarch64_sve_expand_vector_init.
16487 Works as follows:
16488 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16489 (b) Skip trailing elements from BUILDER, which are the same as
16490 element NELTS_REQD - 1.
16491 (c) Insert earlier elements in reverse order in TARGET using insr. */
16492
16493 static void
16494 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16495 const rtx_vector_builder &builder,
16496 int nelts_reqd)
16497 {
16498 machine_mode mode = GET_MODE (target);
16499 scalar_mode elem_mode = GET_MODE_INNER (mode);
16500
16501 struct expand_operand ops[2];
16502 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16503 gcc_assert (icode != CODE_FOR_nothing);
16504
16505 create_output_operand (&ops[0], target, mode);
16506 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16507 expand_insn (icode, 2, ops);
16508
16509 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16510 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16511 emit_insr (target, builder.elt (i));
16512 }
16513
16514 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16515 when all trailing elements of builder are same.
16516 This works as follows:
16517 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16518 (b) Insert remaining elements in TARGET using insr.
16519
16520 ??? The heuristic used is to do the above if the number of identical
16521 trailing elements is at least 3/4 of the total number of elements,
16522 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning.
16523
16524 static bool
16525 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16526 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16527 {
16528 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16529 if (ndups >= (3 * nelts_reqd) / 4)
16530 {
16531 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16532 nelts_reqd - ndups + 1);
16533 return true;
16534 }
16535
16536 return false;
16537 }
16538
16539 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16540 of elements in BUILDER.
16541
16542 The function tries to initialize TARGET from BUILDER if it fits one
16543 of the special cases outlined below.
16544
16545 Failing that, the function divides BUILDER into two sub-vectors:
16546 v_even = even elements of BUILDER;
16547 v_odd = odd elements of BUILDER;
16548
16549 and recursively calls itself with v_even and v_odd.
16550
16551 if (recursive call succeeded for v_even or v_odd)
16552 TARGET = zip (v_even, v_odd)
16553
16554 The function returns true if it managed to build TARGET from BUILDER
16555 with one of the special cases, false otherwise.
16556
16557 Example: {a, 1, b, 2, c, 3, d, 4}
16558
16559 The vector gets divided into:
16560 v_even = {a, b, c, d}
16561 v_odd = {1, 2, 3, 4}
16562
16563 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16564 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16565
16566 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16567 4 elements, so we construct tmp1 from v_even using insr:
16568 tmp1 = dup(d)
16569 insr tmp1, c
16570 insr tmp1, b
16571 insr tmp1, a
16572
16573 And finally:
16574 TARGET = zip (tmp1, tmp2)
16575 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16576
16577 static bool
16578 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16579 int nelts, int nelts_reqd)
16580 {
16581 machine_mode mode = GET_MODE (target);
16582
16583 /* Case 1: Vector contains trailing constants. */
16584
16585 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16586 (target, builder, nelts, nelts_reqd))
16587 return true;
16588
16589 /* Case 2: Vector contains leading constants. */
16590
16591 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16592 for (int i = 0; i < nelts_reqd; i++)
16593 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16594 rev_builder.finalize ();
16595
16596 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16597 (target, rev_builder, nelts, nelts_reqd))
16598 {
16599 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16600 return true;
16601 }
16602
16603 /* Case 3: Vector contains trailing same element. */
16604
16605 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16606 (target, builder, nelts_reqd))
16607 return true;
16608
16609 /* Case 4: Vector contains leading same element. */
16610
16611 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16612 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16613 {
16614 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16615 return true;
16616 }
16617
16618 /* Avoid recursing below 4 elements.
16619 ??? The threshold 4 may need fine-tuning. */
16620
16621 if (nelts_reqd <= 4)
16622 return false;
16623
16624 rtx_vector_builder v_even (mode, 1, nelts);
16625 rtx_vector_builder v_odd (mode, 1, nelts);
16626
16627 for (int i = 0; i < nelts * 2; i += 2)
16628 {
16629 v_even.quick_push (builder.elt (i));
16630 v_odd.quick_push (builder.elt (i + 1));
16631 }
16632
16633 v_even.finalize ();
16634 v_odd.finalize ();
16635
16636 rtx tmp1 = gen_reg_rtx (mode);
16637 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16638 nelts, nelts_reqd / 2);
16639
16640 rtx tmp2 = gen_reg_rtx (mode);
16641 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16642 nelts, nelts_reqd / 2);
16643
16644 if (!did_even_p && !did_odd_p)
16645 return false;
16646
16647 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16648 special cases and zip v_even, v_odd. */
16649
16650 if (!did_even_p)
16651 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16652
16653 if (!did_odd_p)
16654 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16655
16656 rtvec v = gen_rtvec (2, tmp1, tmp2);
16657 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16658 return true;
16659 }
16660
16661 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16662
16663 void
16664 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16665 {
16666 machine_mode mode = GET_MODE (target);
16667 int nelts = XVECLEN (vals, 0);
16668
16669 rtx_vector_builder v (mode, 1, nelts);
16670 for (int i = 0; i < nelts; i++)
16671 v.quick_push (XVECEXP (vals, 0, i));
16672 v.finalize ();
16673
16674 /* If neither sub-vector of v could be initialized specially,
16675 then use INSR to insert all elements from v into TARGET.
16676 ??? This might not be optimal for vectors with large
16677 initializers like 16-element or above.
16678 For nelts < 4, it probably isn't useful to handle specially. */
16679
16680 if (nelts < 4
16681 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16682 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16683 }
16684
16685 /* Check whether VALUE is a vector constant in which every element
16686 is either a power of 2 or a negated power of 2. If so, return
16687 a constant vector of log2s, and flip CODE between PLUS and MINUS
16688 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16689
16690 static rtx
16691 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16692 {
16693 if (GET_CODE (value) != CONST_VECTOR)
16694 return NULL_RTX;
16695
16696 rtx_vector_builder builder;
16697 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16698 return NULL_RTX;
16699
16700 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16701 /* 1 if the result of the multiplication must be negated,
16702 0 if it mustn't, or -1 if we don't yet care. */
16703 int negate = -1;
16704 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16705 for (unsigned int i = 0; i < encoded_nelts; ++i)
16706 {
16707 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16708 if (!CONST_SCALAR_INT_P (elt))
16709 return NULL_RTX;
16710 rtx_mode_t val (elt, int_mode);
16711 wide_int pow2 = wi::neg (val);
16712 if (val != pow2)
16713 {
16714 /* It matters whether we negate or not. Make that choice,
16715 and make sure that it's consistent with previous elements. */
16716 if (negate == !wi::neg_p (val))
16717 return NULL_RTX;
16718 negate = wi::neg_p (val);
16719 if (!negate)
16720 pow2 = val;
16721 }
16722 /* POW2 is now the value that we want to be a power of 2. */
16723 int shift = wi::exact_log2 (pow2);
16724 if (shift < 0)
16725 return NULL_RTX;
16726 builder.quick_push (gen_int_mode (shift, int_mode));
16727 }
16728 if (negate == -1)
16729 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16730 code = PLUS;
16731 else if (negate == 1)
16732 code = code == PLUS ? MINUS : PLUS;
16733 return builder.build ();
16734 }
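
/* For example, a CONST_VECTOR of { 4, 4, ... } is converted to the shift
   vector { 2, 2, ... } with CODE left unchanged, while { -4, -4, ... }
   yields the same shift vector but flips CODE between PLUS and MINUS, so
   that x * -4 + y is emitted as y - (x << 2) by the callers below.  */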
16735
16736 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16737 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16738 operands array, in the same order as for fma_optab. Return true if
16739 the function emitted all the necessary instructions, false if the caller
16740 should generate the pattern normally with the new OPERANDS array. */
16741
16742 bool
16743 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16744 {
16745 machine_mode mode = GET_MODE (operands[0]);
16746 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16747 {
16748 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16749 NULL_RTX, true, OPTAB_DIRECT);
16750 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16751 operands[3], product, operands[0], true,
16752 OPTAB_DIRECT);
16753 return true;
16754 }
16755 operands[2] = force_reg (mode, operands[2]);
16756 return false;
16757 }
16758
16759 /* Likewise, but for a conditional pattern. */
16760
16761 bool
16762 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16763 {
16764 machine_mode mode = GET_MODE (operands[0]);
16765 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16766 {
16767 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16768 NULL_RTX, true, OPTAB_DIRECT);
16769 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16770 operands[4], product, operands[5]));
16771 return true;
16772 }
16773 operands[3] = force_reg (mode, operands[3]);
16774 return false;
16775 }
16776
16777 static unsigned HOST_WIDE_INT
16778 aarch64_shift_truncation_mask (machine_mode mode)
16779 {
16780 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16781 return 0;
16782 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16783 }
16784
16785 /* Select a format to encode pointers in exception handling data. */
16786 int
16787 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16788 {
16789 int type;
16790 switch (aarch64_cmodel)
16791 {
16792 case AARCH64_CMODEL_TINY:
16793 case AARCH64_CMODEL_TINY_PIC:
16794 case AARCH64_CMODEL_SMALL:
16795 case AARCH64_CMODEL_SMALL_PIC:
16796 case AARCH64_CMODEL_SMALL_SPIC:
16797 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16798 for everything. */
16799 type = DW_EH_PE_sdata4;
16800 break;
16801 default:
16802 /* No assumptions here. 8-byte relocs required. */
16803 type = DW_EH_PE_sdata8;
16804 break;
16805 }
16806 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16807 }
16808
16809 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16810
16811 static void
16812 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16813 {
16814 if (aarch64_simd_decl_p (decl))
16815 {
16816 fprintf (stream, "\t.variant_pcs\t");
16817 assemble_name (stream, name);
16818 fprintf (stream, "\n");
16819 }
16820 }
16821
16822 /* The last .arch and .tune assembly strings that we printed. */
16823 static std::string aarch64_last_printed_arch_string;
16824 static std::string aarch64_last_printed_tune_string;
16825
16826 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16827 by the function fndecl. */
16828
16829 void
16830 aarch64_declare_function_name (FILE *stream, const char* name,
16831 tree fndecl)
16832 {
16833 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16834
16835 struct cl_target_option *targ_options;
16836 if (target_parts)
16837 targ_options = TREE_TARGET_OPTION (target_parts);
16838 else
16839 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16840 gcc_assert (targ_options);
16841
16842 const struct processor *this_arch
16843 = aarch64_get_arch (targ_options->x_explicit_arch);
16844
16845 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16846 std::string extension
16847 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16848 this_arch->flags);
16849 /* Only update the assembler .arch string if it is distinct from the last
16850 such string we printed. */
16851 std::string to_print = this_arch->name + extension;
16852 if (to_print != aarch64_last_printed_arch_string)
16853 {
16854 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16855 aarch64_last_printed_arch_string = to_print;
16856 }
16857
16858 /* Print the cpu name we're tuning for in the comments; it might be
16859 useful to readers of the generated asm. Do it only when it changes
16860 from function to function and verbose assembly is requested. */
16861 const struct processor *this_tune
16862 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16863
16864 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16865 {
16866 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16867 this_tune->name);
16868 aarch64_last_printed_tune_string = this_tune->name;
16869 }
16870
16871 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16872
16873 /* Don't forget the type directive for ELF. */
16874 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16875 ASM_OUTPUT_LABEL (stream, name);
16876 }
16877
16878 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16879
16880 void
16881 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16882 {
16883 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16884 const char *value = IDENTIFIER_POINTER (target);
16885 aarch64_asm_output_variant_pcs (stream, decl, name);
16886 ASM_OUTPUT_DEF (stream, name, value);
16887 }
16888
16889 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16890 function symbol references. */
16891
16892 void
16893 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16894 {
16895 default_elf_asm_output_external (stream, decl, name);
16896 aarch64_asm_output_variant_pcs (stream, decl, name);
16897 }
16898
16899 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16900 Used to output the .cfi_b_key_frame directive when signing the current
16901 function with the B key. */
16902
16903 void
16904 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16905 {
16906 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16907 && aarch64_ra_sign_key == AARCH64_KEY_B)
16908 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16909 }
16910
16911 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16912
16913 static void
16914 aarch64_start_file (void)
16915 {
16916 struct cl_target_option *default_options
16917 = TREE_TARGET_OPTION (target_option_default_node);
16918
16919 const struct processor *default_arch
16920 = aarch64_get_arch (default_options->x_explicit_arch);
16921 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16922 std::string extension
16923 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16924 default_arch->flags);
16925
16926 aarch64_last_printed_arch_string = default_arch->name + extension;
16927 aarch64_last_printed_tune_string = "";
16928 asm_fprintf (asm_out_file, "\t.arch %s\n",
16929 aarch64_last_printed_arch_string.c_str ());
16930
16931 default_file_start ();
16932 }
16933
16934 /* Emit load exclusive. */
16935
16936 static void
16937 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16938 rtx mem, rtx model_rtx)
16939 {
16940 if (mode == TImode)
16941 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
16942 gen_highpart (DImode, rval),
16943 mem, model_rtx));
16944 else
16945 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16946 }
16947
16948 /* Emit store exclusive. */
16949
16950 static void
16951 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16952 rtx mem, rtx rval, rtx model_rtx)
16953 {
16954 if (mode == TImode)
16955 emit_insn (gen_aarch64_store_exclusive_pair
16956 (bval, mem, operand_subword (rval, 0, 0, TImode),
16957 operand_subword (rval, 1, 0, TImode), model_rtx));
16958 else
16959 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16960 }
16961
16962 /* Mark the previous jump instruction as unlikely. */
16963
16964 static void
16965 aarch64_emit_unlikely_jump (rtx insn)
16966 {
16967 rtx_insn *jump = emit_jump_insn (insn);
16968 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16969 }
16970
16971 /* We store the names of the various atomic helpers in a 5x4 array.
16972 Return the libcall function given MODE, MODEL and NAMES. */
16973
16974 rtx
16975 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
16976 const atomic_ool_names *names)
16977 {
16978 memmodel model = memmodel_base (INTVAL (model_rtx));
16979 int mode_idx, model_idx;
16980
16981 switch (mode)
16982 {
16983 case E_QImode:
16984 mode_idx = 0;
16985 break;
16986 case E_HImode:
16987 mode_idx = 1;
16988 break;
16989 case E_SImode:
16990 mode_idx = 2;
16991 break;
16992 case E_DImode:
16993 mode_idx = 3;
16994 break;
16995 case E_TImode:
16996 mode_idx = 4;
16997 break;
16998 default:
16999 gcc_unreachable ();
17000 }
17001
17002 switch (model)
17003 {
17004 case MEMMODEL_RELAXED:
17005 model_idx = 0;
17006 break;
17007 case MEMMODEL_CONSUME:
17008 case MEMMODEL_ACQUIRE:
17009 model_idx = 1;
17010 break;
17011 case MEMMODEL_RELEASE:
17012 model_idx = 2;
17013 break;
17014 case MEMMODEL_ACQ_REL:
17015 case MEMMODEL_SEQ_CST:
17016 model_idx = 3;
17017 break;
17018 default:
17019 gcc_unreachable ();
17020 }
17021
17022 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
17023 VISIBILITY_HIDDEN);
17024 }
17025
17026 #define DEF0(B, N) \
17027 { "__aarch64_" #B #N "_relax", \
17028 "__aarch64_" #B #N "_acq", \
17029 "__aarch64_" #B #N "_rel", \
17030 "__aarch64_" #B #N "_acq_rel" }
17031
17032 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17033 { NULL, NULL, NULL, NULL }
17034 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17035
17036 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
17037 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
17038 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
17039 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
17040 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
17041 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
17042
17043 #undef DEF0
17044 #undef DEF4
17045 #undef DEF5
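
/* For example, aarch64_atomic_ool_func (SImode, GEN_INT (MEMMODEL_ACQUIRE),
   &aarch64_ool_cas_names) selects "__aarch64_cas4_acq": SImode gives
   mode_idx 2 (the 4-byte entry) and ACQUIRE gives model_idx 1.  */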
17046
17047 /* Expand a compare and swap pattern. */
17048
17049 void
17050 aarch64_expand_compare_and_swap (rtx operands[])
17051 {
17052 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
17053 machine_mode mode, r_mode;
17054
17055 bval = operands[0];
17056 rval = operands[1];
17057 mem = operands[2];
17058 oldval = operands[3];
17059 newval = operands[4];
17060 is_weak = operands[5];
17061 mod_s = operands[6];
17062 mod_f = operands[7];
17063 mode = GET_MODE (mem);
17064
17065 /* Normally the succ memory model must be stronger than fail, but in the
17066 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17067 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17068 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
17069 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
17070 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
17071
17072 r_mode = mode;
17073 if (mode == QImode || mode == HImode)
17074 {
17075 r_mode = SImode;
17076 rval = gen_reg_rtx (r_mode);
17077 }
17078
17079 if (TARGET_LSE)
17080 {
17081 /* The CAS insn requires oldval and rval overlap, but we need to
17082 have a copy of oldval saved across the operation to tell if
17083 the operation is successful. */
17084 if (reg_overlap_mentioned_p (rval, oldval))
17085 rval = copy_to_mode_reg (r_mode, oldval);
17086 else
17087 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
17088
17089 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
17090 newval, mod_s));
17091 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17092 }
17093 else if (TARGET_OUTLINE_ATOMICS)
17094 {
17095 /* Oldval must satisfy compare afterward. */
17096 if (!aarch64_plus_operand (oldval, mode))
17097 oldval = force_reg (mode, oldval);
17098 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
17099 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
17100 oldval, mode, newval, mode,
17101 XEXP (mem, 0), Pmode);
17102 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17103 }
17104 else
17105 {
17106 /* The oldval predicate varies by mode. Test it and force to reg. */
17107 insn_code code = code_for_aarch64_compare_and_swap (mode);
17108 if (!insn_data[code].operand[2].predicate (oldval, mode))
17109 oldval = force_reg (mode, oldval);
17110
17111 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17112 is_weak, mod_s, mod_f));
17113 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17114 }
17115
17116 if (r_mode != mode)
17117 rval = gen_lowpart (mode, rval);
17118 emit_move_insn (operands[1], rval);
17119
17120 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
17121 emit_insn (gen_rtx_SET (bval, x));
17122 }
17123
17124 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17125 sequence implementing an atomic operation. */
17126
17127 static void
17128 aarch64_emit_post_barrier (enum memmodel model)
17129 {
17130 const enum memmodel base_model = memmodel_base (model);
17131
17132 if (is_mm_sync (model)
17133 && (base_model == MEMMODEL_ACQUIRE
17134 || base_model == MEMMODEL_ACQ_REL
17135 || base_model == MEMMODEL_SEQ_CST))
17136 {
17137 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
17138 }
17139 }
17140
17141 /* Split a compare and swap pattern. */
17142
17143 void
17144 aarch64_split_compare_and_swap (rtx operands[])
17145 {
17146 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
17147 machine_mode mode;
17148 bool is_weak;
17149 rtx_code_label *label1, *label2;
17150 enum memmodel model;
17151
17152 rval = operands[0];
17153 mem = operands[1];
17154 oldval = operands[2];
17155 newval = operands[3];
17156 is_weak = (operands[4] != const0_rtx);
17157 model_rtx = operands[5];
17158 scratch = operands[7];
17159 mode = GET_MODE (mem);
17160 model = memmodel_from_int (INTVAL (model_rtx));
17161
17162 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17163 loop:
17164 .label1:
17165 LD[A]XR rval, [mem]
17166 CBNZ rval, .label2
17167 ST[L]XR scratch, newval, [mem]
17168 CBNZ scratch, .label1
17169 .label2:
17170 CMP rval, 0. */
17171 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
17172 && oldval == const0_rtx && mode != TImode);
17173
17174 label1 = NULL;
17175 if (!is_weak)
17176 {
17177 label1 = gen_label_rtx ();
17178 emit_label (label1);
17179 }
17180 label2 = gen_label_rtx ();
17181
17182 /* The initial load can be relaxed for a __sync operation since a final
17183 barrier will be emitted to stop code hoisting. */
17184 if (is_mm_sync (model))
17185 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
17186 else
17187 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
17188
17189 if (strong_zero_p)
17190 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17191 else
17192 {
17193 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17194 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17195 }
17196 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17197 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
17198 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17199
17200 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
17201
17202 if (!is_weak)
17203 {
17204 if (aarch64_track_speculation)
17205 {
17206 /* Emit an explicit compare instruction, so that we can correctly
17207 track the condition codes. */
17208 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17209 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17210 }
17211 else
17212 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
17213
17214 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17215 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
17216 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17217 }
17218 else
17219 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17220
17221 emit_label (label2);
17222
17223 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17224 to set the condition flags. If this is not used it will be removed by
17225 later passes. */
17226 if (strong_zero_p)
17227 aarch64_gen_compare_reg (NE, rval, const0_rtx);
17228
17229 /* Emit any final barrier needed for a __sync operation. */
17230 if (is_mm_sync (model))
17231 aarch64_emit_post_barrier (model);
17232 }
17233
17234 /* Split an atomic operation. */
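/* For a typical operation such as an atomic add this expands to roughly:
     .label:
       ld[a]xr   old, [mem]
       add       new, old, value
       st[l]xr   scratch, new, [mem]
       cbnz      scratch, .label
   followed by any final barrier needed for a __sync operation.  */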
17235
17236 void
17237 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17238 rtx value, rtx model_rtx, rtx cond)
17239 {
17240 machine_mode mode = GET_MODE (mem);
17241 machine_mode wmode = (mode == DImode ? DImode : SImode);
17242 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17243 const bool is_sync = is_mm_sync (model);
17244 rtx_code_label *label;
17245 rtx x;
17246
17247 /* Split the atomic operation into a sequence. */
17248 label = gen_label_rtx ();
17249 emit_label (label);
17250
17251 if (new_out)
17252 new_out = gen_lowpart (wmode, new_out);
17253 if (old_out)
17254 old_out = gen_lowpart (wmode, old_out);
17255 else
17256 old_out = new_out;
17257 value = simplify_gen_subreg (wmode, value, mode, 0);
17258
17259 /* The initial load can be relaxed for a __sync operation since a final
17260 barrier will be emitted to stop code hoisting. */
17261 if (is_sync)
17262 aarch64_emit_load_exclusive (mode, old_out, mem,
17263 GEN_INT (MEMMODEL_RELAXED));
17264 else
17265 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17266
17267 switch (code)
17268 {
17269 case SET:
17270 new_out = value;
17271 break;
17272
17273 case NOT:
17274 x = gen_rtx_AND (wmode, old_out, value);
17275 emit_insn (gen_rtx_SET (new_out, x));
17276 x = gen_rtx_NOT (wmode, new_out);
17277 emit_insn (gen_rtx_SET (new_out, x));
17278 break;
17279
17280 case MINUS:
17281 if (CONST_INT_P (value))
17282 {
17283 value = GEN_INT (-INTVAL (value));
17284 code = PLUS;
17285 }
17286 /* Fall through. */
17287
17288 default:
17289 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17290 emit_insn (gen_rtx_SET (new_out, x));
17291 break;
17292 }
17293
17294 aarch64_emit_store_exclusive (mode, cond, mem,
17295 gen_lowpart (mode, new_out), model_rtx);
17296
17297 if (aarch64_track_speculation)
17298 {
17299 /* Emit an explicit compare instruction, so that we can correctly
17300 track the condition codes. */
17301 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17302 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17303 }
17304 else
17305 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17306
17307 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17308 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17309 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17310
17311 /* Emit any final barrier needed for a __sync operation. */
17312 if (is_sync)
17313 aarch64_emit_post_barrier (model);
17314 }
17315
17316 static void
17317 aarch64_init_libfuncs (void)
17318 {
17319 /* Half-precision float operations. The compiler handles all operations
17320 with NULL libfuncs by converting to SFmode. */
17321
17322 /* Conversions. */
17323 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17324 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17325
17326 /* Arithmetic. */
17327 set_optab_libfunc (add_optab, HFmode, NULL);
17328 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17329 set_optab_libfunc (smul_optab, HFmode, NULL);
17330 set_optab_libfunc (neg_optab, HFmode, NULL);
17331 set_optab_libfunc (sub_optab, HFmode, NULL);
17332
17333 /* Comparisons. */
17334 set_optab_libfunc (eq_optab, HFmode, NULL);
17335 set_optab_libfunc (ne_optab, HFmode, NULL);
17336 set_optab_libfunc (lt_optab, HFmode, NULL);
17337 set_optab_libfunc (le_optab, HFmode, NULL);
17338 set_optab_libfunc (ge_optab, HFmode, NULL);
17339 set_optab_libfunc (gt_optab, HFmode, NULL);
17340 set_optab_libfunc (unord_optab, HFmode, NULL);
17341 }
17342
17343 /* Target hook for c_mode_for_suffix. */
17344 static machine_mode
17345 aarch64_c_mode_for_suffix (char suffix)
17346 {
17347 if (suffix == 'q')
17348 return TFmode;
17349
17350 return VOIDmode;
17351 }
17352
17353 /* We can only represent floating point constants which will fit in
17354 "quarter-precision" values. These values are characterised by
17355 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17356 by:
17357
17358 (-1)^s * (n/16) * 2^r
17359
17360 Where:
17361 's' is the sign bit.
17362 'n' is an integer in the range 16 <= n <= 31.
17363 'r' is an integer in the range -3 <= r <= 4. */
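
/* For example, 1.0 (16/16 * 2^0), 0.125 (16/16 * 2^-3) and 31.0
   (31/16 * 2^4) are all representable, whereas 0.0, 0.1 and 1.0/3.0
   are not.  */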
17364
17365 /* Return true iff X can be represented by a quarter-precision
17366 floating point immediate operand. Note, we cannot represent 0.0. */
17367 bool
17368 aarch64_float_const_representable_p (rtx x)
17369 {
17370 /* This represents our current view of how many bits
17371 make up the mantissa. */
17372 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17373 int exponent;
17374 unsigned HOST_WIDE_INT mantissa, mask;
17375 REAL_VALUE_TYPE r, m;
17376 bool fail;
17377
17378 x = unwrap_const_vec_duplicate (x);
17379 if (!CONST_DOUBLE_P (x))
17380 return false;
17381
17382 if (GET_MODE (x) == VOIDmode
17383 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17384 return false;
17385
17386 r = *CONST_DOUBLE_REAL_VALUE (x);
17387
17388 /* We cannot represent infinities, NaNs or +/-zero. We won't
17389 know if we have +zero until we analyse the mantissa, but we
17390 can reject the other invalid values. */
17391 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17392 || REAL_VALUE_MINUS_ZERO (r))
17393 return false;
17394
17395 /* Extract exponent. */
17396 r = real_value_abs (&r);
17397 exponent = REAL_EXP (&r);
17398
17399 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17400 highest (sign) bit, with a fixed binary point at bit point_pos.
17401 The low element of W holds the low part of the mantissa, the high one the rest.
17402 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17403 bits for the mantissa, this can fail (low bits will be lost). */
17404 real_ldexp (&m, &r, point_pos - exponent);
17405 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17406
17407 /* If the low part of the mantissa has bits set we cannot represent
17408 the value. */
17409 if (w.ulow () != 0)
17410 return false;
17411 /* We have rejected the lower HOST_WIDE_INT, so update our
17412 understanding of how many bits lie in the mantissa and
17413 look only at the high HOST_WIDE_INT. */
17414 mantissa = w.elt (1);
17415 point_pos -= HOST_BITS_PER_WIDE_INT;
17416
17417 /* We can only represent values with a mantissa of the form 1.xxxx. */
17418 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17419 if ((mantissa & mask) != 0)
17420 return false;
17421
17422 /* Having filtered unrepresentable values, we may now remove all
17423 but the highest 5 bits. */
17424 mantissa >>= point_pos - 5;
17425
17426 /* We cannot represent the value 0.0, so reject it. This is handled
17427 elsewhere. */
17428 if (mantissa == 0)
17429 return false;
17430
17431 /* Then, as bit 4 is always set, we can mask it off, leaving
17432 the mantissa in the range [0, 15]. */
17433 mantissa &= ~(1 << 4);
17434 gcc_assert (mantissa <= 15);
17435
17436 /* GCC internally does not use IEEE754-like encoding (where normalized
17437 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17438 Our mantissa values are shifted 4 places to the left relative to
17439 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17440 by 5 places to correct for GCC's representation. */
17441 exponent = 5 - exponent;
17442
17443 return (exponent >= 0 && exponent <= 7);
17444 }
17445
17446 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17447 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17448 output MOVI/MVNI, ORR or BIC immediate. */
17449 char*
17450 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17451 enum simd_immediate_check which)
17452 {
17453 bool is_valid;
17454 static char templ[40];
17455 const char *mnemonic;
17456 const char *shift_op;
17457 unsigned int lane_count = 0;
17458 char element_char;
17459
17460 struct simd_immediate_info info;
17461
17462 /* This will return true to show const_vector is legal for use as either
17463 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17464 It will also update INFO to show how the immediate should be generated.
17465 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17466 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17467 gcc_assert (is_valid);
17468
17469 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17470 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17471
17472 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17473 {
17474 gcc_assert (info.insn == simd_immediate_info::MOV
17475 && info.u.mov.shift == 0);
17476 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17477 move immediate path. */
17478 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17479 info.u.mov.value = GEN_INT (0);
17480 else
17481 {
17482 const unsigned int buf_size = 20;
17483 char float_buf[buf_size] = {'\0'};
17484 real_to_decimal_for_mode (float_buf,
17485 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17486 buf_size, buf_size, 1, info.elt_mode);
17487
17488 if (lane_count == 1)
17489 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17490 else
17491 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17492 lane_count, element_char, float_buf);
17493 return templ;
17494 }
17495 }
17496
17497 gcc_assert (CONST_INT_P (info.u.mov.value));
17498
17499 if (which == AARCH64_CHECK_MOV)
17500 {
17501 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17502 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17503 ? "msl" : "lsl");
17504 if (lane_count == 1)
17505 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17506 mnemonic, UINTVAL (info.u.mov.value));
17507 else if (info.u.mov.shift)
17508 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17509 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17510 element_char, UINTVAL (info.u.mov.value), shift_op,
17511 info.u.mov.shift);
17512 else
17513 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17514 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17515 element_char, UINTVAL (info.u.mov.value));
17516 }
17517 else
17518 {
17519 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17520 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17521 if (info.u.mov.shift)
17522 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17523 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17524 element_char, UINTVAL (info.u.mov.value), "lsl",
17525 info.u.mov.shift);
17526 else
17527 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17528 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17529 element_char, UINTVAL (info.u.mov.value));
17530 }
17531 return templ;
17532 }
17533
17534 char*
17535 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17536 {
17537
17538 /* If a floating point number was passed and we desire to use it in an
17539 integer mode, do the conversion to integer. */
17540 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17541 {
17542 unsigned HOST_WIDE_INT ival;
17543 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17544 gcc_unreachable ();
17545 immediate = gen_int_mode (ival, mode);
17546 }
17547
17548 machine_mode vmode;
17549 /* Use a 64-bit container mode for everything except DI/DF mode, where we
17550 use a 128-bit vector mode. */
17551 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17552
17553 vmode = aarch64_simd_container_mode (mode, width);
17554 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17555 return aarch64_output_simd_mov_immediate (v_op, width);
17556 }
17557
17558 /* Return the output string to use for moving immediate CONST_VECTOR
17559 into an SVE register. */
17560
17561 char *
17562 aarch64_output_sve_mov_immediate (rtx const_vector)
17563 {
17564 static char templ[40];
17565 struct simd_immediate_info info;
17566 char element_char;
17567
17568 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17569 gcc_assert (is_valid);
17570
17571 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17572
17573 machine_mode vec_mode = GET_MODE (const_vector);
17574 if (aarch64_sve_pred_mode_p (vec_mode))
17575 {
17576 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17577 if (info.insn == simd_immediate_info::MOV)
17578 {
17579 gcc_assert (info.u.mov.value == const0_rtx);
17580 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17581 }
17582 else
17583 {
17584 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17585 unsigned int total_bytes;
17586 if (info.u.pattern == AARCH64_SV_ALL
17587 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17588 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17589 total_bytes / GET_MODE_SIZE (info.elt_mode));
17590 else
17591 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17592 svpattern_token (info.u.pattern));
17593 }
17594 return buf;
17595 }
17596
17597 if (info.insn == simd_immediate_info::INDEX)
17598 {
17599 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17600 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17601 element_char, INTVAL (info.u.index.base),
17602 INTVAL (info.u.index.step));
17603 return templ;
17604 }
17605
17606 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17607 {
17608 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17609 info.u.mov.value = GEN_INT (0);
17610 else
17611 {
17612 const int buf_size = 20;
17613 char float_buf[buf_size] = {};
17614 real_to_decimal_for_mode (float_buf,
17615 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17616 buf_size, buf_size, 1, info.elt_mode);
17617
17618 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17619 element_char, float_buf);
17620 return templ;
17621 }
17622 }
17623
17624 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17625 element_char, INTVAL (info.u.mov.value));
17626 return templ;
17627 }
17628
17629 /* Split operands into moves from op[1] + op[2] into op[0]. */
17630
17631 void
17632 aarch64_split_combinev16qi (rtx operands[3])
17633 {
17634 unsigned int dest = REGNO (operands[0]);
17635 unsigned int src1 = REGNO (operands[1]);
17636 unsigned int src2 = REGNO (operands[2]);
17637 machine_mode halfmode = GET_MODE (operands[1]);
17638 unsigned int halfregs = REG_NREGS (operands[1]);
17639 rtx destlo, desthi;
17640
17641 gcc_assert (halfmode == V16QImode);
17642
17643 if (src1 == dest && src2 == dest + halfregs)
17644 {
17645 /* No-op move. Can't split to nothing; emit something. */
17646 emit_note (NOTE_INSN_DELETED);
17647 return;
17648 }
17649
17650 /* Preserve register attributes for variable tracking. */
17651 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17652 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17653 GET_MODE_SIZE (halfmode));
17654
17655 /* Special case of reversed high/low parts. */
17656 if (reg_overlap_mentioned_p (operands[2], destlo)
17657 && reg_overlap_mentioned_p (operands[1], desthi))
17658 {
17659 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17660 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17661 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17662 }
17663 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17664 {
17665 /* Try to avoid unnecessary moves if part of the result
17666 is in the right place already. */
17667 if (src1 != dest)
17668 emit_move_insn (destlo, operands[1]);
17669 if (src2 != dest + halfregs)
17670 emit_move_insn (desthi, operands[2]);
17671 }
17672 else
17673 {
17674 if (src2 != dest + halfregs)
17675 emit_move_insn (desthi, operands[2]);
17676 if (src1 != dest)
17677 emit_move_insn (destlo, operands[1]);
17678 }
17679 }
17680
17681 /* vec_perm support. */
17682
17683 struct expand_vec_perm_d
17684 {
17685 rtx target, op0, op1;
17686 vec_perm_indices perm;
17687 machine_mode vmode;
17688 unsigned int vec_flags;
17689 bool one_vector_p;
17690 bool testing_p;
17691 };
17692
17693 /* Generate a variable permutation. */
17694
17695 static void
17696 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17697 {
17698 machine_mode vmode = GET_MODE (target);
17699 bool one_vector_p = rtx_equal_p (op0, op1);
17700
17701 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17702 gcc_checking_assert (GET_MODE (op0) == vmode);
17703 gcc_checking_assert (GET_MODE (op1) == vmode);
17704 gcc_checking_assert (GET_MODE (sel) == vmode);
17705 gcc_checking_assert (TARGET_SIMD);
17706
17707 if (one_vector_p)
17708 {
17709 if (vmode == V8QImode)
17710 {
17711 /* Expand the argument to a V16QI mode by duplicating it. */
17712 rtx pair = gen_reg_rtx (V16QImode);
17713 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17714 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17715 }
17716 else
17717 {
17718 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17719 }
17720 }
17721 else
17722 {
17723 rtx pair;
17724
17725 if (vmode == V8QImode)
17726 {
17727 pair = gen_reg_rtx (V16QImode);
17728 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17729 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17730 }
17731 else
17732 {
17733 pair = gen_reg_rtx (OImode);
17734 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17735 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17736 }
17737 }
17738 }
17739
17740 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17741 NELT is the number of elements in the vector. */
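/* As an illustration (not taken from the source), with two V16QI inputs the
   selector is first masked with 2 * 16 - 1 = 31, so an out-of-range index
   such as 35 selects element 35 & 31 = 3 of the concatenated inputs,
   matching the modulo semantics that vec_perm requires.  */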
17742
17743 void
17744 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17745 unsigned int nelt)
17746 {
17747 machine_mode vmode = GET_MODE (target);
17748 bool one_vector_p = rtx_equal_p (op0, op1);
17749 rtx mask;
17750
17751 /* The TBL instruction does not use a modulo index, so we must take care
17752 of that ourselves. */
17753 mask = aarch64_simd_gen_const_vector_dup (vmode,
17754 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17755 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17756
17757 /* For big-endian, we also need to reverse the index within the vector
17758 (but not which vector). */
17759 if (BYTES_BIG_ENDIAN)
17760 {
17761 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17762 if (!one_vector_p)
17763 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17764 sel = expand_simple_binop (vmode, XOR, sel, mask,
17765 NULL, 0, OPTAB_LIB_WIDEN);
17766 }
17767 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17768 }
17769
17770 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17771
17772 static void
17773 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17774 {
17775 emit_insn (gen_rtx_SET (target,
17776 gen_rtx_UNSPEC (GET_MODE (target),
17777 gen_rtvec (2, op0, op1), code)));
17778 }
17779
17780 /* Expand an SVE vec_perm with the given operands. */
17781
17782 void
17783 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17784 {
17785 machine_mode data_mode = GET_MODE (target);
17786 machine_mode sel_mode = GET_MODE (sel);
17787 /* Enforced by the pattern condition. */
17788 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17789
17790 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17791 size of the two value vectors, i.e. the upper bits of the indices
17792 are effectively ignored. SVE TBL instead produces 0 for any
17793 out-of-range indices, so we need to modulo all the vec_perm indices
17794 to ensure they are all in range. */
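/* As an illustration, with two 4-element inputs the vec_perm index 9
   should select element 9 % 8 = 1 of the first input, whereas an
   unadjusted SVE TBL would treat 9 as out of range and write 0.  */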
17795 rtx sel_reg = force_reg (sel_mode, sel);
17796
17797 /* Check if SEL only references the first values vector. */
17798 if (GET_CODE (sel) == CONST_VECTOR
17799 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17800 {
17801 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17802 return;
17803 }
17804
17805 /* Check if the two values vectors are the same. */
17806 if (rtx_equal_p (op0, op1))
17807 {
17808 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17809 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17810 NULL, 0, OPTAB_DIRECT);
17811 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17812 return;
17813 }
17814
17815 /* Run TBL on each values vector and combine the results. */
17816
17817 rtx res0 = gen_reg_rtx (data_mode);
17818 rtx res1 = gen_reg_rtx (data_mode);
17819 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17820 if (GET_CODE (sel) != CONST_VECTOR
17821 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17822 {
17823 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17824 2 * nunits - 1);
17825 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17826 NULL, 0, OPTAB_DIRECT);
17827 }
17828 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17829 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17830 NULL, 0, OPTAB_DIRECT);
17831 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17832 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17833 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17834 else
17835 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17836 }
17837
17838 /* Recognize patterns suitable for the TRN instructions. */
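/* As an illustration, for two 4-element inputs {a0,a1,a2,a3} and
   {b0,b1,b2,b3}, TRN1 corresponds to the permutation {0, 4, 2, 6}
   (giving {a0,b0,a2,b2}) and TRN2 to {1, 5, 3, 7}.  */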
17839 static bool
17840 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17841 {
17842 HOST_WIDE_INT odd;
17843 poly_uint64 nelt = d->perm.length ();
17844 rtx out, in0, in1, x;
17845 machine_mode vmode = d->vmode;
17846
17847 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17848 return false;
17849
17850 /* Note that these are little-endian tests.
17851 We correct for big-endian later. */
17852 if (!d->perm[0].is_constant (&odd)
17853 || (odd != 0 && odd != 1)
17854 || !d->perm.series_p (0, 2, odd, 2)
17855 || !d->perm.series_p (1, 2, nelt + odd, 2))
17856 return false;
17857
17858 /* Success! */
17859 if (d->testing_p)
17860 return true;
17861
17862 in0 = d->op0;
17863 in1 = d->op1;
17864 /* We don't need a big-endian lane correction for SVE; see the comment
17865 at the head of aarch64-sve.md for details. */
17866 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17867 {
17868 x = in0, in0 = in1, in1 = x;
17869 odd = !odd;
17870 }
17871 out = d->target;
17872
17873 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17874 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17875 return true;
17876 }
17877
17878 /* Recognize patterns suitable for the UZP instructions. */
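/* As an illustration, for two 4-element inputs UZP1 corresponds to the
   permutation {0, 2, 4, 6} (the even-indexed elements of the concatenated
   inputs) and UZP2 to {1, 3, 5, 7}.  */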
17879 static bool
17880 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17881 {
17882 HOST_WIDE_INT odd;
17883 rtx out, in0, in1, x;
17884 machine_mode vmode = d->vmode;
17885
17886 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17887 return false;
17888
17889 /* Note that these are little-endian tests.
17890 We correct for big-endian later. */
17891 if (!d->perm[0].is_constant (&odd)
17892 || (odd != 0 && odd != 1)
17893 || !d->perm.series_p (0, 1, odd, 2))
17894 return false;
17895
17896 /* Success! */
17897 if (d->testing_p)
17898 return true;
17899
17900 in0 = d->op0;
17901 in1 = d->op1;
17902 /* We don't need a big-endian lane correction for SVE; see the comment
17903 at the head of aarch64-sve.md for details. */
17904 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17905 {
17906 x = in0, in0 = in1, in1 = x;
17907 odd = !odd;
17908 }
17909 out = d->target;
17910
17911 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17912 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17913 return true;
17914 }
17915
17916 /* Recognize patterns suitable for the ZIP instructions. */
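/* As an illustration, for two 4-element inputs ZIP1 corresponds to the
   permutation {0, 4, 1, 5} (interleaving the low halves) and ZIP2 to
   {2, 6, 3, 7} (interleaving the high halves).  */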
17917 static bool
17918 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17919 {
17920 unsigned int high;
17921 poly_uint64 nelt = d->perm.length ();
17922 rtx out, in0, in1, x;
17923 machine_mode vmode = d->vmode;
17924
17925 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17926 return false;
17927
17928 /* Note that these are little-endian tests.
17929 We correct for big-endian later. */
17930 poly_uint64 first = d->perm[0];
17931 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17932 || !d->perm.series_p (0, 2, first, 1)
17933 || !d->perm.series_p (1, 2, first + nelt, 1))
17934 return false;
17935 high = maybe_ne (first, 0U);
17936
17937 /* Success! */
17938 if (d->testing_p)
17939 return true;
17940
17941 in0 = d->op0;
17942 in1 = d->op1;
17943 /* We don't need a big-endian lane correction for SVE; see the comment
17944 at the head of aarch64-sve.md for details. */
17945 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17946 {
17947 x = in0, in0 = in1, in1 = x;
17948 high = !high;
17949 }
17950 out = d->target;
17951
17952 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17953 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17954 return true;
17955 }
17956
17957 /* Recognize patterns for the EXT insn. */
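/* As an illustration, for 4-element inputs the permutation {1, 2, 3, 4}
   takes the last three elements of the first vector followed by the first
   element of the second, which EXT implements with an element offset
   of 1.  */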
17958
17959 static bool
17960 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17961 {
17962 HOST_WIDE_INT location;
17963 rtx offset;
17964
17965 /* The first element always refers to the first vector.
17966 Check if the extracted indices are increasing by one. */
17967 if (d->vec_flags == VEC_SVE_PRED
17968 || !d->perm[0].is_constant (&location)
17969 || !d->perm.series_p (0, 1, location, 1))
17970 return false;
17971
17972 /* Success! */
17973 if (d->testing_p)
17974 return true;
17975
17976 /* The case where (location == 0) is a no-op for both big- and little-endian,
17977 and is removed by the mid-end at optimization levels -O1 and higher.
17978
17979 We don't need a big-endian lane correction for SVE; see the comment
17980 at the head of aarch64-sve.md for details. */
17981 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17982 {
17983 /* After setup, we want the high elements of the first vector (stored
17984 at the LSB end of the register), and the low elements of the second
17985 vector (stored at the MSB end of the register). So swap. */
17986 std::swap (d->op0, d->op1);
17987 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17988 to_constant () is safe since this is restricted to Advanced SIMD
17989 vectors. */
17990 location = d->perm.length ().to_constant () - location;
17991 }
17992
17993 offset = GEN_INT (location);
17994 emit_set_insn (d->target,
17995 gen_rtx_UNSPEC (d->vmode,
17996 gen_rtvec (3, d->op0, d->op1, offset),
17997 UNSPEC_EXT));
17998 return true;
17999 }
18000
18001 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
18002 within each 64-bit, 32-bit or 16-bit granule. */
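/* As an illustration, a V8HI permutation of {1,0, 3,2, 5,4, 7,6} swaps the
   two 16-bit elements within each 32-bit granule and so maps to REV32.  */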
18003
18004 static bool
18005 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
18006 {
18007 HOST_WIDE_INT diff;
18008 unsigned int i, size, unspec;
18009 machine_mode pred_mode;
18010
18011 if (d->vec_flags == VEC_SVE_PRED
18012 || !d->one_vector_p
18013 || !d->perm[0].is_constant (&diff))
18014 return false;
18015
18016 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
18017 if (size == 8)
18018 {
18019 unspec = UNSPEC_REV64;
18020 pred_mode = VNx2BImode;
18021 }
18022 else if (size == 4)
18023 {
18024 unspec = UNSPEC_REV32;
18025 pred_mode = VNx4BImode;
18026 }
18027 else if (size == 2)
18028 {
18029 unspec = UNSPEC_REV16;
18030 pred_mode = VNx8BImode;
18031 }
18032 else
18033 return false;
18034
18035 unsigned int step = diff + 1;
18036 for (i = 0; i < step; ++i)
18037 if (!d->perm.series_p (i, step, diff - i, step))
18038 return false;
18039
18040 /* Success! */
18041 if (d->testing_p)
18042 return true;
18043
18044 if (d->vec_flags == VEC_SVE_DATA)
18045 {
18046 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
18047 rtx target = gen_reg_rtx (int_mode);
18048 if (BYTES_BIG_ENDIAN)
18049 /* The act of taking a subreg between INT_MODE and d->vmode
18050 is itself a reversing operation on big-endian targets;
18051 see the comment at the head of aarch64-sve.md for details.
18052 First reinterpret OP0 as INT_MODE without using a subreg
18053 and without changing the contents. */
18054 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
18055 else
18056 {
18057 /* For SVE we use REV[BHW] unspecs derived from the element size
18058 of d->vmode and vector modes whose elements have SIZE bytes.
18059 This ensures that the vector modes match the predicate modes. */
18060 int unspec = aarch64_sve_rev_unspec (d->vmode);
18061 rtx pred = aarch64_ptrue_reg (pred_mode);
18062 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
18063 gen_lowpart (int_mode, d->op0)));
18064 }
18065 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18066 return true;
18067 }
18068 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
18069 emit_set_insn (d->target, src);
18070 return true;
18071 }
18072
18073 /* Recognize patterns for the REV insn, which reverses elements within
18074 a full vector. */
18075
18076 static bool
18077 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
18078 {
18079 poly_uint64 nelt = d->perm.length ();
18080
18081 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
18082 return false;
18083
18084 if (!d->perm.series_p (0, 1, nelt - 1, -1))
18085 return false;
18086
18087 /* Success! */
18088 if (d->testing_p)
18089 return true;
18090
18091 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
18092 emit_set_insn (d->target, src);
18093 return true;
18094 }
18095
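/* Recognize permutations that broadcast a single element and so can be
   implemented as a DUP (lane) instruction, e.g. {2, 2, 2, 2} for a
   4-element vector.  */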
18096 static bool
18097 aarch64_evpc_dup (struct expand_vec_perm_d *d)
18098 {
18099 rtx out = d->target;
18100 rtx in0;
18101 HOST_WIDE_INT elt;
18102 machine_mode vmode = d->vmode;
18103 rtx lane;
18104
18105 if (d->vec_flags == VEC_SVE_PRED
18106 || d->perm.encoding ().encoded_nelts () != 1
18107 || !d->perm[0].is_constant (&elt))
18108 return false;
18109
18110 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
18111 return false;
18112
18113 /* Success! */
18114 if (d->testing_p)
18115 return true;
18116
18117 /* The generic preparation in aarch64_expand_vec_perm_const_1
18118 swaps the operand order and the permute indices if it finds
18119 d->perm[0] to be in the second operand. Thus, we can always
18120 use d->op0 and need not do any extra arithmetic to get the
18121 correct lane number. */
18122 in0 = d->op0;
18123 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
18124
18125 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
18126 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
18127 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
18128 return true;
18129 }
18130
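/* Try to implement D using an Advanced SIMD TBL instruction with a
   constant selector.  */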
18131 static bool
18132 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
18133 {
18134 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
18135 machine_mode vmode = d->vmode;
18136
18137 /* Make sure that the indices are constant. */
18138 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
18139 for (unsigned int i = 0; i < encoded_nelts; ++i)
18140 if (!d->perm[i].is_constant ())
18141 return false;
18142
18143 if (d->testing_p)
18144 return true;
18145
18146 /* Generic code will try constant permutation twice: once with the
18147 original mode and again with the elements lowered to QImode.
18148 So wait and don't do the selector expansion ourselves. */
18149 if (vmode != V8QImode && vmode != V16QImode)
18150 return false;
18151
18152 /* to_constant is safe since this routine is specific to Advanced SIMD
18153 vectors. */
18154 unsigned int nelt = d->perm.length ().to_constant ();
18155 for (unsigned int i = 0; i < nelt; ++i)
18156 /* If big-endian and two vectors, we end up with a weird mixed-endian
18157 mode on NEON. Reverse the index within each word but not the word
18158 itself. to_constant is safe because we checked is_constant above. */
18159 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
18160 ? d->perm[i].to_constant () ^ (nelt - 1)
18161 : d->perm[i].to_constant ());
18162
18163 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18164 sel = force_reg (vmode, sel);
18165
18166 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
18167 return true;
18168 }
18169
18170 /* Try to implement D using an SVE TBL instruction. */
18171
18172 static bool
18173 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
18174 {
18175 unsigned HOST_WIDE_INT nelt;
18176
18177 /* Permuting two variable-length vectors could overflow the
18178 index range. */
18179 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
18180 return false;
18181
18182 if (d->testing_p)
18183 return true;
18184
18185 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
18186 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
18187 if (d->one_vector_p)
18188 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
18189 else
18190 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
18191 return true;
18192 }
18193
18194 /* Try to implement D using the SVE SEL instruction. */
18195
18196 static bool
18197 aarch64_evpc_sel (struct expand_vec_perm_d *d)
18198 {
18199 machine_mode vmode = d->vmode;
18200 int unit_size = GET_MODE_UNIT_SIZE (vmode);
18201
18202 if (d->vec_flags != VEC_SVE_DATA
18203 || unit_size > 8)
18204 return false;
18205
18206 int n_patterns = d->perm.encoding ().npatterns ();
18207 poly_int64 vec_len = d->perm.length ();
18208
18209 for (int i = 0; i < n_patterns; ++i)
18210 if (!known_eq (d->perm[i], i)
18211 && !known_eq (d->perm[i], vec_len + i))
18212 return false;
18213
18214 for (int i = n_patterns; i < n_patterns * 2; i++)
18215 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
18216 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
18217 return false;
18218
18219 if (d->testing_p)
18220 return true;
18221
18222 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
18223
18224 rtx_vector_builder builder (pred_mode, n_patterns, 2);
18225 for (int i = 0; i < n_patterns * 2; i++)
18226 {
18227 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
18228 : CONST0_RTX (BImode);
18229 builder.quick_push (elem);
18230 }
18231
18232 rtx const_vec = builder.build ();
18233 rtx pred = force_reg (pred_mode, const_vec);
18234 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
18235 return true;
18236 }
18237
18238 static bool
18239 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18240 {
18241 /* The pattern matching functions above are written to look for a small
18242 number to begin the sequence (0, 1, N/2). If we begin with an index
18243 from the second operand, we can swap the operands. */
18244 poly_int64 nelt = d->perm.length ();
18245 if (known_ge (d->perm[0], nelt))
18246 {
18247 d->perm.rotate_inputs (1);
18248 std::swap (d->op0, d->op1);
18249 }
18250
18251 if ((d->vec_flags == VEC_ADVSIMD
18252 || d->vec_flags == VEC_SVE_DATA
18253 || d->vec_flags == VEC_SVE_PRED)
18254 && known_gt (nelt, 1))
18255 {
18256 if (aarch64_evpc_rev_local (d))
18257 return true;
18258 else if (aarch64_evpc_rev_global (d))
18259 return true;
18260 else if (aarch64_evpc_ext (d))
18261 return true;
18262 else if (aarch64_evpc_dup (d))
18263 return true;
18264 else if (aarch64_evpc_zip (d))
18265 return true;
18266 else if (aarch64_evpc_uzp (d))
18267 return true;
18268 else if (aarch64_evpc_trn (d))
18269 return true;
18270 else if (aarch64_evpc_sel (d))
18271 return true;
18272 if (d->vec_flags == VEC_SVE_DATA)
18273 return aarch64_evpc_sve_tbl (d);
18274 else if (d->vec_flags == VEC_ADVSIMD)
18275 return aarch64_evpc_tbl (d);
18276 }
18277 return false;
18278 }
18279
18280 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18281
18282 static bool
18283 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18284 rtx op1, const vec_perm_indices &sel)
18285 {
18286 struct expand_vec_perm_d d;
18287
18288 /* Check whether the mask can be applied to a single vector. */
18289 if (sel.ninputs () == 1
18290 || (op0 && rtx_equal_p (op0, op1)))
18291 d.one_vector_p = true;
18292 else if (sel.all_from_input_p (0))
18293 {
18294 d.one_vector_p = true;
18295 op1 = op0;
18296 }
18297 else if (sel.all_from_input_p (1))
18298 {
18299 d.one_vector_p = true;
18300 op0 = op1;
18301 }
18302 else
18303 d.one_vector_p = false;
18304
18305 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18306 sel.nelts_per_input ());
18307 d.vmode = vmode;
18308 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18309 d.target = target;
18310 d.op0 = op0;
18311 d.op1 = op1;
18312 d.testing_p = !target;
18313
18314 if (!d.testing_p)
18315 return aarch64_expand_vec_perm_const_1 (&d);
18316
18317 rtx_insn *last = get_last_insn ();
18318 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18319 gcc_assert (last == get_last_insn ());
18320
18321 return ret;
18322 }
18323
18324 /* Generate a byte permute mask for a register of mode MODE,
18325 which has NUNITS units. */
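/* For example (illustrative), for V8HImode the generated byte mask is
   { 1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14 }, i.e. the bytes of each
   16-bit unit are swapped while the units themselves stay in place.  */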
18326
18327 rtx
18328 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18329 {
18330 /* We have to reverse each vector because we don't have
18331 a permuted load that can reverse-load according to ABI rules. */
18332 rtx mask;
18333 rtvec v = rtvec_alloc (16);
18334 unsigned int i, j;
18335 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18336
18337 gcc_assert (BYTES_BIG_ENDIAN);
18338 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18339
18340 for (i = 0; i < nunits; i++)
18341 for (j = 0; j < usize; j++)
18342 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18343 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18344 return force_reg (V16QImode, mask);
18345 }
18346
18347 /* Expand an SVE integer comparison using the SVE equivalent of:
18348
18349 (set TARGET (CODE OP0 OP1)). */
18350
18351 void
18352 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18353 {
18354 machine_mode pred_mode = GET_MODE (target);
18355 machine_mode data_mode = GET_MODE (op0);
18356 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18357 op0, op1);
18358 if (!rtx_equal_p (target, res))
18359 emit_move_insn (target, res);
18360 }
18361
18362 /* Return the UNSPEC_COND_* code for comparison CODE. */
18363
18364 static unsigned int
18365 aarch64_unspec_cond_code (rtx_code code)
18366 {
18367 switch (code)
18368 {
18369 case NE:
18370 return UNSPEC_COND_FCMNE;
18371 case EQ:
18372 return UNSPEC_COND_FCMEQ;
18373 case LT:
18374 return UNSPEC_COND_FCMLT;
18375 case GT:
18376 return UNSPEC_COND_FCMGT;
18377 case LE:
18378 return UNSPEC_COND_FCMLE;
18379 case GE:
18380 return UNSPEC_COND_FCMGE;
18381 case UNORDERED:
18382 return UNSPEC_COND_FCMUO;
18383 default:
18384 gcc_unreachable ();
18385 }
18386 }
18387
18388 /* Emit:
18389
18390 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18391
18392 where <X> is the operation associated with comparison CODE.
18393 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18394
18395 static void
18396 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18397 bool known_ptrue_p, rtx op0, rtx op1)
18398 {
18399 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18400 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18401 gen_rtvec (4, pred, flag, op0, op1),
18402 aarch64_unspec_cond_code (code));
18403 emit_set_insn (target, unspec);
18404 }
18405
18406 /* Emit the SVE equivalent of:
18407
18408 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18409 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18410 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18411
18412 where <Xi> is the operation associated with comparison CODEi.
18413 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18414
18415 static void
18416 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18417 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18418 {
18419 machine_mode pred_mode = GET_MODE (pred);
18420 rtx tmp1 = gen_reg_rtx (pred_mode);
18421 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18422 rtx tmp2 = gen_reg_rtx (pred_mode);
18423 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18424 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18425 }
18426
18427 /* Emit the SVE equivalent of:
18428
18429 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18430 (set TARGET (not TMP))
18431
18432 where <X> is the operation associated with comparison CODE.
18433 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18434
18435 static void
18436 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18437 bool known_ptrue_p, rtx op0, rtx op1)
18438 {
18439 machine_mode pred_mode = GET_MODE (pred);
18440 rtx tmp = gen_reg_rtx (pred_mode);
18441 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18442 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18443 }
18444
18445 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18446
18447 (set TARGET (CODE OP0 OP1))
18448
18449 If CAN_INVERT_P is true, the caller can also handle inverted results;
18450 return true if the result is in fact inverted. */
18451
18452 bool
18453 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18454 rtx op0, rtx op1, bool can_invert_p)
18455 {
18456 machine_mode pred_mode = GET_MODE (target);
18457 machine_mode data_mode = GET_MODE (op0);
18458
18459 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18460 switch (code)
18461 {
18462 case UNORDERED:
18463 /* UNORDERED has no immediate form. */
18464 op1 = force_reg (data_mode, op1);
18465 /* fall through */
18466 case LT:
18467 case LE:
18468 case GT:
18469 case GE:
18470 case EQ:
18471 case NE:
18472 {
18473 /* There is native support for the comparison. */
18474 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18475 return false;
18476 }
18477
18478 case LTGT:
18479 /* This is a trapping operation (LT or GT). */
18480 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18481 return false;
18482
18483 case UNEQ:
18484 if (!flag_trapping_math)
18485 {
18486 /* This would trap for signaling NaNs. */
18487 op1 = force_reg (data_mode, op1);
18488 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18489 ptrue, true, op0, op1);
18490 return false;
18491 }
18492 /* fall through */
18493 case UNLT:
18494 case UNLE:
18495 case UNGT:
18496 case UNGE:
18497 if (flag_trapping_math)
18498 {
18499 /* Work out which elements are ordered. */
18500 rtx ordered = gen_reg_rtx (pred_mode);
18501 op1 = force_reg (data_mode, op1);
18502 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18503 ptrue, true, op0, op1);
18504
18505 /* Test the opposite condition for the ordered elements,
18506 then invert the result. */
18507 if (code == UNEQ)
18508 code = NE;
18509 else
18510 code = reverse_condition_maybe_unordered (code);
18511 if (can_invert_p)
18512 {
18513 aarch64_emit_sve_fp_cond (target, code,
18514 ordered, false, op0, op1);
18515 return true;
18516 }
18517 aarch64_emit_sve_invert_fp_cond (target, code,
18518 ordered, false, op0, op1);
18519 return false;
18520 }
18521 break;
18522
18523 case ORDERED:
18524 /* ORDERED has no immediate form. */
18525 op1 = force_reg (data_mode, op1);
18526 break;
18527
18528 default:
18529 gcc_unreachable ();
18530 }
18531
18532 /* There is native support for the inverse comparison. */
18533 code = reverse_condition_maybe_unordered (code);
18534 if (can_invert_p)
18535 {
18536 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18537 return true;
18538 }
18539 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18540 return false;
18541 }
18542
18543 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18544 of the data being selected and CMP_MODE is the mode of the values being
18545 compared. */
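/* OPS follows the vcond pattern: ops[0] is the destination, ops[1] and
   ops[2] are the values selected for "true" and "false" elements, and
   ops[3] is the comparison applied to ops[4] and ops[5].  */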
18546
18547 void
18548 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18549 rtx *ops)
18550 {
18551 machine_mode pred_mode
18552 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18553 GET_MODE_SIZE (cmp_mode)).require ();
18554 rtx pred = gen_reg_rtx (pred_mode);
18555 if (FLOAT_MODE_P (cmp_mode))
18556 {
18557 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18558 ops[4], ops[5], true))
18559 std::swap (ops[1], ops[2]);
18560 }
18561 else
18562 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18563
18564 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18565 ops[1] = force_reg (data_mode, ops[1]);
18566 /* The "false" value can only be zero if the "true" value is a constant. */
18567 if (register_operand (ops[1], data_mode)
18568 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18569 ops[2] = force_reg (data_mode, ops[2]);
18570
18571 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18572 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18573 }
18574
18575 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18576 true. However, due to issues with register allocation, it is preferable
18577 to avoid tying integer scalar and FP scalar modes. Executing integer
18578 operations in general registers is better than treating them as scalar
18579 vector operations. This reduces latency and avoids redundant int<->FP
18580 moves. So tie modes if they are either the same class, or vector modes
18581 with other vector modes, vector structs or any scalar mode. */
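/* For instance (illustrative), SImode and SFmode are deliberately not tied,
   while V4SImode ties with V2DFmode (both vector data modes) and DImode
   ties with V2DImode (scalar with vector).  */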
18582
18583 static bool
18584 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18585 {
18586 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18587 return true;
18588
18589 /* We specifically want to allow elements of "structure" modes to
18590 be tieable to the structure. This more general condition allows
18591 other rarer situations too. The reason we don't extend this to
18592 predicate modes is that there are no predicate structure modes
18593 nor any specific instructions for extracting part of a predicate
18594 register. */
18595 if (aarch64_vector_data_mode_p (mode1)
18596 && aarch64_vector_data_mode_p (mode2))
18597 return true;
18598
18599 /* Also allow any scalar modes with vectors. */
18600 if (aarch64_vector_mode_supported_p (mode1)
18601 || aarch64_vector_mode_supported_p (mode2))
18602 return true;
18603
18604 return false;
18605 }
18606
18607 /* Return a new RTX holding the result of moving POINTER forward by
18608 AMOUNT bytes. */
18609
18610 static rtx
18611 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18612 {
18613 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18614
18615 return adjust_automodify_address (pointer, GET_MODE (pointer),
18616 next, amount);
18617 }
18618
18619 /* Return a new RTX holding the result of moving POINTER forward by the
18620 size of the mode it points to. */
18621
18622 static rtx
18623 aarch64_progress_pointer (rtx pointer)
18624 {
18625 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18626 }
18627
18628 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST by
18629 the size of MODE. */
18630
18631 static void
18632 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18633 machine_mode mode)
18634 {
18635 rtx reg = gen_reg_rtx (mode);
18636
18637 /* "Cast" the pointers to the correct mode. */
18638 *src = adjust_address (*src, mode, 0);
18639 *dst = adjust_address (*dst, mode, 0);
18640 /* Emit the memcpy. */
18641 emit_move_insn (reg, *src);
18642 emit_move_insn (*dst, reg);
18643 /* Move the pointers forward. */
18644 *src = aarch64_progress_pointer (*src);
18645 *dst = aarch64_progress_pointer (*dst);
18646 }
18647
18648 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18649 we succeed, otherwise return false. */
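/* For example (illustrative), a constant 15-byte copy is expanded as an
   8-byte load/store at offset 0 followed by an overlapping 8-byte
   load/store at offset 7, rather than 8 + 4 + 2 + 1 byte moves.  */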
18650
18651 bool
18652 aarch64_expand_cpymem (rtx *operands)
18653 {
18654 int n, mode_bits;
18655 rtx dst = operands[0];
18656 rtx src = operands[1];
18657 rtx base;
18658 machine_mode cur_mode = BLKmode, next_mode;
18659 bool speed_p = !optimize_function_for_size_p (cfun);
18660
18661 /* When optimizing for size, give a better estimate of the length of a
18662 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18663 will always require an even number of instructions. And since each
18664 operation requires both a load and a store, divide the maximum number by 2. */
18665 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18666
18667 /* We can't do anything smart if the amount to copy is not constant. */
18668 if (!CONST_INT_P (operands[2]))
18669 return false;
18670
18671 n = INTVAL (operands[2]);
18672
18673 /* Try to keep the number of instructions low. For all cases we will do at
18674 most two moves for the residual amount, since we'll always overlap the
18675 remainder. */
18676 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18677 return false;
18678
18679 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18680 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18681
18682 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18683 src = adjust_automodify_address (src, VOIDmode, base, 0);
18684
18685 /* Convert n to bits to make the rest of the code simpler. */
18686 n = n * BITS_PER_UNIT;
18687
18688 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18689 larger than TImode, but we should not use them for loads/stores here. */
18690 const int copy_limit = GET_MODE_BITSIZE (TImode);
18691
18692 while (n > 0)
18693 {
18694 /* Find the largest mode in which to do the copy without over-reading
18695 or over-writing. */
18696 opt_scalar_int_mode mode_iter;
18697 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18698 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18699 cur_mode = mode_iter.require ();
18700
18701 gcc_assert (cur_mode != BLKmode);
18702
18703 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18704 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18705
18706 n -= mode_bits;
18707
18708 /* Do certain trailing copies as overlapping if that is going to be
18709 cheaper, i.e. if it takes fewer instructions. For instance, for a 15-byte
18710 copy it is more efficient to do two overlapping 8-byte copies than
18711 8 + 6 + 1. */
18712 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18713 {
18714 next_mode = smallest_mode_for_size (n, MODE_INT);
18715 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18716 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18717 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18718 n = n_bits;
18719 }
18720 }
18721
18722 return true;
18723 }
18724
18725 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18726 SImode stores. Handle the case when the constant has identical
18727 bottom and top halves. This is beneficial when the two stores can be
18728 merged into an STP and we avoid synthesising potentially expensive
18729 immediates twice. Return true if such a split is possible. */
18730
18731 bool
18732 aarch64_split_dimode_const_store (rtx dst, rtx src)
18733 {
18734 rtx lo = gen_lowpart (SImode, src);
18735 rtx hi = gen_highpart_mode (SImode, DImode, src);
18736
18737 bool size_p = optimize_function_for_size_p (cfun);
18738
18739 if (!rtx_equal_p (lo, hi))
18740 return false;
18741
18742 unsigned int orig_cost
18743 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18744 unsigned int lo_cost
18745 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18746
18747 /* We want to transform:
18748 MOV x1, 49370
18749 MOVK x1, 0x140, lsl 16
18750 MOVK x1, 0xc0da, lsl 32
18751 MOVK x1, 0x140, lsl 48
18752 STR x1, [x0]
18753 into:
18754 MOV w1, 49370
18755 MOVK w1, 0x140, lsl 16
18756 STP w1, w1, [x0]
18757 So we want to perform this only when we save two instructions
18758 or more. When optimizing for size, however, accept any code size
18759 savings we can. */
18760 if (size_p && orig_cost <= lo_cost)
18761 return false;
18762
18763 if (!size_p
18764 && (orig_cost <= lo_cost + 1))
18765 return false;
18766
18767 rtx mem_lo = adjust_address (dst, SImode, 0);
18768 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18769 return false;
18770
18771 rtx tmp_reg = gen_reg_rtx (SImode);
18772 aarch64_expand_mov_immediate (tmp_reg, lo);
18773 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18774 /* Don't emit an explicit store pair as this may not be always profitable.
18775 Let the sched-fusion logic decide whether to merge them. */
18776 emit_move_insn (mem_lo, tmp_reg);
18777 emit_move_insn (mem_hi, tmp_reg);
18778
18779 return true;
18780 }
18781
18782 /* Generate RTL for a conditional branch with rtx comparison CODE in
18783 mode CC_MODE. The destination of the unlikely conditional branch
18784 is LABEL_REF. */
18785
18786 void
18787 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18788 rtx label_ref)
18789 {
18790 rtx x;
18791 x = gen_rtx_fmt_ee (code, VOIDmode,
18792 gen_rtx_REG (cc_mode, CC_REGNUM),
18793 const0_rtx);
18794
18795 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18796 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18797 pc_rtx);
18798 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18799 }
18800
18801 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18802
18803 OP1 represents the TImode source operand 1
18804 OP2 represents the TImode source operand 2
18805 LOW_DEST represents the low half (DImode) of TImode operand 0
18806 LOW_IN1 represents the low half (DImode) of TImode operand 1
18807 LOW_IN2 represents the low half (DImode) of TImode operand 2
18808 HIGH_DEST represents the high half (DImode) of TImode operand 0
18809 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18810 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18811
18812 void
18813 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18814 rtx *low_in1, rtx *low_in2,
18815 rtx *high_dest, rtx *high_in1,
18816 rtx *high_in2)
18817 {
18818 *low_dest = gen_reg_rtx (DImode);
18819 *low_in1 = gen_lowpart (DImode, op1);
18820 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18821 subreg_lowpart_offset (DImode, TImode));
18822 *high_dest = gen_reg_rtx (DImode);
18823 *high_in1 = gen_highpart (DImode, op1);
18824 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18825 subreg_highpart_offset (DImode, TImode));
18826 }
18827
18828 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18829
18830 This function differs from 'aarch64_addti_scratch_regs' in that
18831 OP1 can be an immediate constant (zero). We must call
18832 subreg_highpart_offset with DImode and TImode arguments, otherwise
18833 VOIDmode will be used for the const_int which generates an internal
18834 error from subreg_size_highpart_offset which does not expect a size of zero.
18835
18836 OP1 represents the TImode source operand 1
18837 OP2 represents the TImode source operand 2
18838 LOW_DEST represents the low half (DImode) of TImode operand 0
18839 LOW_IN1 represents the low half (DImode) of TImode operand 1
18840 LOW_IN2 represents the low half (DImode) of TImode operand 2
18841 HIGH_DEST represents the high half (DImode) of TImode operand 0
18842 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18843 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18844
18845
18846 void
18847 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18848 rtx *low_in1, rtx *low_in2,
18849 rtx *high_dest, rtx *high_in1,
18850 rtx *high_in2)
18851 {
18852 *low_dest = gen_reg_rtx (DImode);
18853 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18854 subreg_lowpart_offset (DImode, TImode));
18855
18856 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18857 subreg_lowpart_offset (DImode, TImode));
18858 *high_dest = gen_reg_rtx (DImode);
18859
18860 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18861 subreg_highpart_offset (DImode, TImode));
18862 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18863 subreg_highpart_offset (DImode, TImode));
18864 }
18865
18866 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18867
18868 OP0 represents the TImode destination operand 0
18869 LOW_DEST represents the low half (DImode) of TImode operand 0
18870 LOW_IN1 represents the low half (DImode) of TImode operand 1
18871 LOW_IN2 represents the low half (DImode) of TImode operand 2
18872 HIGH_DEST represents the high half (DImode) of TImode operand 0
18873 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18874 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18875 UNSIGNED_P is true if the operation is being performed on unsigned
18876 values. */
18877 void
18878 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18879 rtx low_in2, rtx high_dest, rtx high_in1,
18880 rtx high_in2, bool unsigned_p)
18881 {
18882 if (low_in2 == const0_rtx)
18883 {
18884 low_dest = low_in1;
18885 high_in2 = force_reg (DImode, high_in2);
18886 if (unsigned_p)
18887 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18888 else
18889 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18890 }
18891 else
18892 {
18893 if (CONST_INT_P (low_in2))
18894 {
18895 high_in2 = force_reg (DImode, high_in2);
18896 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18897 GEN_INT (-INTVAL (low_in2))));
18898 }
18899 else
18900 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18901
18902 if (unsigned_p)
18903 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18904 else
18905 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18906 }
18907
18908 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18909 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18910
18911 }
18912
18913 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
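/* AddressSanitizer computes the shadow address of a byte roughly as
   (address >> 3) + this offset, so the value chosen below must keep the
   shadow region disjoint from the application's address space.  */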
18914
18915 static unsigned HOST_WIDE_INT
18916 aarch64_asan_shadow_offset (void)
18917 {
18918 if (TARGET_ILP32)
18919 return (HOST_WIDE_INT_1 << 29);
18920 else
18921 return (HOST_WIDE_INT_1 << 36);
18922 }
18923
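/* Implement TARGET_GEN_CCMP_FIRST.  Generate the first compare of a
   conditional-compare sequence, e.g. the initial "cmp" in a
   "cmp; ccmp; b.<cond>" chain (illustrative).  */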
18924 static rtx
18925 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18926 int code, tree treeop0, tree treeop1)
18927 {
18928 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18929 rtx op0, op1;
18930 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18931 insn_code icode;
18932 struct expand_operand ops[4];
18933
18934 start_sequence ();
18935 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18936
18937 op_mode = GET_MODE (op0);
18938 if (op_mode == VOIDmode)
18939 op_mode = GET_MODE (op1);
18940
18941 switch (op_mode)
18942 {
18943 case E_QImode:
18944 case E_HImode:
18945 case E_SImode:
18946 cmp_mode = SImode;
18947 icode = CODE_FOR_cmpsi;
18948 break;
18949
18950 case E_DImode:
18951 cmp_mode = DImode;
18952 icode = CODE_FOR_cmpdi;
18953 break;
18954
18955 case E_SFmode:
18956 cmp_mode = SFmode;
18957 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18958 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18959 break;
18960
18961 case E_DFmode:
18962 cmp_mode = DFmode;
18963 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18964 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18965 break;
18966
18967 default:
18968 end_sequence ();
18969 return NULL_RTX;
18970 }
18971
18972 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18973 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18974 if (!op0 || !op1)
18975 {
18976 end_sequence ();
18977 return NULL_RTX;
18978 }
18979 *prep_seq = get_insns ();
18980 end_sequence ();
18981
18982 create_fixed_operand (&ops[0], op0);
18983 create_fixed_operand (&ops[1], op1);
18984
18985 start_sequence ();
18986 if (!maybe_expand_insn (icode, 2, ops))
18987 {
18988 end_sequence ();
18989 return NULL_RTX;
18990 }
18991 *gen_seq = get_insns ();
18992 end_sequence ();
18993
18994 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18995 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18996 }
18997
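/* Implement TARGET_GEN_CCMP_NEXT.  Chain a further comparison onto PREV
   using a conditional compare (CCMP/FCCMP), reversing the previous
   condition when the combining operation is not AND.  */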
18998 static rtx
18999 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
19000 int cmp_code, tree treeop0, tree treeop1, int bit_code)
19001 {
19002 rtx op0, op1, target;
19003 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19004 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19005 insn_code icode;
19006 struct expand_operand ops[6];
19007 int aarch64_cond;
19008
19009 push_to_sequence (*prep_seq);
19010 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19011
19012 op_mode = GET_MODE (op0);
19013 if (op_mode == VOIDmode)
19014 op_mode = GET_MODE (op1);
19015
19016 switch (op_mode)
19017 {
19018 case E_QImode:
19019 case E_HImode:
19020 case E_SImode:
19021 cmp_mode = SImode;
19022 icode = CODE_FOR_ccmpsi;
19023 break;
19024
19025 case E_DImode:
19026 cmp_mode = DImode;
19027 icode = CODE_FOR_ccmpdi;
19028 break;
19029
19030 case E_SFmode:
19031 cmp_mode = SFmode;
19032 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19033 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
19034 break;
19035
19036 case E_DFmode:
19037 cmp_mode = DFmode;
19038 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19039 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
19040 break;
19041
19042 default:
19043 end_sequence ();
19044 return NULL_RTX;
19045 }
19046
19047 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
19048 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
19049 if (!op0 || !op1)
19050 {
19051 end_sequence ();
19052 return NULL_RTX;
19053 }
19054 *prep_seq = get_insns ();
19055 end_sequence ();
19056
19057 target = gen_rtx_REG (cc_mode, CC_REGNUM);
19058 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
19059
19060 if (bit_code != AND)
19061 {
19062 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
19063 GET_MODE (XEXP (prev, 0))),
19064 VOIDmode, XEXP (prev, 0), const0_rtx);
19065 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
19066 }
19067
19068 create_fixed_operand (&ops[0], XEXP (prev, 0));
19069 create_fixed_operand (&ops[1], target);
19070 create_fixed_operand (&ops[2], op0);
19071 create_fixed_operand (&ops[3], op1);
19072 create_fixed_operand (&ops[4], prev);
19073 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
19074
19075 push_to_sequence (*gen_seq);
19076 if (!maybe_expand_insn (icode, 6, ops))
19077 {
19078 end_sequence ();
19079 return NULL_RTX;
19080 }
19081
19082 *gen_seq = get_insns ();
19083 end_sequence ();
19084
19085 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
19086 }
19087
19088 #undef TARGET_GEN_CCMP_FIRST
19089 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19090
19091 #undef TARGET_GEN_CCMP_NEXT
19092 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19093
19094 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19095 instruction fusion of some sort. */
19096
19097 static bool
19098 aarch64_macro_fusion_p (void)
19099 {
19100 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
19101 }
19102
19103
19104 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19105 should be kept together during scheduling. */
19106
19107 static bool
19108 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
19109 {
19110 rtx set_dest;
19111 rtx prev_set = single_set (prev);
19112 rtx curr_set = single_set (curr);
19113 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
19114 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
19115
19116 if (!aarch64_macro_fusion_p ())
19117 return false;
19118
19119 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
19120 {
19121 /* We are trying to match:
19122 prev (mov) == (set (reg r0) (const_int imm16))
19123 curr (movk) == (set (zero_extract (reg r0)
19124 (const_int 16)
19125 (const_int 16))
19126 (const_int imm16_1)) */
19127
19128 set_dest = SET_DEST (curr_set);
19129
19130 if (GET_CODE (set_dest) == ZERO_EXTRACT
19131 && CONST_INT_P (SET_SRC (curr_set))
19132 && CONST_INT_P (SET_SRC (prev_set))
19133 && CONST_INT_P (XEXP (set_dest, 2))
19134 && INTVAL (XEXP (set_dest, 2)) == 16
19135 && REG_P (XEXP (set_dest, 0))
19136 && REG_P (SET_DEST (prev_set))
19137 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
19138 {
19139 return true;
19140 }
19141 }
19142
19143 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
19144 {
19145
19146 /* We're trying to match:
19147 prev (adrp) == (set (reg r1)
19148 (high (symbol_ref ("SYM"))))
19149 curr (add) == (set (reg r0)
19150 (lo_sum (reg r1)
19151 (symbol_ref ("SYM"))))
19152 Note that r0 need not necessarily be the same as r1, especially
19153 during pre-regalloc scheduling. */
19154
19155 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19156 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19157 {
19158 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
19159 && REG_P (XEXP (SET_SRC (curr_set), 0))
19160 && REGNO (XEXP (SET_SRC (curr_set), 0))
19161 == REGNO (SET_DEST (prev_set))
19162 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
19163 XEXP (SET_SRC (curr_set), 1)))
19164 return true;
19165 }
19166 }
19167
19168 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
19169 {
19170
19171 /* We're trying to match:
19172 prev (movk) == (set (zero_extract (reg r0)
19173 (const_int 16)
19174 (const_int 32))
19175 (const_int imm16_1))
19176 curr (movk) == (set (zero_extract (reg r0)
19177 (const_int 16)
19178 (const_int 48))
19179 (const_int imm16_2)) */
19180
19181 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
19182 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
19183 && REG_P (XEXP (SET_DEST (prev_set), 0))
19184 && REG_P (XEXP (SET_DEST (curr_set), 0))
19185 && REGNO (XEXP (SET_DEST (prev_set), 0))
19186 == REGNO (XEXP (SET_DEST (curr_set), 0))
19187 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
19188 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
19189 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
19190 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
19191 && CONST_INT_P (SET_SRC (prev_set))
19192 && CONST_INT_P (SET_SRC (curr_set)))
19193 return true;
19194
19195 }
19196 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
19197 {
19198 /* We're trying to match:
19199 prev (adrp) == (set (reg r0)
19200 (high (symbol_ref ("SYM"))))
19201 curr (ldr) == (set (reg r1)
19202 (mem (lo_sum (reg r0)
19203 (symbol_ref ("SYM")))))
19204 or
19205 curr (ldr) == (set (reg r1)
19206 (zero_extend (mem
19207 (lo_sum (reg r0)
19208 (symbol_ref ("SYM")))))) */
19209 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19210 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19211 {
19212 rtx curr_src = SET_SRC (curr_set);
19213
19214 if (GET_CODE (curr_src) == ZERO_EXTEND)
19215 curr_src = XEXP (curr_src, 0);
19216
19217 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
19218 && REG_P (XEXP (XEXP (curr_src, 0), 0))
19219 && REGNO (XEXP (XEXP (curr_src, 0), 0))
19220 == REGNO (SET_DEST (prev_set))
19221 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
19222 XEXP (SET_SRC (prev_set), 0)))
19223 return true;
19224 }
19225 }
19226
19227 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
19228 && any_condjump_p (curr))
19229 {
19230 unsigned int condreg1, condreg2;
19231 rtx cc_reg_1;
19232 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
19233 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
19234
19235 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
19236 && prev
19237 && modified_in_p (cc_reg_1, prev))
19238 {
19239 enum attr_type prev_type = get_attr_type (prev);
19240
19241 /* FIXME: this misses some instructions that are considered simple
19242 arithmetic on ThunderX; simple shifts are missed here as well. */
19243 if (prev_type == TYPE_ALUS_SREG
19244 || prev_type == TYPE_ALUS_IMM
19245 || prev_type == TYPE_LOGICS_REG
19246 || prev_type == TYPE_LOGICS_IMM)
19247 return true;
19248 }
19249 }
19250
19251 if (prev_set
19252 && curr_set
19253 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
19254 && any_condjump_p (curr))
19255 {
19256 /* We're trying to match:
19257 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
19258 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19259 (const_int 0))
19260 (label_ref ("SYM"))
19261 (pc)) */
19262 if (SET_DEST (curr_set) == (pc_rtx)
19263 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19264 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19265 && REG_P (SET_DEST (prev_set))
19266 && REGNO (SET_DEST (prev_set))
19267 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19268 {
19269 /* Fuse ALU operations followed by a conditional branch instruction. */
19270 switch (get_attr_type (prev))
19271 {
19272 case TYPE_ALU_IMM:
19273 case TYPE_ALU_SREG:
19274 case TYPE_ADC_REG:
19275 case TYPE_ADC_IMM:
19276 case TYPE_ADCS_REG:
19277 case TYPE_ADCS_IMM:
19278 case TYPE_LOGIC_REG:
19279 case TYPE_LOGIC_IMM:
19280 case TYPE_CSEL:
19281 case TYPE_ADR:
19282 case TYPE_MOV_IMM:
19283 case TYPE_SHIFT_REG:
19284 case TYPE_SHIFT_IMM:
19285 case TYPE_BFM:
19286 case TYPE_RBIT:
19287 case TYPE_REV:
19288 case TYPE_EXTEND:
19289 return true;
19290
19291 default:;
19292 }
19293 }
19294 }
19295
19296 return false;
19297 }
19298
19299 /* Return true iff the instruction fusion described by OP is enabled. */
19300
19301 bool
19302 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19303 {
19304 return (aarch64_tune_params.fusible_ops & op) != 0;
19305 }
19306
19307 /* If MEM's address is in the form [base+offset], extract the two parts
19308 into BASE and OFFSET and return true; otherwise clear BASE and OFFSET
19309 and return false. */
19310
19311 bool
19312 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19313 {
19314 rtx addr;
19315
19316 gcc_assert (MEM_P (mem));
19317
19318 addr = XEXP (mem, 0);
19319
19320 if (REG_P (addr))
19321 {
19322 *base = addr;
19323 *offset = const0_rtx;
19324 return true;
19325 }
19326
19327 if (GET_CODE (addr) == PLUS
19328 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19329 {
19330 *base = XEXP (addr, 0);
19331 *offset = XEXP (addr, 1);
19332 return true;
19333 }
19334
19335 *base = NULL_RTX;
19336 *offset = NULL_RTX;
19337
19338 return false;
19339 }
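/* For example, a MEM whose address is (plus (reg x1) (const_int 16))
   yields BASE == x1 and OFFSET == 16, while a MEM whose address is the
   bare (reg x1) yields BASE == x1 and OFFSET == 0.  Any other address
   form clears both outputs and returns false.  */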
19340
19341 /* Types for scheduling fusion. */
19342 enum sched_fusion_type
19343 {
19344 SCHED_FUSION_NONE = 0,
19345 SCHED_FUSION_LD_SIGN_EXTEND,
19346 SCHED_FUSION_LD_ZERO_EXTEND,
19347 SCHED_FUSION_LD,
19348 SCHED_FUSION_ST,
19349 SCHED_FUSION_NUM
19350 };
19351
19352 /* If INSN is a load or store whose address is in the form [base+offset],
19353 extract the two parts into BASE and OFFSET. Return the scheduling
19354 fusion type of this INSN. */
19355
19356 static enum sched_fusion_type
19357 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19358 {
19359 rtx x, dest, src;
19360 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19361
19362 gcc_assert (INSN_P (insn));
19363 x = PATTERN (insn);
19364 if (GET_CODE (x) != SET)
19365 return SCHED_FUSION_NONE;
19366
19367 src = SET_SRC (x);
19368 dest = SET_DEST (x);
19369
19370 machine_mode dest_mode = GET_MODE (dest);
19371
19372 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19373 return SCHED_FUSION_NONE;
19374
19375 if (GET_CODE (src) == SIGN_EXTEND)
19376 {
19377 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19378 src = XEXP (src, 0);
19379 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19380 return SCHED_FUSION_NONE;
19381 }
19382 else if (GET_CODE (src) == ZERO_EXTEND)
19383 {
19384 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19385 src = XEXP (src, 0);
19386 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19387 return SCHED_FUSION_NONE;
19388 }
19389
19390 if (GET_CODE (src) == MEM && REG_P (dest))
19391 extract_base_offset_in_addr (src, base, offset);
19392 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19393 {
19394 fusion = SCHED_FUSION_ST;
19395 extract_base_offset_in_addr (dest, base, offset);
19396 }
19397 else
19398 return SCHED_FUSION_NONE;
19399
19400 if (*base == NULL_RTX || *offset == NULL_RTX)
19401 fusion = SCHED_FUSION_NONE;
19402
19403 return fusion;
19404 }
19405
19406 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19407
19408 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19409 and PRI are only calculated for these instructions. For other instructions,
19410 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19411 types of instruction fusion can be added by returning different priorities.
19412
19413 It's important that irrelevant instructions get the largest FUSION_PRI. */
19414
19415 static void
19416 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19417 int *fusion_pri, int *pri)
19418 {
19419 int tmp, off_val;
19420 rtx base, offset;
19421 enum sched_fusion_type fusion;
19422
19423 gcc_assert (INSN_P (insn));
19424
19425 tmp = max_pri - 1;
19426 fusion = fusion_load_store (insn, &base, &offset);
19427 if (fusion == SCHED_FUSION_NONE)
19428 {
19429 *pri = tmp;
19430 *fusion_pri = tmp;
19431 return;
19432 }
19433
19434 /* Set FUSION_PRI according to fusion type and base register. */
19435 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19436
19437 /* Calculate PRI. */
19438 tmp /= 2;
19439
19440 /* INSN with smaller offset goes first. */
19441 off_val = (int)(INTVAL (offset));
19442 if (off_val >= 0)
19443 tmp -= (off_val & 0xfffff);
19444 else
19445 tmp += ((- off_val) & 0xfffff);
19446
19447 *pri = tmp;
19448 return;
19449 }
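/* A rough example of the priority scheme above: two SImode loads from
   [x1, #8] and [x1, #12] have the same fusion type and base register, so
   they receive the same FUSION_PRI and are treated as fusion candidates.
   Their PRI values differ only through the offsets (8 vs 12), so the load
   with the smaller offset gets the larger PRI and is scheduled first.  */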
19450
19451 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19452 Adjust priority of sha1h instructions so they are scheduled before
19453 other SHA1 instructions. */
19454
19455 static int
19456 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19457 {
19458 rtx x = PATTERN (insn);
19459
19460 if (GET_CODE (x) == SET)
19461 {
19462 x = SET_SRC (x);
19463
19464 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19465 return priority + 10;
19466 }
19467
19468 return priority;
19469 }
19470
19471 /* Given OPERANDS of consecutive load/store, check if we can merge
19472 them into ldp/stp. LOAD is true if they are load instructions.
19473 MODE is the mode of memory operands. */
19474
19475 bool
19476 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19477 machine_mode mode)
19478 {
19479 HOST_WIDE_INT offval_1, offval_2, msize;
19480 enum reg_class rclass_1, rclass_2;
19481 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19482
19483 if (load)
19484 {
19485 mem_1 = operands[1];
19486 mem_2 = operands[3];
19487 reg_1 = operands[0];
19488 reg_2 = operands[2];
19489 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19490 if (REGNO (reg_1) == REGNO (reg_2))
19491 return false;
19492 }
19493 else
19494 {
19495 mem_1 = operands[0];
19496 mem_2 = operands[2];
19497 reg_1 = operands[1];
19498 reg_2 = operands[3];
19499 }
19500
19501 /* The mems cannot be volatile. */
19502 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19503 return false;
19504
19505 /* If we have SImode and slow unaligned ldp,
19506 check that the alignment is at least 8 bytes. */
19507 if (mode == SImode
19508 && (aarch64_tune_params.extra_tuning_flags
19509 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19510 && !optimize_size
19511 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19512 return false;
19513
19514 /* Check if the addresses are in the form of [base+offset]. */
19515 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19516 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19517 return false;
19518 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19519 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19520 return false;
19521
19522 /* Check if the bases are the same. */
19523 if (!rtx_equal_p (base_1, base_2))
19524 return false;
19525
19526 /* The operands must be of the same size. */
19527 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19528 GET_MODE_SIZE (GET_MODE (mem_2))));
19529
19530 offval_1 = INTVAL (offset_1);
19531 offval_2 = INTVAL (offset_2);
19532 /* We should only be trying this for fixed-sized modes. There is no
19533 SVE LDP/STP instruction. */
19534 msize = GET_MODE_SIZE (mode).to_constant ();
19535 /* Check if the offsets are consecutive. */
19536 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19537 return false;
19538
19539 /* Check if the addresses are clobbered by load. */
19540 if (load)
19541 {
19542 if (reg_mentioned_p (reg_1, mem_1))
19543 return false;
19544
19545 /* In increasing order, the last load can clobber the address. */
19546 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19547 return false;
19548 }
19549
19550 /* One of the memory accesses must be a mempair operand.
19551 If it is not the first one, they need to be swapped by the
19552 peephole. */
19553 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19554 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19555 return false;
19556
19557 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19558 rclass_1 = FP_REGS;
19559 else
19560 rclass_1 = GENERAL_REGS;
19561
19562 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19563 rclass_2 = FP_REGS;
19564 else
19565 rclass_2 = GENERAL_REGS;
19566
19567 /* Check if the registers are of the same class. */
19568 if (rclass_1 != rclass_2)
19569 return false;
19570
19571 return true;
19572 }
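/* As an illustrative example (assuming SImode and a common base x2), the
   following pair of loads passes the checks above and can be emitted as a
   single LDP by the peephole patterns:

     ldr     w0, [x2]       ->   ldp     w0, w1, [x2]
     ldr     w1, [x2, #4]

   The analogous transformation applies to stores via STP.  */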
19573
19574 /* Given OPERANDS of consecutive load/store that can be merged,
19575 swap them if they are not in ascending order. */
19576 void
19577 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19578 {
19579 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19580 HOST_WIDE_INT offval_1, offval_2;
19581
19582 if (load)
19583 {
19584 mem_1 = operands[1];
19585 mem_2 = operands[3];
19586 }
19587 else
19588 {
19589 mem_1 = operands[0];
19590 mem_2 = operands[2];
19591 }
19592
19593 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19594 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19595
19596 offval_1 = INTVAL (offset_1);
19597 offval_2 = INTVAL (offset_2);
19598
19599 if (offval_1 > offval_2)
19600 {
19601 /* Irrespective of whether this is a load or a store,
19602 we do the same swap. */
19603 std::swap (operands[0], operands[2]);
19604 std::swap (operands[1], operands[3]);
19605 }
19606 }
19607
19608 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of
19609 comparing the values they point to. */
19610 int
19611 aarch64_host_wide_int_compare (const void *x, const void *y)
19612 {
19613 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19614 * ((const HOST_WIDE_INT *) y));
19615 }
19616
19617 /* Taking X and Y each to be a pair of RTX, one element pointing to a MEM
19618 rtx and the other to a REG rtx, compare the [base+offset] address
19619 offsets of the two MEMs.
19620
19621 Return:
19622
19623 1 iff offset (X) > offset (Y)
19624 0 iff offset (X) == offset (Y)
19625 -1 iff offset (X) < offset (Y) */
19626 int
19627 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19628 {
19629 const rtx * operands_1 = (const rtx *) x;
19630 const rtx * operands_2 = (const rtx *) y;
19631 rtx mem_1, mem_2, base, offset_1, offset_2;
19632
19633 if (MEM_P (operands_1[0]))
19634 mem_1 = operands_1[0];
19635 else
19636 mem_1 = operands_1[1];
19637
19638 if (MEM_P (operands_2[0]))
19639 mem_2 = operands_2[0];
19640 else
19641 mem_2 = operands_2[1];
19642
19643 /* Extract the offsets. */
19644 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19645 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19646
19647 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19648
19649 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19650 }
19651
19652 /* Given OPERANDS of consecutive load/store, check if we can merge
19653 them into ldp/stp by adjusting the offset. LOAD is true if they
19654 are load instructions. MODE is the mode of memory operands.
19655
19656 Given below consecutive stores:
19657
19658 str w1, [xb, 0x100]
19659 str w1, [xb, 0x104]
19660 str w1, [xb, 0x108]
19661 str w1, [xb, 0x10c]
19662
19663 Though the offsets are out of the range supported by stp, we can
19664 still pair them after adjusting the offset, like:
19665
19666 add scratch, xb, 0x100
19667 stp w1, w1, [scratch]
19668 stp w1, w1, [scratch, 0x8]
19669
19670 The peephole patterns detecting this opportunity should guarantee
19671 the scratch register is available. */
19672
19673 bool
19674 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19675 scalar_mode mode)
19676 {
19677 const int num_insns = 4;
19678 enum reg_class rclass;
19679 HOST_WIDE_INT offvals[num_insns], msize;
19680 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19681
19682 if (load)
19683 {
19684 for (int i = 0; i < num_insns; i++)
19685 {
19686 reg[i] = operands[2 * i];
19687 mem[i] = operands[2 * i + 1];
19688
19689 gcc_assert (REG_P (reg[i]));
19690 }
19691
19692 /* Do not attempt to merge the loads if the loads clobber each other. */
19693 for (int i = 0; i < 8; i += 2)
19694 for (int j = i + 2; j < 8; j += 2)
19695 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19696 return false;
19697 }
19698 else
19699 for (int i = 0; i < num_insns; i++)
19700 {
19701 mem[i] = operands[2 * i];
19702 reg[i] = operands[2 * i + 1];
19703 }
19704
19705 /* Skip if memory operand is by itself valid for ldp/stp. */
19706 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19707 return false;
19708
19709 for (int i = 0; i < num_insns; i++)
19710 {
19711 /* The mems cannot be volatile. */
19712 if (MEM_VOLATILE_P (mem[i]))
19713 return false;
19714
19715 /* Check if the addresses are in the form of [base+offset]. */
19716 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19717 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19718 return false;
19719 }
19720
19721 /* Check if the registers are of the same class. */
19722 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19723 ? FP_REGS : GENERAL_REGS;
19724
19725 for (int i = 1; i < num_insns; i++)
19726 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19727 {
19728 if (rclass != FP_REGS)
19729 return false;
19730 }
19731 else
19732 {
19733 if (rclass != GENERAL_REGS)
19734 return false;
19735 }
19736
19737 /* Only the last register in the order in which they occur
19738 may be clobbered by the load. */
19739 if (rclass == GENERAL_REGS && load)
19740 for (int i = 0; i < num_insns - 1; i++)
19741 if (reg_mentioned_p (reg[i], mem[i]))
19742 return false;
19743
19744 /* Check if the bases are the same. */
19745 for (int i = 0; i < num_insns - 1; i++)
19746 if (!rtx_equal_p (base[i], base[i + 1]))
19747 return false;
19748
19749 for (int i = 0; i < num_insns; i++)
19750 offvals[i] = INTVAL (offset[i]);
19751
19752 msize = GET_MODE_SIZE (mode);
19753
19754 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19755 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19756 aarch64_host_wide_int_compare);
19757
19758 if (!(offvals[1] == offvals[0] + msize
19759 && offvals[3] == offvals[2] + msize))
19760 return false;
19761
19762 /* Check that offsets are within range of each other. The ldp/stp
19763 instructions have 7 bit immediate offsets, so use 0x80. */
19764 if (offvals[2] - offvals[0] >= msize * 0x80)
19765 return false;
19766
19767 /* The offsets must be aligned with respect to each other. */
19768 if (offvals[0] % msize != offvals[2] % msize)
19769 return false;
19770
19771 /* If we have SImode and slow unaligned ldp,
19772 check that the alignment is at least 8 bytes. */
19773 if (mode == SImode
19774 && (aarch64_tune_params.extra_tuning_flags
19775 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19776 && !optimize_size
19777 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19778 return false;
19779
19780 return true;
19781 }
19782
19783 /* Given OPERANDS of consecutive load/store, this function pairs them
19784 into LDP/STP after adjusting the offset. It depends on the fact
19785 that the operands can be sorted so the offsets are correct for STP.
19786 MODE is the mode of memory operands. CODE is the rtl operator
19787 which should be applied to all memory operands; it is SIGN_EXTEND,
19788 ZERO_EXTEND or UNKNOWN. */
19789
19790 bool
19791 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19792 scalar_mode mode, RTX_CODE code)
19793 {
19794 rtx base, offset_1, offset_3, t1, t2;
19795 rtx mem_1, mem_2, mem_3, mem_4;
19796 rtx temp_operands[8];
19797 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19798 stp_off_upper_limit, stp_off_lower_limit, msize;
19799
19800 /* We make changes on a copy as we may still bail out. */
19801 for (int i = 0; i < 8; i ++)
19802 temp_operands[i] = operands[i];
19803
19804 /* Sort the operands. */
19805 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19806
19807 /* Copy the memory operands so that if we have to bail for some
19808 reason the original addresses are unchanged. */
19809 if (load)
19810 {
19811 mem_1 = copy_rtx (temp_operands[1]);
19812 mem_2 = copy_rtx (temp_operands[3]);
19813 mem_3 = copy_rtx (temp_operands[5]);
19814 mem_4 = copy_rtx (temp_operands[7]);
19815 }
19816 else
19817 {
19818 mem_1 = copy_rtx (temp_operands[0]);
19819 mem_2 = copy_rtx (temp_operands[2]);
19820 mem_3 = copy_rtx (temp_operands[4]);
19821 mem_4 = copy_rtx (temp_operands[6]);
19822 gcc_assert (code == UNKNOWN);
19823 }
19824
19825 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19826 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19827 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19828 && offset_3 != NULL_RTX);
19829
19830 /* Adjust offset so it can fit in LDP/STP instruction. */
19831 msize = GET_MODE_SIZE (mode);
19832 stp_off_upper_limit = msize * (0x40 - 1);
19833 stp_off_lower_limit = - msize * 0x40;
19834
19835 off_val_1 = INTVAL (offset_1);
19836 off_val_3 = INTVAL (offset_3);
19837
19838 /* The base offset is optimally half way between the two STP/LDP offsets. */
19839 if (msize <= 4)
19840 base_off = (off_val_1 + off_val_3) / 2;
19841 else
19842 /* However, due to issues with negative LDP/STP offset generation for
19843 larger modes (DF, DI and vector modes), we must not use negative
19844 addresses smaller than 9 signed unadjusted bits can store. This
19845 provides the most range in this case. */
19846 base_off = off_val_1;
19847
19848 /* Adjust the base so that it is aligned with the addresses but still
19849 optimal. */
19850 if (base_off % msize != off_val_1 % msize)
19851 /* Fix the offset, bearing in mind we want to make it bigger not
19852 smaller. */
19853 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19854 else if (msize <= 4)
19855 /* The negative range of LDP/STP is one larger than the positive range. */
19856 base_off += msize;
19857
19858 /* Check if base offset is too big or too small. We can attempt to resolve
19859 this issue by setting it to the maximum value and seeing if the offsets
19860 still fit. */
19861 if (base_off >= 0x1000)
19862 {
19863 base_off = 0x1000 - 1;
19864 /* We must still make sure that the base offset is aligned with respect
19865 to the address. But it may not be made any bigger. */
19866 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19867 }
19868
19869 /* Likewise for the case where the base is too small. */
19870 if (base_off <= -0x1000)
19871 {
19872 base_off = -0x1000 + 1;
19873 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19874 }
19875
19876 /* Offset of the first STP/LDP. */
19877 new_off_1 = off_val_1 - base_off;
19878
19879 /* Offset of the second STP/LDP. */
19880 new_off_3 = off_val_3 - base_off;
19881
19882 /* The offsets must be within the range of the LDP/STP instructions. */
19883 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19884 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19885 return false;
19886
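/* Worked example of the arithmetic above (schematic, SImode so MSIZE == 4):
   for the four stores at offsets 0x100, 0x104, 0x108 and 0x10c we get
   OFF_VAL_1 == 0x100 and OFF_VAL_3 == 0x108, so BASE_OFF starts as 0x104,
   is bumped to 0x108 by the msize adjustment, and the resulting pair
   offsets are NEW_OFF_1 == -8 and NEW_OFF_3 == 0, both comfortably inside
   the [-0x100, 0xfc] range allowed for 4-byte LDP/STP.  */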
19887 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19888 new_off_1), true);
19889 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19890 new_off_1 + msize), true);
19891 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19892 new_off_3), true);
19893 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19894 new_off_3 + msize), true);
19895
19896 if (!aarch64_mem_pair_operand (mem_1, mode)
19897 || !aarch64_mem_pair_operand (mem_3, mode))
19898 return false;
19899
19900 if (code == ZERO_EXTEND)
19901 {
19902 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19903 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19904 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19905 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19906 }
19907 else if (code == SIGN_EXTEND)
19908 {
19909 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19910 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19911 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19912 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19913 }
19914
19915 if (load)
19916 {
19917 operands[0] = temp_operands[0];
19918 operands[1] = mem_1;
19919 operands[2] = temp_operands[2];
19920 operands[3] = mem_2;
19921 operands[4] = temp_operands[4];
19922 operands[5] = mem_3;
19923 operands[6] = temp_operands[6];
19924 operands[7] = mem_4;
19925 }
19926 else
19927 {
19928 operands[0] = mem_1;
19929 operands[1] = temp_operands[1];
19930 operands[2] = mem_2;
19931 operands[3] = temp_operands[3];
19932 operands[4] = mem_3;
19933 operands[5] = temp_operands[5];
19934 operands[6] = mem_4;
19935 operands[7] = temp_operands[7];
19936 }
19937
19938 /* Emit adjusting instruction. */
19939 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19940 /* Emit ldp/stp instructions. */
19941 t1 = gen_rtx_SET (operands[0], operands[1]);
19942 t2 = gen_rtx_SET (operands[2], operands[3]);
19943 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19944 t1 = gen_rtx_SET (operands[4], operands[5]);
19945 t2 = gen_rtx_SET (operands[6], operands[7]);
19946 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19947 return true;
19948 }
19949
19950 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19951 it isn't worth branching around empty masked ops (including masked
19952 stores). */
19953
19954 static bool
19955 aarch64_empty_mask_is_expensive (unsigned)
19956 {
19957 return false;
19958 }
19959
19960 /* Return true if a pseudo register should be created and used to hold
19961 the GOT address for PIC code. */
19962
19963 bool
19964 aarch64_use_pseudo_pic_reg (void)
19965 {
19966 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19967 }
19968
19969 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19970
19971 static int
19972 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19973 {
19974 switch (XINT (x, 1))
19975 {
19976 case UNSPEC_GOTSMALLPIC:
19977 case UNSPEC_GOTSMALLPIC28K:
19978 case UNSPEC_GOTTINYPIC:
19979 return 0;
19980 default:
19981 break;
19982 }
19983
19984 return default_unspec_may_trap_p (x, flags);
19985 }
19986
19987
19988 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19989 return the log2 of that value. Otherwise return -1. */
19990
19991 int
19992 aarch64_fpconst_pow_of_2 (rtx x)
19993 {
19994 const REAL_VALUE_TYPE *r;
19995
19996 if (!CONST_DOUBLE_P (x))
19997 return -1;
19998
19999 r = CONST_DOUBLE_REAL_VALUE (x);
20000
20001 if (REAL_VALUE_NEGATIVE (*r)
20002 || REAL_VALUE_ISNAN (*r)
20003 || REAL_VALUE_ISINF (*r)
20004 || !real_isinteger (r, DFmode))
20005 return -1;
20006
20007 return exact_log2 (real_to_integer (r));
20008 }
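/* For example, CONST_DOUBLEs of 1.0, 2.0 and 8.0 return 0, 1 and 3
   respectively, while -4.0, 0.5 and 3.0 all return -1 (negative, not an
   integer, and not a power of 2 respectively).  */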
20009
20010 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
20011 power of 2 (i.e. 1/2^n), return the exponent n. Otherwise (including
20012 when n is outside the range [1, 32]) return -1. */
20013
20014 int
20015 aarch64_fpconst_pow2_recip (rtx x)
20016 {
20017 REAL_VALUE_TYPE r0;
20018
20019 if (!CONST_DOUBLE_P (x))
20020 return -1;
20021
20022 r0 = *CONST_DOUBLE_REAL_VALUE (x);
20023 if (exact_real_inverse (DFmode, &r0)
20024 && !REAL_VALUE_NEGATIVE (r0))
20025 {
20026 int ret = exact_log2 (real_to_integer (&r0));
20027 if (ret >= 1 && ret <= 32)
20028 return ret;
20029 }
20030 return -1;
20031 }
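/* For example, CONST_DOUBLEs of 0.25 and 0.125 return 2 and 3
   respectively, while 1.0 and 2.0 both return -1: the first because its
   exponent 0 is outside the accepted [1, 32] range, the second because
   its reciprocal 0.5 is not a positive power of 2.  */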
20032
20033 /* If X is a vector of equal CONST_DOUBLE values and that value is
20034 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20035
20036 int
20037 aarch64_vec_fpconst_pow_of_2 (rtx x)
20038 {
20039 int nelts;
20040 if (GET_CODE (x) != CONST_VECTOR
20041 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
20042 return -1;
20043
20044 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
20045 return -1;
20046
20047 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
20048 if (firstval <= 0)
20049 return -1;
20050
20051 for (int i = 1; i < nelts; i++)
20052 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
20053 return -1;
20054
20055 return firstval;
20056 }
20057
20058 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20059 to float.
20060
20061 __fp16 always promotes through this hook.
20062 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20063 through the generic excess precision logic rather than here. */
20064
20065 static tree
20066 aarch64_promoted_type (const_tree t)
20067 {
20068 if (SCALAR_FLOAT_TYPE_P (t)
20069 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
20070 return float_type_node;
20071
20072 return NULL_TREE;
20073 }
20074
20075 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20076
20077 static bool
20078 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
20079 optimization_type opt_type)
20080 {
20081 switch (op)
20082 {
20083 case rsqrt_optab:
20084 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
20085
20086 default:
20087 return true;
20088 }
20089 }
20090
20091 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20092
20093 static unsigned int
20094 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
20095 int *offset)
20096 {
20097 /* Polynomial invariant 1 == (VG / 2) - 1. */
20098 gcc_assert (i == 1);
20099 *factor = 2;
20100 *offset = 1;
20101 return AARCH64_DWARF_VG;
20102 }
20103
20104 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
20105 if MODE is HFmode, and punt to the generic implementation otherwise. */
20106
20107 static bool
20108 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
20109 {
20110 return (mode == HFmode
20111 ? true
20112 : default_libgcc_floating_mode_supported_p (mode));
20113 }
20114
20115 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20116 if MODE is HFmode, and punt to the generic implementation otherwise. */
20117
20118 static bool
20119 aarch64_scalar_mode_supported_p (scalar_mode mode)
20120 {
20121 return (mode == HFmode
20122 ? true
20123 : default_scalar_mode_supported_p (mode));
20124 }
20125
20126 /* Set the value of FLT_EVAL_METHOD.
20127 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20128
20129 0: evaluate all operations and constants, whose semantic type has at
20130 most the range and precision of type float, to the range and
20131 precision of float; evaluate all other operations and constants to
20132 the range and precision of the semantic type;
20133
20134 N, where _FloatN is a supported interchange floating type
20135 evaluate all operations and constants, whose semantic type has at
20136 most the range and precision of _FloatN type, to the range and
20137 precision of the _FloatN type; evaluate all other operations and
20138 constants to the range and precision of the semantic type;
20139
20140 If we have the ARMv8.2-A extensions then we support _Float16 in native
20141 precision, so we should set this to 16. Otherwise, we support the type,
20142 but want to evaluate expressions in float precision, so set this to
20143 0. */
20144
20145 static enum flt_eval_method
20146 aarch64_excess_precision (enum excess_precision_type type)
20147 {
20148 switch (type)
20149 {
20150 case EXCESS_PRECISION_TYPE_FAST:
20151 case EXCESS_PRECISION_TYPE_STANDARD:
20152 /* We can calculate either in 16-bit range and precision or
20153 32-bit range and precision. Make that decision based on whether
20154 we have native support for the ARMv8.2-A 16-bit floating-point
20155 instructions or not. */
20156 return (TARGET_FP_F16INST
20157 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20158 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
20159 case EXCESS_PRECISION_TYPE_IMPLICIT:
20160 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
20161 default:
20162 gcc_unreachable ();
20163 }
20164 return FLT_EVAL_METHOD_UNPREDICTABLE;
20165 }
20166
20167 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20168 scheduled for speculative execution. Reject the long-running division
20169 and square-root instructions. */
20170
20171 static bool
20172 aarch64_sched_can_speculate_insn (rtx_insn *insn)
20173 {
20174 switch (get_attr_type (insn))
20175 {
20176 case TYPE_SDIV:
20177 case TYPE_UDIV:
20178 case TYPE_FDIVS:
20179 case TYPE_FDIVD:
20180 case TYPE_FSQRTS:
20181 case TYPE_FSQRTD:
20182 case TYPE_NEON_FP_SQRT_S:
20183 case TYPE_NEON_FP_SQRT_D:
20184 case TYPE_NEON_FP_SQRT_S_Q:
20185 case TYPE_NEON_FP_SQRT_D_Q:
20186 case TYPE_NEON_FP_DIV_S:
20187 case TYPE_NEON_FP_DIV_D:
20188 case TYPE_NEON_FP_DIV_S_Q:
20189 case TYPE_NEON_FP_DIV_D_Q:
20190 return false;
20191 default:
20192 return true;
20193 }
20194 }
20195
20196 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20197
20198 static int
20199 aarch64_compute_pressure_classes (reg_class *classes)
20200 {
20201 int i = 0;
20202 classes[i++] = GENERAL_REGS;
20203 classes[i++] = FP_REGS;
20204 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20205 registers need to go in PR_LO_REGS at some point during their
20206 lifetime. Splitting it into two halves has the effect of making
20207 all predicates count against PR_LO_REGS, so that we try whenever
20208 possible to restrict the number of live predicates to 8. This
20209 greatly reduces the amount of spilling in certain loops. */
20210 classes[i++] = PR_LO_REGS;
20211 classes[i++] = PR_HI_REGS;
20212 return i;
20213 }
20214
20215 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20216
20217 static bool
20218 aarch64_can_change_mode_class (machine_mode from,
20219 machine_mode to, reg_class_t)
20220 {
20221 if (BYTES_BIG_ENDIAN)
20222 {
20223 bool from_sve_p = aarch64_sve_data_mode_p (from);
20224 bool to_sve_p = aarch64_sve_data_mode_p (to);
20225
20226 /* Don't allow changes between SVE data modes and non-SVE modes.
20227 See the comment at the head of aarch64-sve.md for details. */
20228 if (from_sve_p != to_sve_p)
20229 return false;
20230
20231 /* Don't allow changes in element size: lane 0 of the new vector
20232 would not then be lane 0 of the old vector. See the comment
20233 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20234 description.
20235
20236 In the worst case, this forces a register to be spilled in
20237 one mode and reloaded in the other, which handles the
20238 endianness correctly. */
20239 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
20240 return false;
20241 }
20242 return true;
20243 }
20244
20245 /* Implement TARGET_EARLY_REMAT_MODES. */
20246
20247 static void
20248 aarch64_select_early_remat_modes (sbitmap modes)
20249 {
20250 /* SVE values are not normally live across a call, so it should be
20251 worth doing early rematerialization even in VL-specific mode. */
20252 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
20253 if (aarch64_sve_mode_p ((machine_mode) i))
20254 bitmap_set_bit (modes, i);
20255 }
20256
20257 /* Override the default target speculation_safe_value. */
20258 static rtx
20259 aarch64_speculation_safe_value (machine_mode mode,
20260 rtx result, rtx val, rtx failval)
20261 {
20262 /* Maybe we should warn if falling back to hard barriers. They are
20263 likely to be noticeably more expensive than the alternative below. */
20264 if (!aarch64_track_speculation)
20265 return default_speculation_safe_value (mode, result, val, failval);
20266
20267 if (!REG_P (val))
20268 val = copy_to_mode_reg (mode, val);
20269
20270 if (!aarch64_reg_or_zero (failval, mode))
20271 failval = copy_to_mode_reg (mode, failval);
20272
20273 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20274 return result;
20275 }
20276
20277 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20278 Look into the tuning structure for an estimate.
20279 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20280 Advanced SIMD 128 bits. */
20281
20282 static HOST_WIDE_INT
20283 aarch64_estimated_poly_value (poly_int64 val)
20284 {
20285 enum aarch64_sve_vector_bits_enum width_source
20286 = aarch64_tune_params.sve_width;
20287
20288 /* If we still don't have an estimate, use the default. */
20289 if (width_source == SVE_SCALABLE)
20290 return default_estimated_poly_value (val);
20291
20292 HOST_WIDE_INT over_128 = width_source - 128;
20293 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20294 }
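/* A worked example, assuming the tuning structure sets sve_width to 256:
   over_128 is then 128, so a poly_int64 such as (16, 16) -- 16 bytes plus
   16 bytes per extra 128-bit chunk -- is estimated as 16 + 16 * 128 / 128
   == 32 bytes, i.e. one full 256-bit SVE vector.  */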
20295
20296
20297 /* Return true for types that could be supported as SIMD return or
20298 argument types. */
20299
20300 static bool
20301 supported_simd_type (tree t)
20302 {
20303 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20304 {
20305 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20306 return s == 1 || s == 2 || s == 4 || s == 8;
20307 }
20308 return false;
20309 }
20310
20311 /* Return true for types that currently are supported as SIMD return
20312 or argument types. */
20313
20314 static bool
20315 currently_supported_simd_type (tree t, tree b)
20316 {
20317 if (COMPLEX_FLOAT_TYPE_P (t))
20318 return false;
20319
20320 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20321 return false;
20322
20323 return supported_simd_type (t);
20324 }
20325
20326 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20327
20328 static int
20329 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20330 struct cgraph_simd_clone *clonei,
20331 tree base_type, int num)
20332 {
20333 tree t, ret_type, arg_type;
20334 unsigned int elt_bits, vec_bits, count;
20335
20336 if (!TARGET_SIMD)
20337 return 0;
20338
20339 if (clonei->simdlen
20340 && (clonei->simdlen < 2
20341 || clonei->simdlen > 1024
20342 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20343 {
20344 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20345 "unsupported simdlen %d", clonei->simdlen);
20346 return 0;
20347 }
20348
20349 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20350 if (TREE_CODE (ret_type) != VOID_TYPE
20351 && !currently_supported_simd_type (ret_type, base_type))
20352 {
20353 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20354 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20355 "GCC does not currently support mixed size types "
20356 "for %<simd%> functions");
20357 else if (supported_simd_type (ret_type))
20358 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20359 "GCC does not currently support return type %qT "
20360 "for %<simd%> functions", ret_type);
20361 else
20362 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20363 "unsupported return type %qT for %<simd%> functions",
20364 ret_type);
20365 return 0;
20366 }
20367
20368 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20369 {
20370 arg_type = TREE_TYPE (t);
20371
20372 if (!currently_supported_simd_type (arg_type, base_type))
20373 {
20374 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20375 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20376 "GCC does not currently support mixed size types "
20377 "for %<simd%> functions");
20378 else
20379 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20380 "GCC does not currently support argument type %qT "
20381 "for %<simd%> functions", arg_type);
20382 return 0;
20383 }
20384 }
20385
20386 clonei->vecsize_mangle = 'n';
20387 clonei->mask_mode = VOIDmode;
20388 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20389 if (clonei->simdlen == 0)
20390 {
20391 count = 2;
20392 vec_bits = (num == 0 ? 64 : 128);
20393 clonei->simdlen = vec_bits / elt_bits;
20394 }
20395 else
20396 {
20397 count = 1;
20398 vec_bits = clonei->simdlen * elt_bits;
20399 if (vec_bits != 64 && vec_bits != 128)
20400 {
20401 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20402 "GCC does not currently support simdlen %d for type %qT",
20403 clonei->simdlen, base_type);
20404 return 0;
20405 }
20406 }
20407 clonei->vecsize_int = vec_bits;
20408 clonei->vecsize_float = vec_bits;
20409 return count;
20410 }
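/* For example, a "declare simd" function whose base type is float
   (elt_bits == 32) and which does not specify a simdlen gets two clones:
   one using 64-bit vectors with simdlen 2 (for num == 0) and one using
   128-bit vectors with simdlen 4 (for num == 1).  */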
20411
20412 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20413
20414 static void
20415 aarch64_simd_clone_adjust (struct cgraph_node *node)
20416 {
20417 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20418 use the correct ABI. */
20419
20420 tree t = TREE_TYPE (node->decl);
20421 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20422 TYPE_ATTRIBUTES (t));
20423 }
20424
20425 /* Implement TARGET_SIMD_CLONE_USABLE. */
20426
20427 static int
20428 aarch64_simd_clone_usable (struct cgraph_node *node)
20429 {
20430 switch (node->simdclone->vecsize_mangle)
20431 {
20432 case 'n':
20433 if (!TARGET_SIMD)
20434 return -1;
20435 return 0;
20436 default:
20437 gcc_unreachable ();
20438 }
20439 }
20440
20441 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20442
20443 static int
20444 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20445 {
20446 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20447 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20448 return 0;
20449 return 1;
20450 }
20451
20452 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20453
20454 static const char *
20455 aarch64_get_multilib_abi_name (void)
20456 {
20457 if (TARGET_BIG_END)
20458 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20459 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20460 }
20461
20462 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20463 global-variable-based guard, use the default; otherwise
20464 return a null tree. */
20465 static tree
20466 aarch64_stack_protect_guard (void)
20467 {
20468 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20469 return default_stack_protect_guard ();
20470
20471 return NULL_TREE;
20472 }
20473
20474 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20475 section at the end if needed. */
20476 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20477 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20478 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20479 void
20480 aarch64_file_end_indicate_exec_stack ()
20481 {
20482 file_end_indicate_exec_stack ();
20483
20484 unsigned feature_1_and = 0;
20485 if (aarch64_bti_enabled ())
20486 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20487
20488 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20489 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20490
20491 if (feature_1_and)
20492 {
20493 /* Generate .note.gnu.property section. */
20494 switch_to_section (get_section (".note.gnu.property",
20495 SECTION_NOTYPE, NULL));
20496
20497 /* PT_NOTE header: namesz, descsz, type.
20498 namesz = 4 ("GNU\0")
20499 descsz = 16 (Size of the program property array)
20500 [(12 + padding) * Number of array elements]
20501 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20502 assemble_align (POINTER_SIZE);
20503 assemble_integer (GEN_INT (4), 4, 32, 1);
20504 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20505 assemble_integer (GEN_INT (5), 4, 32, 1);
20506
20507 /* PT_NOTE name. */
20508 assemble_string ("GNU", 4);
20509
20510 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20511 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20512 datasz = 4
20513 data = feature_1_and. */
20514 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20515 assemble_integer (GEN_INT (4), 4, 32, 1);
20516 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20517
20518 /* Pad the size of the note to the required alignment. */
20519 assemble_align (POINTER_SIZE);
20520 }
20521 }
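/* Schematically, with both BTI and PAC enabled the note emitted above
   corresponds to assembly along these lines (exact directives may vary):

     .section .note.gnu.property
     .p2align 3
     .word 4            // namesz ("GNU\0")
     .word 16           // descsz (ROUND_UP (12, POINTER_BYTES))
     .word 5            // NT_GNU_PROPERTY_TYPE_0
     .string "GNU"
     .word 0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word 4            // pr_datasz
     .word 3            // BTI | PAC
     .p2align 3  */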
20522 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20523 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20524 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20525
20526 /* Target-specific selftests. */
20527
20528 #if CHECKING_P
20529
20530 namespace selftest {
20531
20532 /* Selftest for the RTL loader.
20533 Verify that the RTL loader copes with a dump from
20534 print_rtx_function. This is essentially just a test that class
20535 function_reader can handle a real dump, but it also verifies
20536 that lookup_reg_by_dump_name correctly handles hard regs.
20537 The presence of hard reg names in the dump means that the test is
20538 target-specific, hence it is in this file. */
20539
20540 static void
20541 aarch64_test_loading_full_dump ()
20542 {
20543 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20544
20545 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20546
20547 rtx_insn *insn_1 = get_insn_by_uid (1);
20548 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20549
20550 rtx_insn *insn_15 = get_insn_by_uid (15);
20551 ASSERT_EQ (INSN, GET_CODE (insn_15));
20552 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20553
20554 /* Verify crtl->return_rtx. */
20555 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20556 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20557 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20558 }
20559
20560 /* Run all target-specific selftests. */
20561
20562 static void
20563 aarch64_run_selftests (void)
20564 {
20565 aarch64_test_loading_full_dump ();
20566 }
20567
20568 } // namespace selftest
20569
20570 #endif /* #if CHECKING_P */
20571
20572 #undef TARGET_STACK_PROTECT_GUARD
20573 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20574
20575 #undef TARGET_ADDRESS_COST
20576 #define TARGET_ADDRESS_COST aarch64_address_cost
20577
20578 /* This hook determines whether unnamed bitfields affect the alignment
20579 of the containing structure. The hook returns true if the structure
20580 should inherit the alignment requirements of an unnamed bitfield's
20581 type. */
20582 #undef TARGET_ALIGN_ANON_BITFIELD
20583 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20584
20585 #undef TARGET_ASM_ALIGNED_DI_OP
20586 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20587
20588 #undef TARGET_ASM_ALIGNED_HI_OP
20589 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20590
20591 #undef TARGET_ASM_ALIGNED_SI_OP
20592 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20593
20594 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20595 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20596 hook_bool_const_tree_hwi_hwi_const_tree_true
20597
20598 #undef TARGET_ASM_FILE_START
20599 #define TARGET_ASM_FILE_START aarch64_start_file
20600
20601 #undef TARGET_ASM_OUTPUT_MI_THUNK
20602 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20603
20604 #undef TARGET_ASM_SELECT_RTX_SECTION
20605 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20606
20607 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20608 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20609
20610 #undef TARGET_BUILD_BUILTIN_VA_LIST
20611 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20612
20613 #undef TARGET_CALLEE_COPIES
20614 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20615
20616 #undef TARGET_CAN_ELIMINATE
20617 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20618
20619 #undef TARGET_CAN_INLINE_P
20620 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20621
20622 #undef TARGET_CANNOT_FORCE_CONST_MEM
20623 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20624
20625 #undef TARGET_CASE_VALUES_THRESHOLD
20626 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20627
20628 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20629 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20630
20631 /* Only the least significant bit is used for initialization guard
20632 variables. */
20633 #undef TARGET_CXX_GUARD_MASK_BIT
20634 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20635
20636 #undef TARGET_C_MODE_FOR_SUFFIX
20637 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20638
20639 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20640 #undef TARGET_DEFAULT_TARGET_FLAGS
20641 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20642 #endif
20643
20644 #undef TARGET_CLASS_MAX_NREGS
20645 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20646
20647 #undef TARGET_BUILTIN_DECL
20648 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20649
20650 #undef TARGET_BUILTIN_RECIPROCAL
20651 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20652
20653 #undef TARGET_C_EXCESS_PRECISION
20654 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20655
20656 #undef TARGET_EXPAND_BUILTIN
20657 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20658
20659 #undef TARGET_EXPAND_BUILTIN_VA_START
20660 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20661
20662 #undef TARGET_FOLD_BUILTIN
20663 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20664
20665 #undef TARGET_FUNCTION_ARG
20666 #define TARGET_FUNCTION_ARG aarch64_function_arg
20667
20668 #undef TARGET_FUNCTION_ARG_ADVANCE
20669 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20670
20671 #undef TARGET_FUNCTION_ARG_BOUNDARY
20672 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20673
20674 #undef TARGET_FUNCTION_ARG_PADDING
20675 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20676
20677 #undef TARGET_GET_RAW_RESULT_MODE
20678 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20679 #undef TARGET_GET_RAW_ARG_MODE
20680 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20681
20682 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20683 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20684
20685 #undef TARGET_FUNCTION_VALUE
20686 #define TARGET_FUNCTION_VALUE aarch64_function_value
20687
20688 #undef TARGET_FUNCTION_VALUE_REGNO_P
20689 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20690
20691 #undef TARGET_GIMPLE_FOLD_BUILTIN
20692 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20693
20694 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20695 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20696
20697 #undef TARGET_INIT_BUILTINS
20698 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20699
20700 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20701 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20702 aarch64_ira_change_pseudo_allocno_class
20703
20704 #undef TARGET_LEGITIMATE_ADDRESS_P
20705 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20706
20707 #undef TARGET_LEGITIMATE_CONSTANT_P
20708 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20709
20710 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20711 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20712 aarch64_legitimize_address_displacement
20713
20714 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20715 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20716
20717 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20718 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20719 aarch64_libgcc_floating_mode_supported_p
20720
20721 #undef TARGET_MANGLE_TYPE
20722 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20723
20724 #undef TARGET_MEMORY_MOVE_COST
20725 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20726
20727 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20728 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20729
20730 #undef TARGET_MUST_PASS_IN_STACK
20731 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20732
20733 /* This target hook should return true if accesses to volatile bitfields
20734 should use the narrowest mode possible. It should return false if these
20735 accesses should use the bitfield container type. */
20736 #undef TARGET_NARROW_VOLATILE_BITFIELD
20737 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20738
20739 #undef TARGET_OPTION_OVERRIDE
20740 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20741
20742 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20743 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20744 aarch64_override_options_after_change
20745
20746 #undef TARGET_OPTION_SAVE
20747 #define TARGET_OPTION_SAVE aarch64_option_save
20748
20749 #undef TARGET_OPTION_RESTORE
20750 #define TARGET_OPTION_RESTORE aarch64_option_restore
20751
20752 #undef TARGET_OPTION_PRINT
20753 #define TARGET_OPTION_PRINT aarch64_option_print
20754
20755 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20756 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20757
20758 #undef TARGET_SET_CURRENT_FUNCTION
20759 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20760
20761 #undef TARGET_PASS_BY_REFERENCE
20762 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20763
20764 #undef TARGET_PREFERRED_RELOAD_CLASS
20765 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20766
20767 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20768 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20769
20770 #undef TARGET_PROMOTED_TYPE
20771 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20772
20773 #undef TARGET_SECONDARY_RELOAD
20774 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20775
20776 #undef TARGET_SHIFT_TRUNCATION_MASK
20777 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20778
20779 #undef TARGET_SETUP_INCOMING_VARARGS
20780 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20781
20782 #undef TARGET_STRUCT_VALUE_RTX
20783 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20784
20785 #undef TARGET_REGISTER_MOVE_COST
20786 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20787
20788 #undef TARGET_RETURN_IN_MEMORY
20789 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20790
20791 #undef TARGET_RETURN_IN_MSB
20792 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20793
20794 #undef TARGET_RTX_COSTS
20795 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20796
20797 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20798 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20799
20800 #undef TARGET_SCHED_ISSUE_RATE
20801 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20802
20803 #undef TARGET_SCHED_VARIABLE_ISSUE
20804 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
20805
20806 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20807 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20808 aarch64_sched_first_cycle_multipass_dfa_lookahead
20809
20810 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20811 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20812 aarch64_first_cycle_multipass_dfa_lookahead_guard
20813
20814 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20815 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20816 aarch64_get_separate_components
20817
20818 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20819 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20820 aarch64_components_for_bb
20821
20822 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20823 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20824 aarch64_disqualify_components
20825
20826 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20827 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20828 aarch64_emit_prologue_components
20829
20830 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20831 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20832 aarch64_emit_epilogue_components
20833
20834 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20835 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20836 aarch64_set_handled_components
20837
20838 #undef TARGET_TRAMPOLINE_INIT
20839 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20840
20841 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20842 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20843
20844 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20845 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20846
20847 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20848 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20849 aarch64_builtin_support_vector_misalignment
20850
20851 #undef TARGET_ARRAY_MODE
20852 #define TARGET_ARRAY_MODE aarch64_array_mode
20853
20854 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20855 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20856
20857 #undef TARGET_VECTORIZE_ADD_STMT_COST
20858 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20859
20860 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20861 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20862 aarch64_builtin_vectorization_cost
20863
20864 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20865 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20866
20867 #undef TARGET_VECTORIZE_BUILTINS
20868 #define TARGET_VECTORIZE_BUILTINS
20869
20870 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20871 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20872 aarch64_builtin_vectorized_function
20873
20874 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20875 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20876 aarch64_autovectorize_vector_sizes
20877
20878 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20879 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20880 aarch64_atomic_assign_expand_fenv
20881
20882 /* Section anchor support. */
20883
20884 #undef TARGET_MIN_ANCHOR_OFFSET
20885 #define TARGET_MIN_ANCHOR_OFFSET -256
20886
20887 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20888 byte offset; we can do much more for larger data types, but have no way
20889 to determine the size of the access. We assume accesses are aligned. */
20890 #undef TARGET_MAX_ANCHOR_OFFSET
20891 #define TARGET_MAX_ANCHOR_OFFSET 4095
20892
20893 #undef TARGET_VECTOR_ALIGNMENT
20894 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20895
20896 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20897 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20898 aarch64_vectorize_preferred_vector_alignment
20899 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20900 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20901 aarch64_simd_vector_alignment_reachable
20902
20903 /* vec_perm support. */
20904
20905 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20906 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20907 aarch64_vectorize_vec_perm_const
20908
20909 #undef TARGET_VECTORIZE_GET_MASK_MODE
20910 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20911 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20912 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20913 aarch64_empty_mask_is_expensive
20914 #undef TARGET_PREFERRED_ELSE_VALUE
20915 #define TARGET_PREFERRED_ELSE_VALUE \
20916 aarch64_preferred_else_value
20917
20918 #undef TARGET_INIT_LIBFUNCS
20919 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20920
20921 #undef TARGET_FIXED_CONDITION_CODE_REGS
20922 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20923
20924 #undef TARGET_FLAGS_REGNUM
20925 #define TARGET_FLAGS_REGNUM CC_REGNUM
20926
20927 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20928 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20929
20930 #undef TARGET_ASAN_SHADOW_OFFSET
20931 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20932
20933 #undef TARGET_LEGITIMIZE_ADDRESS
20934 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20935
20936 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20937 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20938
20939 #undef TARGET_CAN_USE_DOLOOP_P
20940 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20941
20942 #undef TARGET_SCHED_ADJUST_PRIORITY
20943 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20944
20945 #undef TARGET_SCHED_MACRO_FUSION_P
20946 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20947
20948 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20949 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20950
20951 #undef TARGET_SCHED_FUSION_PRIORITY
20952 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20953
20954 #undef TARGET_UNSPEC_MAY_TRAP_P
20955 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20956
20957 #undef TARGET_USE_PSEUDO_PIC_REG
20958 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20959
20960 #undef TARGET_PRINT_OPERAND
20961 #define TARGET_PRINT_OPERAND aarch64_print_operand
20962
20963 #undef TARGET_PRINT_OPERAND_ADDRESS
20964 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20965
20966 #undef TARGET_OPTAB_SUPPORTED_P
20967 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20968
20969 #undef TARGET_OMIT_STRUCT_RETURN_REG
20970 #define TARGET_OMIT_STRUCT_RETURN_REG true
20971
20972 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20973 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20974 aarch64_dwarf_poly_indeterminate_value
20975
20976 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20977 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20978 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20979
20980 #undef TARGET_HARD_REGNO_NREGS
20981 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20982 #undef TARGET_HARD_REGNO_MODE_OK
20983 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20984
20985 #undef TARGET_MODES_TIEABLE_P
20986 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20987
20988 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20989 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20990 aarch64_hard_regno_call_part_clobbered
20991
20992 #undef TARGET_INSN_CALLEE_ABI
20993 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
20994
20995 #undef TARGET_CONSTANT_ALIGNMENT
20996 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20997
20998 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20999 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
21000 aarch64_stack_clash_protection_alloca_probe_range
21001
21002 #undef TARGET_COMPUTE_PRESSURE_CLASSES
21003 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
21004
21005 #undef TARGET_CAN_CHANGE_MODE_CLASS
21006 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
21007
21008 #undef TARGET_SELECT_EARLY_REMAT_MODES
21009 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
21010
21011 #undef TARGET_SPECULATION_SAFE_VALUE
21012 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
21013
21014 #undef TARGET_ESTIMATED_POLY_VALUE
21015 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
21016
21017 #undef TARGET_ATTRIBUTE_TABLE
21018 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
21019
21020 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
21021 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
21022 aarch64_simd_clone_compute_vecsize_and_simdlen
21023
21024 #undef TARGET_SIMD_CLONE_ADJUST
21025 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
21026
21027 #undef TARGET_SIMD_CLONE_USABLE
21028 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
21029
21030 #undef TARGET_COMP_TYPE_ATTRIBUTES
21031 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
21032
21033 #undef TARGET_GET_MULTILIB_ABI_NAME
21034 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
21035
21036 #undef TARGET_FNTYPE_ABI
21037 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
21038
21039 #if CHECKING_P
21040 #undef TARGET_RUN_TARGET_SELFTESTS
21041 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
21042 #endif /* #if CHECKING_P */
21043
21044 #undef TARGET_ASM_POST_CFI_STARTPROC
21045 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21046
21047 struct gcc_target targetm = TARGET_INITIALIZER;
21048
21049 #include "gt-aarch64.h"