aarch64: Add CPU-specific SVE vector costs struct
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
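/* A sketch of how callers might describe two common immediates with the
   constructors above (invented values, not compiler output):

     - the V4HI constant { 0, 2, 4, 6 }, as an INDEX immediate:
	 simd_immediate_info info (HImode, GEN_INT (0), GEN_INT (2));

     - a MOVI of 0x25 shifted left by 8 in each HImode element:
	 simd_immediate_info info (HImode, 0x25, simd_immediate_info::MOV,
				   simd_immediate_info::LSL, 8);  */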
177 namespace {
178
179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
180 class pure_scalable_type_info
181 {
182 public:
183 /* Represents the result of analyzing a type. All values are nonzero,
184 in the possibly forlorn hope that accidental conversions to bool
185 trigger a warning. */
186 enum analysis_result
187 {
188 /* The type does not have an ABI identity; i.e. it doesn't contain
189 at least one object whose type is a Fundamental Data Type. */
190 NO_ABI_IDENTITY = 1,
191
192 /* The type is definitely a Pure Scalable Type. */
193 IS_PST,
194
195 /* The type is definitely not a Pure Scalable Type. */
196 ISNT_PST,
197
198 /* It doesn't matter for PCS purposes whether the type is a Pure
199 Scalable Type or not, since the type will be handled the same
200 way regardless.
201
202 Specifically, this means that if the type is a Pure Scalable Type,
203 there aren't enough argument registers to hold it, and so it will
204 need to be passed or returned in memory. If the type isn't a
205 Pure Scalable Type, it's too big to be passed or returned in core
206 or SIMD&FP registers, and so again will need to go in memory. */
207 DOESNT_MATTER
208 };
209
210 /* Aggregates of 17 bytes or more are normally passed and returned
211 in memory, so aggregates of that size can safely be analyzed as
212 DOESNT_MATTER. We need to be able to collect enough pieces to
213 represent a PST that is smaller than that. Since predicates are
214 2 bytes in size for -msve-vector-bits=128, that means we need to be
215 able to store at least 8 pieces.
216
217 We also need to be able to store enough pieces to represent
218 a single vector in each vector argument register and a single
219 predicate in each predicate argument register. This means that
220 we need at least 12 pieces. */
221 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222 #if __cplusplus >= 201103L
223 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224 #endif
225
226 /* Describes one piece of a PST. Each piece is one of:
227
228 - a single Scalable Vector Type (SVT)
229 - a single Scalable Predicate Type (SPT)
230 - a PST containing 2, 3 or 4 SVTs, with no padding
231
232 It either represents a single built-in type or a PST formed from
233 multiple homogeneous built-in types. */
234 struct piece
235 {
236 rtx get_rtx (unsigned int, unsigned int) const;
237
238 /* The number of vector and predicate registers that the piece
239 occupies. One of the two is always zero. */
240 unsigned int num_zr;
241 unsigned int num_pr;
242
243 /* The mode of the registers described above. */
244 machine_mode mode;
245
246 /* If this piece is formed from multiple homogeneous built-in types,
247 this is the mode of the built-in types, otherwise it is MODE. */
248 machine_mode orig_mode;
249
250 /* The offset in bytes of the piece from the start of the type. */
251 poly_uint64_pod offset;
252 };
253
254 /* Divides types analyzed as IS_PST into individual pieces. The pieces
255 are in memory order. */
256 auto_vec<piece, MAX_PIECES> pieces;
257
258 unsigned int num_zr () const;
259 unsigned int num_pr () const;
260
261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
262
263 analysis_result analyze (const_tree);
264 bool analyze_registers (const_tree);
265
266 private:
267 analysis_result analyze_array (const_tree);
268 analysis_result analyze_record (const_tree);
269 void add_piece (const piece &);
270 };
271 }
272
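/* An invented illustration of the class above (not part of the original
   sources): for a user-defined type such as

     struct pst_example { svfloat32_t v0; svfloat32_t v1; svbool_t p; };

   analyze () records three pieces -- two with num_zr == 1 (mode
   VNx4SFmode) and one with num_pr == 1 (mode VNx16BImode) -- so
   num_zr () == 2, num_pr () == 1 and the type is IS_PST.  With the
   AAPCS64's eight vector and four predicate argument registers,
   MAX_PIECES works out to 12.  */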
273 /* The current code model. */
274 enum aarch64_code_model aarch64_cmodel;
275
276 /* The number of 64-bit elements in an SVE vector. */
277 poly_uint16 aarch64_sve_vg;
278
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
283
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 const_tree,
288 machine_mode *, int *,
289 bool *, bool);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 const_tree type,
297 int misalignment,
298 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
303
304 /* Major revision number of the ARM Architecture implemented by the target. */
305 unsigned aarch64_architecture_version;
306
307 /* The processor for which instructions should be scheduled. */
308 enum aarch64_processor aarch64_tune = cortexa53;
309
310 /* Mask to specify which instruction scheduling options should be used. */
311 uint64_t aarch64_tune_flags = 0;
312
313 /* Global flag for PC relative loads. */
314 bool aarch64_pcrelative_literal_loads;
315
316 /* Global flag for whether frame pointer is enabled. */
317 bool aarch64_use_frame_pointer;
318
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
321
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
324
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
328 {
329 const char* name;
330 unsigned int flag;
331 };
332
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
336 {
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
341 };
342
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
346 {
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
351 };
352
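/* A hedged usage sketch for the tables above (option syntax abbreviated):
   -moverride=fuse=adrp+add is matched name by name against
   aarch64_fusible_pairs, OR-ing AARCH64_FUSE_ADRP_ADD into the active
   tune_params, while -moverride=tune=rename_fma_regs does the same with
   aarch64_tuning_flags and AARCH64_EXTRA_TUNE_RENAME_FMA_REGS.  */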
353 /* Tuning parameters. */
354
355 static const struct cpu_addrcost_table generic_addrcost_table =
356 {
357 {
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
362 },
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* register_offset */
366 0, /* register_sextend */
367 0, /* register_zextend */
368 0 /* imm_offset */
369 };
370
371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
372 {
373 {
374 0, /* hi */
375 0, /* si */
376 0, /* di */
377 2, /* ti */
378 },
379 0, /* pre_modify */
380 0, /* post_modify */
381 1, /* register_offset */
382 1, /* register_sextend */
383 2, /* register_zextend */
384 0, /* imm_offset */
385 };
386
387 static const struct cpu_addrcost_table xgene1_addrcost_table =
388 {
389 {
390 1, /* hi */
391 0, /* si */
392 0, /* di */
393 1, /* ti */
394 },
395 1, /* pre_modify */
396 1, /* post_modify */
397 0, /* register_offset */
398 1, /* register_sextend */
399 1, /* register_zextend */
400 0, /* imm_offset */
401 };
402
403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
404 {
405 {
406 1, /* hi */
407 1, /* si */
408 1, /* di */
409 2, /* ti */
410 },
411 0, /* pre_modify */
412 0, /* post_modify */
413 2, /* register_offset */
414 3, /* register_sextend */
415 3, /* register_zextend */
416 0, /* imm_offset */
417 };
418
419 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
420 {
421 {
422 1, /* hi */
423 1, /* si */
424 1, /* di */
425 2, /* ti */
426 },
427 0, /* pre_modify */
428 0, /* post_modify */
429 2, /* register_offset */
430 3, /* register_sextend */
431 3, /* register_zextend */
432 0, /* imm_offset */
433 };
434
435 static const struct cpu_addrcost_table tsv110_addrcost_table =
436 {
437 {
438 1, /* hi */
439 0, /* si */
440 0, /* di */
441 1, /* ti */
442 },
443 0, /* pre_modify */
444 0, /* post_modify */
445 0, /* register_offset */
446 1, /* register_sextend */
447 1, /* register_zextend */
448 0, /* imm_offset */
449 };
450
451 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
452 {
453 {
454 1, /* hi */
455 1, /* si */
456 1, /* di */
457 2, /* ti */
458 },
459 1, /* pre_modify */
460 1, /* post_modify */
461 3, /* register_offset */
462 3, /* register_sextend */
463 3, /* register_zextend */
464 2, /* imm_offset */
465 };
466
467 static const struct cpu_regmove_cost generic_regmove_cost =
468 {
469 1, /* GP2GP */
470 /* Avoid the use of slow int<->fp moves for spilling by setting
471 their cost higher than memmov_cost. */
472 5, /* GP2FP */
473 5, /* FP2GP */
474 2 /* FP2FP */
475 };
476
477 static const struct cpu_regmove_cost cortexa57_regmove_cost =
478 {
479 1, /* GP2GP */
480 /* Avoid the use of slow int<->fp moves for spilling by setting
481 their cost higher than memmov_cost. */
482 5, /* GP2FP */
483 5, /* FP2GP */
484 2 /* FP2FP */
485 };
486
487 static const struct cpu_regmove_cost cortexa53_regmove_cost =
488 {
489 1, /* GP2GP */
490 /* Avoid the use of slow int<->fp moves for spilling by setting
491 their cost higher than memmov_cost. */
492 5, /* GP2FP */
493 5, /* FP2GP */
494 2 /* FP2FP */
495 };
496
497 static const struct cpu_regmove_cost exynosm1_regmove_cost =
498 {
499 1, /* GP2GP */
500 /* Avoid the use of slow int<->fp moves for spilling by setting
501 their cost higher than memmov_cost (actual costs: 4 and 9). */
502 9, /* GP2FP */
503 9, /* FP2GP */
504 1 /* FP2FP */
505 };
506
507 static const struct cpu_regmove_cost thunderx_regmove_cost =
508 {
509 2, /* GP2GP */
510 2, /* GP2FP */
511 6, /* FP2GP */
512 4 /* FP2FP */
513 };
514
515 static const struct cpu_regmove_cost xgene1_regmove_cost =
516 {
517 1, /* GP2GP */
518 /* Avoid the use of slow int<->fp moves for spilling by setting
519 their cost higher than memmov_cost. */
520 8, /* GP2FP */
521 8, /* FP2GP */
522 2 /* FP2FP */
523 };
524
525 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
526 {
527 2, /* GP2GP */
528 /* Avoid the use of int<->fp moves for spilling. */
529 6, /* GP2FP */
530 6, /* FP2GP */
531 4 /* FP2FP */
532 };
533
534 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
535 {
536 1, /* GP2GP */
537 /* Avoid the use of int<->fp moves for spilling. */
538 5, /* GP2FP */
539 6, /* FP2GP */
540 3, /* FP2FP */
541 };
542
543 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
544 {
545 1, /* GP2GP */
546 /* Avoid the use of int<->fp moves for spilling. */
547 4, /* GP2FP */
548 5, /* FP2GP */
549 4 /* FP2FP */
550 };
551
552 static const struct cpu_regmove_cost tsv110_regmove_cost =
553 {
554 1, /* GP2GP */
555 /* Avoid the use of slow int<->fp moves for spilling by setting
556 their cost higher than memmov_cost. */
557 2, /* GP2FP */
558 3, /* FP2GP */
559 2 /* FP2FP */
560 };
561
562 /* Generic costs for Advanced SIMD vector operations. */
563 static const advsimd_vec_cost generic_advsimd_vector_cost =
564 {
565 1, /* int_stmt_cost */
566 1, /* fp_stmt_cost */
567 2, /* permute_cost */
568 2, /* vec_to_scalar_cost */
569 1, /* scalar_to_vec_cost */
570 1, /* align_load_cost */
571 1, /* unalign_load_cost */
572 1, /* unalign_store_cost */
573 1 /* store_cost */
574 };
575
576 /* Generic costs for SVE vector operations. */
577 static const sve_vec_cost generic_sve_vector_cost =
578 {
579 1, /* int_stmt_cost */
580 1, /* fp_stmt_cost */
581 2, /* permute_cost */
582 2, /* vec_to_scalar_cost */
583 1, /* scalar_to_vec_cost */
584 1, /* align_load_cost */
585 1, /* unalign_load_cost */
586 1, /* unalign_store_cost */
587 1 /* store_cost */
588 };
589
590 /* Generic costs for vector insn classes. */
591 static const struct cpu_vector_cost generic_vector_cost =
592 {
593 1, /* scalar_int_stmt_cost */
594 1, /* scalar_fp_stmt_cost */
595 1, /* scalar_load_cost */
596 1, /* scalar_store_cost */
597 3, /* cond_taken_branch_cost */
598 1, /* cond_not_taken_branch_cost */
599 &generic_advsimd_vector_cost, /* advsimd */
600 &generic_sve_vector_cost /* sve */
601 };
602
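/* A sketch of how a CPU-specific SVE cost table -- the point of this
   patch -- would be wired up ("foo" is a placeholder CPU and the numbers
   are invented):

     static const sve_vec_cost foo_sve_vector_cost =
     {
       2, 2, 3, 3, 2, 1, 1, 1, 1
     };

   with &foo_sve_vector_cost used as the "sve" field of foo's
   cpu_vector_cost.  CPUs that leave the field NULL, as most entries
   below do, have no SVE-specific numbers and their Advanced SIMD costs
   are used for SVE modes as well.  */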
603 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
604 {
605 1, /* int_stmt_cost */
606 3, /* fp_stmt_cost */
607 2, /* permute_cost */
608 1, /* vec_to_scalar_cost */
609 1, /* scalar_to_vec_cost */
610 1, /* align_load_cost */
611 1, /* unalign_load_cost */
612 1, /* unalign_store_cost */
613 1 /* store_cost */
614 };
615
616 /* QDF24XX costs for vector insn classes. */
617 static const struct cpu_vector_cost qdf24xx_vector_cost =
618 {
619 1, /* scalar_int_stmt_cost */
620 1, /* scalar_fp_stmt_cost */
621 1, /* scalar_load_cost */
622 1, /* scalar_store_cost */
623 3, /* cond_taken_branch_cost */
624 1, /* cond_not_taken_branch_cost */
625 &qdf24xx_advsimd_vector_cost, /* advsimd */
626 NULL /* sve */
627 };
628
629
630 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
631 {
632 4, /* int_stmt_cost */
633 1, /* fp_stmt_cost */
634 4, /* permute_cost */
635 2, /* vec_to_scalar_cost */
636 2, /* scalar_to_vec_cost */
637 3, /* align_load_cost */
638 5, /* unalign_load_cost */
639 5, /* unalign_store_cost */
640 1 /* store_cost */
641 };
642
643 /* ThunderX costs for vector insn classes. */
644 static const struct cpu_vector_cost thunderx_vector_cost =
645 {
646 1, /* scalar_int_stmt_cost */
647 1, /* scalar_fp_stmt_cost */
648 3, /* scalar_load_cost */
649 1, /* scalar_store_cost */
650 3, /* cond_taken_branch_cost */
651 3, /* cond_not_taken_branch_cost */
652 &thunderx_advsimd_vector_cost, /* advsimd */
653 NULL /* sve */
654 };
655
656 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
657 {
658 2, /* int_stmt_cost */
659 2, /* fp_stmt_cost */
660 2, /* permute_cost */
661 3, /* vec_to_scalar_cost */
662 2, /* scalar_to_vec_cost */
663 5, /* align_load_cost */
664 5, /* unalign_load_cost */
665 1, /* unalign_store_cost */
666 1 /* store_cost */
667 };
668
669 static const struct cpu_vector_cost tsv110_vector_cost =
670 {
671 1, /* scalar_int_stmt_cost */
672 1, /* scalar_fp_stmt_cost */
673 5, /* scalar_load_cost */
674 1, /* scalar_store_cost */
675 1, /* cond_taken_branch_cost */
676 1, /* cond_not_taken_branch_cost */
677 &tsv110_advsimd_vector_cost, /* advsimd */
678 NULL, /* sve */
679 };
680
681 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
682 {
683 2, /* int_stmt_cost */
684 2, /* fp_stmt_cost */
685 3, /* permute_cost */
686 8, /* vec_to_scalar_cost */
687 8, /* scalar_to_vec_cost */
688 4, /* align_load_cost */
689 4, /* unalign_load_cost */
690 1, /* unalign_store_cost */
691 1 /* store_cost */
692 };
693
694 /* Cortex-A57 costs for vector insn classes. */
695 static const struct cpu_vector_cost cortexa57_vector_cost =
696 {
697 1, /* scalar_int_stmt_cost */
698 1, /* scalar_fp_stmt_cost */
699 4, /* scalar_load_cost */
700 1, /* scalar_store_cost */
701 1, /* cond_taken_branch_cost */
702 1, /* cond_not_taken_branch_cost */
703 &cortexa57_advsimd_vector_cost, /* advsimd */
704 NULL /* sve */
705 };
706
707 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
708 {
709 3, /* int_stmt_cost */
710 3, /* fp_stmt_cost */
711 3, /* permute_cost */
712 3, /* vec_to_scalar_cost */
713 3, /* scalar_to_vec_cost */
714 5, /* align_load_cost */
715 5, /* unalign_load_cost */
716 1, /* unalign_store_cost */
717 1 /* store_cost */
718 };
719
720 static const struct cpu_vector_cost exynosm1_vector_cost =
721 {
722 1, /* scalar_int_stmt_cost */
723 1, /* scalar_fp_stmt_cost */
724 5, /* scalar_load_cost */
725 1, /* scalar_store_cost */
726 1, /* cond_taken_branch_cost */
727 1, /* cond_not_taken_branch_cost */
728 &exynosm1_advsimd_vector_cost, /* advsimd */
729 NULL /* sve */
730 };
731
732 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
733 {
734 2, /* int_stmt_cost */
735 2, /* fp_stmt_cost */
736 2, /* permute_cost */
737 4, /* vec_to_scalar_cost */
738 4, /* scalar_to_vec_cost */
739 10, /* align_load_cost */
740 10, /* unalign_load_cost */
741 2, /* unalign_store_cost */
742 2 /* store_cost */
743 };
744
745 /* X-Gene 1 costs for vector insn classes. */
746 static const struct cpu_vector_cost xgene1_vector_cost =
747 {
748 1, /* scalar_int_stmt_cost */
749 1, /* scalar_fp_stmt_cost */
750 5, /* scalar_load_cost */
751 1, /* scalar_store_cost */
752 2, /* cond_taken_branch_cost */
753 1, /* cond_not_taken_branch_cost */
754 &xgene1_advsimd_vector_cost, /* advsimd */
755 NULL /* sve */
756 };
757
758 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
759 {
760 4, /* int_stmt_cost */
761 5, /* fp_stmt_cost */
762 10, /* permute_cost */
763 6, /* vec_to_scalar_cost */
764 5, /* scalar_to_vec_cost */
765 4, /* align_load_cost */
766 4, /* unalign_load_cost */
767 1, /* unalign_store_cost */
768 1 /* store_cost */
769 };
770
771 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
772 static const struct cpu_vector_cost thunderx2t99_vector_cost =
773 {
774 1, /* scalar_int_stmt_cost */
775 6, /* scalar_fp_stmt_cost */
776 4, /* scalar_load_cost */
777 1, /* scalar_store_cost */
778 2, /* cond_taken_branch_cost */
779 1, /* cond_not_taken_branch_cost */
780 &thunderx2t99_advsimd_vector_cost, /* advsimd */
781 NULL /* sve */
782 };
783
784 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
785 {
786 5, /* int_stmt_cost */
787 5, /* fp_stmt_cost */
788 10, /* permute_cost */
789 5, /* vec_to_scalar_cost */
790 5, /* scalar_to_vec_cost */
791 4, /* align_load_cost */
792 4, /* unalign_load_cost */
793 4, /* unalign_store_cost */
794 4 /* store_cost */
795 };
796
797 static const struct cpu_vector_cost thunderx3t110_vector_cost =
798 {
799 1, /* scalar_int_stmt_cost */
800 5, /* scalar_fp_stmt_cost */
801 4, /* scalar_load_cost */
802 1, /* scalar_store_cost */
803 2, /* cond_taken_branch_cost */
804 1, /* cond_not_taken_branch_cost */
805 &thunderx3t110_advsimd_vector_cost, /* advsimd */
806 NULL /* sve */
807 };
808
809
810 /* Generic costs for branch instructions. */
811 static const struct cpu_branch_cost generic_branch_cost =
812 {
813 1, /* Predictable. */
814 3 /* Unpredictable. */
815 };
816
817 /* Generic approximation modes. */
818 static const cpu_approx_modes generic_approx_modes =
819 {
820 AARCH64_APPROX_NONE, /* division */
821 AARCH64_APPROX_NONE, /* sqrt */
822 AARCH64_APPROX_NONE /* recip_sqrt */
823 };
824
825 /* Approximation modes for Exynos M1. */
826 static const cpu_approx_modes exynosm1_approx_modes =
827 {
828 AARCH64_APPROX_NONE, /* division */
829 AARCH64_APPROX_ALL, /* sqrt */
830 AARCH64_APPROX_ALL /* recip_sqrt */
831 };
832
833 /* Approximation modes for X-Gene 1. */
834 static const cpu_approx_modes xgene1_approx_modes =
835 {
836 AARCH64_APPROX_NONE, /* division */
837 AARCH64_APPROX_NONE, /* sqrt */
838 AARCH64_APPROX_ALL /* recip_sqrt */
839 };
840
841 /* Generic prefetch settings (which disable prefetch). */
842 static const cpu_prefetch_tune generic_prefetch_tune =
843 {
844 0, /* num_slots */
845 -1, /* l1_cache_size */
846 -1, /* l1_cache_line_size */
847 -1, /* l2_cache_size */
848 true, /* prefetch_dynamic_strides */
849 -1, /* minimum_stride */
850 -1 /* default_opt_level */
851 };
852
853 static const cpu_prefetch_tune exynosm1_prefetch_tune =
854 {
855 0, /* num_slots */
856 -1, /* l1_cache_size */
857 64, /* l1_cache_line_size */
858 -1, /* l2_cache_size */
859 true, /* prefetch_dynamic_strides */
860 -1, /* minimum_stride */
861 -1 /* default_opt_level */
862 };
863
864 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
865 {
866 4, /* num_slots */
867 32, /* l1_cache_size */
868 64, /* l1_cache_line_size */
869 512, /* l2_cache_size */
870 false, /* prefetch_dynamic_strides */
871 2048, /* minimum_stride */
872 3 /* default_opt_level */
873 };
874
875 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
876 {
877 8, /* num_slots */
878 32, /* l1_cache_size */
879 128, /* l1_cache_line_size */
880 16*1024, /* l2_cache_size */
881 true, /* prefetch_dynamic_strides */
882 -1, /* minimum_stride */
883 3 /* default_opt_level */
884 };
885
886 static const cpu_prefetch_tune thunderx_prefetch_tune =
887 {
888 8, /* num_slots */
889 32, /* l1_cache_size */
890 128, /* l1_cache_line_size */
891 -1, /* l2_cache_size */
892 true, /* prefetch_dynamic_strides */
893 -1, /* minimum_stride */
894 -1 /* default_opt_level */
895 };
896
897 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
898 {
899 8, /* num_slots */
900 32, /* l1_cache_size */
901 64, /* l1_cache_line_size */
902 256, /* l2_cache_size */
903 true, /* prefetch_dynamic_strides */
904 -1, /* minimum_stride */
905 -1 /* default_opt_level */
906 };
907
908 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
909 {
910 8, /* num_slots */
911 32, /* l1_cache_size */
912 64, /* l1_cache_line_size */
913 256, /* l2_cache_size */
914 true, /* prefetch_dynamic_strides */
915 -1, /* minimum_stride */
916 -1 /* default_opt_level */
917 };
918
919 static const cpu_prefetch_tune tsv110_prefetch_tune =
920 {
921 0, /* num_slots */
922 64, /* l1_cache_size */
923 64, /* l1_cache_line_size */
924 512, /* l2_cache_size */
925 true, /* prefetch_dynamic_strides */
926 -1, /* minimum_stride */
927 -1 /* default_opt_level */
928 };
929
930 static const cpu_prefetch_tune xgene1_prefetch_tune =
931 {
932 8, /* num_slots */
933 32, /* l1_cache_size */
934 64, /* l1_cache_line_size */
935 256, /* l2_cache_size */
936 true, /* prefetch_dynamic_strides */
937 -1, /* minimum_stride */
938 -1 /* default_opt_level */
939 };
940
941 static const cpu_prefetch_tune a64fx_prefetch_tune =
942 {
943 8, /* num_slots */
944 64, /* l1_cache_size */
945 256, /* l1_cache_line_size */
946 32768, /* l2_cache_size */
947 true, /* prefetch_dynamic_strides */
948 -1, /* minimum_stride */
949 -1 /* default_opt_level */
950 };
951
952 static const struct tune_params generic_tunings =
953 {
954 &cortexa57_extra_costs,
955 &generic_addrcost_table,
956 &generic_regmove_cost,
957 &generic_vector_cost,
958 &generic_branch_cost,
959 &generic_approx_modes,
960 SVE_NOT_IMPLEMENTED, /* sve_width */
961 4, /* memmov_cost */
962 2, /* issue_rate */
963 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
964 "16:12", /* function_align. */
965 "4", /* jump_align. */
966 "8", /* loop_align. */
967 2, /* int_reassoc_width. */
968 4, /* fp_reassoc_width. */
969 1, /* vec_reassoc_width. */
970 2, /* min_div_recip_mul_sf. */
971 2, /* min_div_recip_mul_df. */
972 0, /* max_case_values. */
973 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
974 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
975 &generic_prefetch_tune
976 };
977
978 static const struct tune_params cortexa35_tunings =
979 {
980 &cortexa53_extra_costs,
981 &generic_addrcost_table,
982 &cortexa53_regmove_cost,
983 &generic_vector_cost,
984 &generic_branch_cost,
985 &generic_approx_modes,
986 SVE_NOT_IMPLEMENTED, /* sve_width */
987 4, /* memmov_cost */
988 1, /* issue_rate */
989 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
990 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
991 "16", /* function_align. */
992 "4", /* jump_align. */
993 "8", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 0, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1002 &generic_prefetch_tune
1003 };
1004
1005 static const struct tune_params cortexa53_tunings =
1006 {
1007 &cortexa53_extra_costs,
1008 &generic_addrcost_table,
1009 &cortexa53_regmove_cost,
1010 &generic_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 2, /* issue_rate */
1016 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1018 "16", /* function_align. */
1019 "4", /* jump_align. */
1020 "8", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1029 &generic_prefetch_tune
1030 };
1031
1032 static const struct tune_params cortexa57_tunings =
1033 {
1034 &cortexa57_extra_costs,
1035 &generic_addrcost_table,
1036 &cortexa57_regmove_cost,
1037 &cortexa57_vector_cost,
1038 &generic_branch_cost,
1039 &generic_approx_modes,
1040 SVE_NOT_IMPLEMENTED, /* sve_width */
1041 4, /* memmov_cost */
1042 3, /* issue_rate */
1043 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1045 "16", /* function_align. */
1046 "4", /* jump_align. */
1047 "8", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1056 &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params cortexa72_tunings =
1060 {
1061 &cortexa57_extra_costs,
1062 &generic_addrcost_table,
1063 &cortexa57_regmove_cost,
1064 &cortexa57_vector_cost,
1065 &generic_branch_cost,
1066 &generic_approx_modes,
1067 SVE_NOT_IMPLEMENTED, /* sve_width */
1068 4, /* memmov_cost */
1069 3, /* issue_rate */
1070 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1071 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1072 "16", /* function_align. */
1073 "4", /* jump_align. */
1074 "8", /* loop_align. */
1075 2, /* int_reassoc_width. */
1076 4, /* fp_reassoc_width. */
1077 1, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1083 &generic_prefetch_tune
1084 };
1085
1086 static const struct tune_params cortexa73_tunings =
1087 {
1088 &cortexa57_extra_costs,
1089 &generic_addrcost_table,
1090 &cortexa57_regmove_cost,
1091 &cortexa57_vector_cost,
1092 &generic_branch_cost,
1093 &generic_approx_modes,
1094 SVE_NOT_IMPLEMENTED, /* sve_width */
1095 4, /* memmov_cost. */
1096 2, /* issue_rate. */
1097 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1098 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1099 "16", /* function_align. */
1100 "4", /* jump_align. */
1101 "8", /* loop_align. */
1102 2, /* int_reassoc_width. */
1103 4, /* fp_reassoc_width. */
1104 1, /* vec_reassoc_width. */
1105 2, /* min_div_recip_mul_sf. */
1106 2, /* min_div_recip_mul_df. */
1107 0, /* max_case_values. */
1108 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1109 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1110 &generic_prefetch_tune
1111 };
1112
1113
1114
1115 static const struct tune_params exynosm1_tunings =
1116 {
1117 &exynosm1_extra_costs,
1118 &exynosm1_addrcost_table,
1119 &exynosm1_regmove_cost,
1120 &exynosm1_vector_cost,
1121 &generic_branch_cost,
1122 &exynosm1_approx_modes,
1123 SVE_NOT_IMPLEMENTED, /* sve_width */
1124 4, /* memmov_cost */
1125 3, /* issue_rate */
1126 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1127 "4", /* function_align. */
1128 "4", /* jump_align. */
1129 "4", /* loop_align. */
1130 2, /* int_reassoc_width. */
1131 4, /* fp_reassoc_width. */
1132 1, /* vec_reassoc_width. */
1133 2, /* min_div_recip_mul_sf. */
1134 2, /* min_div_recip_mul_df. */
1135 48, /* max_case_values. */
1136 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1137 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1138 &exynosm1_prefetch_tune
1139 };
1140
1141 static const struct tune_params thunderxt88_tunings =
1142 {
1143 &thunderx_extra_costs,
1144 &generic_addrcost_table,
1145 &thunderx_regmove_cost,
1146 &thunderx_vector_cost,
1147 &generic_branch_cost,
1148 &generic_approx_modes,
1149 SVE_NOT_IMPLEMENTED, /* sve_width */
1150 6, /* memmov_cost */
1151 2, /* issue_rate */
1152 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1153 "8", /* function_align. */
1154 "8", /* jump_align. */
1155 "8", /* loop_align. */
1156 2, /* int_reassoc_width. */
1157 4, /* fp_reassoc_width. */
1158 1, /* vec_reassoc_width. */
1159 2, /* min_div_recip_mul_sf. */
1160 2, /* min_div_recip_mul_df. */
1161 0, /* max_case_values. */
1162 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1163 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1164 &thunderxt88_prefetch_tune
1165 };
1166
1167 static const struct tune_params thunderx_tunings =
1168 {
1169 &thunderx_extra_costs,
1170 &generic_addrcost_table,
1171 &thunderx_regmove_cost,
1172 &thunderx_vector_cost,
1173 &generic_branch_cost,
1174 &generic_approx_modes,
1175 SVE_NOT_IMPLEMENTED, /* sve_width */
1176 6, /* memmov_cost */
1177 2, /* issue_rate */
1178 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1179 "8", /* function_align. */
1180 "8", /* jump_align. */
1181 "8", /* loop_align. */
1182 2, /* int_reassoc_width. */
1183 4, /* fp_reassoc_width. */
1184 1, /* vec_reassoc_width. */
1185 2, /* min_div_recip_mul_sf. */
1186 2, /* min_div_recip_mul_df. */
1187 0, /* max_case_values. */
1188 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1189 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1190 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1191 &thunderx_prefetch_tune
1192 };
1193
1194 static const struct tune_params tsv110_tunings =
1195 {
1196 &tsv110_extra_costs,
1197 &tsv110_addrcost_table,
1198 &tsv110_regmove_cost,
1199 &tsv110_vector_cost,
1200 &generic_branch_cost,
1201 &generic_approx_modes,
1202 SVE_NOT_IMPLEMENTED, /* sve_width */
1203 4, /* memmov_cost */
1204 4, /* issue_rate */
1205 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1206 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1207 "16", /* function_align. */
1208 "4", /* jump_align. */
1209 "8", /* loop_align. */
1210 2, /* int_reassoc_width. */
1211 4, /* fp_reassoc_width. */
1212 1, /* vec_reassoc_width. */
1213 2, /* min_div_recip_mul_sf. */
1214 2, /* min_div_recip_mul_df. */
1215 0, /* max_case_values. */
1216 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1217 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1218 &tsv110_prefetch_tune
1219 };
1220
1221 static const struct tune_params xgene1_tunings =
1222 {
1223 &xgene1_extra_costs,
1224 &xgene1_addrcost_table,
1225 &xgene1_regmove_cost,
1226 &xgene1_vector_cost,
1227 &generic_branch_cost,
1228 &xgene1_approx_modes,
1229 SVE_NOT_IMPLEMENTED, /* sve_width */
1230 6, /* memmov_cost */
1231 4, /* issue_rate */
1232 AARCH64_FUSE_NOTHING, /* fusible_ops */
1233 "16", /* function_align. */
1234 "16", /* jump_align. */
1235 "16", /* loop_align. */
1236 2, /* int_reassoc_width. */
1237 4, /* fp_reassoc_width. */
1238 1, /* vec_reassoc_width. */
1239 2, /* min_div_recip_mul_sf. */
1240 2, /* min_div_recip_mul_df. */
1241 17, /* max_case_values. */
1242 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1243 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1244 &xgene1_prefetch_tune
1245 };
1246
1247 static const struct tune_params emag_tunings =
1248 {
1249 &xgene1_extra_costs,
1250 &xgene1_addrcost_table,
1251 &xgene1_regmove_cost,
1252 &xgene1_vector_cost,
1253 &generic_branch_cost,
1254 &xgene1_approx_modes,
1255 SVE_NOT_IMPLEMENTED,
1256 6, /* memmov_cost */
1257 4, /* issue_rate */
1258 AARCH64_FUSE_NOTHING, /* fusible_ops */
1259 "16", /* function_align. */
1260 "16", /* jump_align. */
1261 "16", /* loop_align. */
1262 2, /* int_reassoc_width. */
1263 4, /* fp_reassoc_width. */
1264 1, /* vec_reassoc_width. */
1265 2, /* min_div_recip_mul_sf. */
1266 2, /* min_div_recip_mul_df. */
1267 17, /* max_case_values. */
1268 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1269 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1270 &xgene1_prefetch_tune
1271 };
1272
1273 static const struct tune_params qdf24xx_tunings =
1274 {
1275 &qdf24xx_extra_costs,
1276 &qdf24xx_addrcost_table,
1277 &qdf24xx_regmove_cost,
1278 &qdf24xx_vector_cost,
1279 &generic_branch_cost,
1280 &generic_approx_modes,
1281 SVE_NOT_IMPLEMENTED, /* sve_width */
1282 4, /* memmov_cost */
1283 4, /* issue_rate */
1284 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1285 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1286 "16", /* function_align. */
1287 "8", /* jump_align. */
1288 "16", /* loop_align. */
1289 2, /* int_reassoc_width. */
1290 4, /* fp_reassoc_width. */
1291 1, /* vec_reassoc_width. */
1292 2, /* min_div_recip_mul_sf. */
1293 2, /* min_div_recip_mul_df. */
1294 0, /* max_case_values. */
1295 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1296 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1297 &qdf24xx_prefetch_tune
1298 };
1299
1300 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1301 for now. */
1302 static const struct tune_params saphira_tunings =
1303 {
1304 &generic_extra_costs,
1305 &generic_addrcost_table,
1306 &generic_regmove_cost,
1307 &generic_vector_cost,
1308 &generic_branch_cost,
1309 &generic_approx_modes,
1310 SVE_NOT_IMPLEMENTED, /* sve_width */
1311 4, /* memmov_cost */
1312 4, /* issue_rate */
1313 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1314 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1315 "16", /* function_align. */
1316 "8", /* jump_align. */
1317 "16", /* loop_align. */
1318 2, /* int_reassoc_width. */
1319 4, /* fp_reassoc_width. */
1320 1, /* vec_reassoc_width. */
1321 2, /* min_div_recip_mul_sf. */
1322 2, /* min_div_recip_mul_df. */
1323 0, /* max_case_values. */
1324 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1325 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1326 &generic_prefetch_tune
1327 };
1328
1329 static const struct tune_params thunderx2t99_tunings =
1330 {
1331 &thunderx2t99_extra_costs,
1332 &thunderx2t99_addrcost_table,
1333 &thunderx2t99_regmove_cost,
1334 &thunderx2t99_vector_cost,
1335 &generic_branch_cost,
1336 &generic_approx_modes,
1337 SVE_NOT_IMPLEMENTED, /* sve_width */
1338 4, /* memmov_cost. */
1339 4, /* issue_rate. */
1340 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1341 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1342 "16", /* function_align. */
1343 "8", /* jump_align. */
1344 "16", /* loop_align. */
1345 3, /* int_reassoc_width. */
1346 2, /* fp_reassoc_width. */
1347 2, /* vec_reassoc_width. */
1348 2, /* min_div_recip_mul_sf. */
1349 2, /* min_div_recip_mul_df. */
1350 0, /* max_case_values. */
1351 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1352 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1353 &thunderx2t99_prefetch_tune
1354 };
1355
1356 static const struct tune_params thunderx3t110_tunings =
1357 {
1358 &thunderx3t110_extra_costs,
1359 &thunderx3t110_addrcost_table,
1360 &thunderx3t110_regmove_cost,
1361 &thunderx3t110_vector_cost,
1362 &generic_branch_cost,
1363 &generic_approx_modes,
1364 SVE_NOT_IMPLEMENTED, /* sve_width */
1365 4, /* memmov_cost. */
1366 6, /* issue_rate. */
1367 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1368 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1369 "16", /* function_align. */
1370 "8", /* jump_align. */
1371 "16", /* loop_align. */
1372 3, /* int_reassoc_width. */
1373 2, /* fp_reassoc_width. */
1374 2, /* vec_reassoc_width. */
1375 2, /* min_div_recip_mul_sf. */
1376 2, /* min_div_recip_mul_df. */
1377 0, /* max_case_values. */
1378 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1379 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1380 &thunderx3t110_prefetch_tune
1381 };
1382
1383 static const struct tune_params neoversen1_tunings =
1384 {
1385 &cortexa76_extra_costs,
1386 &generic_addrcost_table,
1387 &generic_regmove_cost,
1388 &cortexa57_vector_cost,
1389 &generic_branch_cost,
1390 &generic_approx_modes,
1391 SVE_NOT_IMPLEMENTED, /* sve_width */
1392 4, /* memmov_cost */
1393 3, /* issue_rate */
1394 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1395 "32:16", /* function_align. */
1396 "4", /* jump_align. */
1397 "32:16", /* loop_align. */
1398 2, /* int_reassoc_width. */
1399 4, /* fp_reassoc_width. */
1400 2, /* vec_reassoc_width. */
1401 2, /* min_div_recip_mul_sf. */
1402 2, /* min_div_recip_mul_df. */
1403 0, /* max_case_values. */
1404 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1405 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1406 &generic_prefetch_tune
1407 };
1408
1409 static const struct tune_params neoversev1_tunings =
1410 {
1411 &cortexa76_extra_costs,
1412 &generic_addrcost_table,
1413 &generic_regmove_cost,
1414 &cortexa57_vector_cost,
1415 &generic_branch_cost,
1416 &generic_approx_modes,
1417 SVE_256, /* sve_width */
1418 4, /* memmov_cost */
1419 3, /* issue_rate */
1420 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1421 "32:16", /* function_align. */
1422 "4", /* jump_align. */
1423 "32:16", /* loop_align. */
1424 2, /* int_reassoc_width. */
1425 4, /* fp_reassoc_width. */
1426 2, /* vec_reassoc_width. */
1427 2, /* min_div_recip_mul_sf. */
1428 2, /* min_div_recip_mul_df. */
1429 0, /* max_case_values. */
1430 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1431 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1432 &generic_prefetch_tune
1433 };
1434
1435 static const struct tune_params neoversen2_tunings =
1436 {
1437 &cortexa76_extra_costs,
1438 &generic_addrcost_table,
1439 &generic_regmove_cost,
1440 &cortexa57_vector_cost,
1441 &generic_branch_cost,
1442 &generic_approx_modes,
1443 SVE_128, /* sve_width */
1444 4, /* memmov_cost */
1445 3, /* issue_rate */
1446 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1447 "32:16", /* function_align. */
1448 "4", /* jump_align. */
1449 "32:16", /* loop_align. */
1450 2, /* int_reassoc_width. */
1451 4, /* fp_reassoc_width. */
1452 2, /* vec_reassoc_width. */
1453 2, /* min_div_recip_mul_sf. */
1454 2, /* min_div_recip_mul_df. */
1455 0, /* max_case_values. */
1456 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1457 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1458 &generic_prefetch_tune
1459 };
1460
1461 static const struct tune_params a64fx_tunings =
1462 {
1463 &generic_extra_costs,
1464 &generic_addrcost_table,
1465 &generic_regmove_cost,
1466 &generic_vector_cost,
1467 &generic_branch_cost,
1468 &generic_approx_modes,
1469 SVE_512, /* sve_width */
1470 4, /* memmov_cost */
1471 7, /* issue_rate */
1472 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1473 "32", /* function_align. */
1474 "16", /* jump_align. */
1475 "32", /* loop_align. */
1476 4, /* int_reassoc_width. */
1477 2, /* fp_reassoc_width. */
1478 2, /* vec_reassoc_width. */
1479 2, /* min_div_recip_mul_sf. */
1480 2, /* min_div_recip_mul_df. */
1481 0, /* max_case_values. */
1482 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1483 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1484 &a64fx_prefetch_tune
1485 };
1486
1487 /* Support for fine-grained override of the tuning structures. */
1488 struct aarch64_tuning_override_function
1489 {
1490 const char* name;
1491 void (*parse_override)(const char*, struct tune_params*);
1492 };
1493
1494 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1495 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1496 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1497
1498 static const struct aarch64_tuning_override_function
1499 aarch64_tuning_override_functions[] =
1500 {
1501 { "fuse", aarch64_parse_fuse_string },
1502 { "tune", aarch64_parse_tune_string },
1503 { "sve_width", aarch64_parse_sve_width_string },
1504 { NULL, NULL }
1505 };
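/* A usage sketch for the table above: -moverride=sve_width=512 matches
   the "sve_width" entry, so aarch64_parse_sve_width_string is called
   with "512" and updates the active tune_params; a name that matches no
   entry leaves the tuning structure untouched.  */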
1506
1507 /* A processor implementing AArch64. */
1508 struct processor
1509 {
1510 const char *const name;
1511 enum aarch64_processor ident;
1512 enum aarch64_processor sched_core;
1513 enum aarch64_arch arch;
1514 unsigned architecture_version;
1515 const uint64_t flags;
1516 const struct tune_params *const tune;
1517 };
1518
1519 /* Architectures implementing AArch64. */
1520 static const struct processor all_architectures[] =
1521 {
1522 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1523 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1524 #include "aarch64-arches.def"
1525 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1526 };
1527
1528 /* Processor cores implementing AArch64. */
1529 static const struct processor all_cores[] =
1530 {
1531 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1532 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1533 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1534 FLAGS, &COSTS##_tunings},
1535 #include "aarch64-cores.def"
1536 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1537 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1538 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1539 };
1540
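/* As an example of the macro expansion above (sketched from a typical
   aarch64-cores.def entry; the trailing implementer/part/variant fields
   are omitted here):

     AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A,
		  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, ...)

   produces an all_cores element whose arch is AARCH64_ARCH_8A and whose
   tune field points at cortexa57_tunings defined earlier in this file.  */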
1541
1542 /* Target specification. These are populated by the -march, -mtune, -mcpu
1543 handling code or by target attributes. */
1544 static const struct processor *selected_arch;
1545 static const struct processor *selected_cpu;
1546 static const struct processor *selected_tune;
1547
1548 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1549
1550 /* The current tuning set. */
1551 struct tune_params aarch64_tune_params = generic_tunings;
1552
1553 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1554
1555 static tree
1556 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1557 int, bool *no_add_attrs)
1558 {
1559 /* Since we set fn_type_req to true, the caller should have checked
1560 this for us. */
1561 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1562 switch ((arm_pcs) fntype_abi (*node).id ())
1563 {
1564 case ARM_PCS_AAPCS64:
1565 case ARM_PCS_SIMD:
1566 return NULL_TREE;
1567
1568 case ARM_PCS_SVE:
1569 error ("the %qE attribute cannot be applied to an SVE function type",
1570 name);
1571 *no_add_attrs = true;
1572 return NULL_TREE;
1573
1574 case ARM_PCS_TLSDESC:
1575 case ARM_PCS_UNKNOWN:
1576 break;
1577 }
1578 gcc_unreachable ();
1579 }
1580
1581 /* Table of machine attributes. */
1582 static const struct attribute_spec aarch64_attribute_table[] =
1583 {
1584 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1585 affects_type_identity, handler, exclude } */
1586 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1587 handle_aarch64_vector_pcs_attribute, NULL },
1588 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1589 aarch64_sve::handle_arm_sve_vector_bits_attribute,
1590 NULL },
1591 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
1592 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1593 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
1594 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1595 };
1596
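/* For example (an illustrative declaration, not taken from this file),
   the handler above accepts

     void f (float *x, float *y) __attribute__ ((aarch64_vector_pcs));

   because the function type's ABI classifies as ARM_PCS_AAPCS64 or
   ARM_PCS_SIMD, whereas a function type with an SVE ABI is diagnosed
   and the attribute is dropped.  */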
1597 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1598
1599 /* An ISA extension in the co-processor and main instruction set space. */
1600 struct aarch64_option_extension
1601 {
1602 const char *const name;
1603 const unsigned long flags_on;
1604 const unsigned long flags_off;
1605 };
1606
1607 typedef enum aarch64_cond_code
1608 {
1609 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1610 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1611 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1612 }
1613 aarch64_cc;
1614
1615 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1616
1617 struct aarch64_branch_protect_type
1618 {
1619 /* The type's name that the user passes to the branch-protection option
1620 string. */
1621 const char* name;
1622 /* Function to handle the protection type and set global variables.
1623 First argument is the string token corresponding to this type and the
1624 second argument is the next token in the option string.
1625 Return values:
1626 * AARCH64_PARSE_OK: Handling was successful.
1627 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1628 caller should print an error.
1629 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1630 prints its own error. */
1631 enum aarch64_parse_opt_result (*handler)(char*, char*);
1632 /* A list of types that can follow this type in the option string. */
1633 const aarch64_branch_protect_type* subtypes;
1634 unsigned int num_subtypes;
1635 };
1636
1637 static enum aarch64_parse_opt_result
1638 aarch64_handle_no_branch_protection (char* str, char* rest)
1639 {
1640 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1641 aarch64_enable_bti = 0;
1642 if (rest)
1643 {
1644 error ("unexpected %<%s%> after %<%s%>", rest, str);
1645 return AARCH64_PARSE_INVALID_FEATURE;
1646 }
1647 return AARCH64_PARSE_OK;
1648 }
1649
1650 static enum aarch64_parse_opt_result
1651 aarch64_handle_standard_branch_protection (char* str, char* rest)
1652 {
1653 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1654 aarch64_ra_sign_key = AARCH64_KEY_A;
1655 aarch64_enable_bti = 1;
1656 if (rest)
1657 {
1658 error ("unexpected %<%s%> after %<%s%>", rest, str);
1659 return AARCH64_PARSE_INVALID_FEATURE;
1660 }
1661 return AARCH64_PARSE_OK;
1662 }
1663
1664 static enum aarch64_parse_opt_result
1665 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1666 char* rest ATTRIBUTE_UNUSED)
1667 {
1668 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1669 aarch64_ra_sign_key = AARCH64_KEY_A;
1670 return AARCH64_PARSE_OK;
1671 }
1672
1673 static enum aarch64_parse_opt_result
1674 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1675 char* rest ATTRIBUTE_UNUSED)
1676 {
1677 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1678 return AARCH64_PARSE_OK;
1679 }
1680
1681 static enum aarch64_parse_opt_result
1682 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1683 char* rest ATTRIBUTE_UNUSED)
1684 {
1685 aarch64_ra_sign_key = AARCH64_KEY_B;
1686 return AARCH64_PARSE_OK;
1687 }
1688
1689 static enum aarch64_parse_opt_result
1690 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1691 char* rest ATTRIBUTE_UNUSED)
1692 {
1693 aarch64_enable_bti = 1;
1694 return AARCH64_PARSE_OK;
1695 }
1696
1697 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1698 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1699 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1700 { NULL, NULL, NULL, 0 }
1701 };
1702
1703 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1704 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1705 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1706 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1707 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1708 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1709 { NULL, NULL, NULL, 0 }
1710 };
1711
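/* An illustrative walk through the tables above:
   -mbranch-protection=pac-ret+leaf first matches "pac-ret", whose
   handler sets aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF, and
   the "leaf" subtype then widens the scope to AARCH64_FUNCTION_ALL;
   "standard" enables both pac-ret (A key) and BTI in one go.  */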
1712 /* The condition codes of the processor, and the inverse function. */
1713 static const char * const aarch64_condition_codes[] =
1714 {
1715 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1716 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1717 };
1718
1719 /* The preferred condition codes for SVE conditions. */
1720 static const char *const aarch64_sve_condition_codes[] =
1721 {
1722 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1723 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1724 };
1725
1726 /* Return the assembly token for svpattern value VALUE. */
1727
1728 static const char *
1729 svpattern_token (enum aarch64_svpattern pattern)
1730 {
1731 switch (pattern)
1732 {
1733 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1734 AARCH64_FOR_SVPATTERN (CASE)
1735 #undef CASE
1736 case AARCH64_NUM_SVPATTERNS:
1737 break;
1738 }
1739 gcc_unreachable ();
1740 }
1741
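/* For instance, svpattern_token (AARCH64_SV_ALL) returns "all" and
   svpattern_token (AARCH64_SV_VL4) returns "vl4", matching the
   assembler spellings of the SVE predicate patterns.  */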
1742 /* Return the location of a piece that is known to be passed or returned
1743 in registers. FIRST_ZR is the first unused vector argument register
1744 and FIRST_PR is the first unused predicate argument register. */
1745
1746 rtx
1747 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1748 unsigned int first_pr) const
1749 {
1750 gcc_assert (VECTOR_MODE_P (mode)
1751 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1752 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1753
1754 if (num_zr > 0 && num_pr == 0)
1755 return gen_rtx_REG (mode, first_zr);
1756
1757 if (num_zr == 0 && num_pr == 1)
1758 return gen_rtx_REG (mode, first_pr);
1759
1760 gcc_unreachable ();
1761 }
1762
1763 /* Return the total number of vector registers required by the PST. */
1764
1765 unsigned int
1766 pure_scalable_type_info::num_zr () const
1767 {
1768 unsigned int res = 0;
1769 for (unsigned int i = 0; i < pieces.length (); ++i)
1770 res += pieces[i].num_zr;
1771 return res;
1772 }
1773
1774 /* Return the total number of predicate registers required by the PST. */
1775
1776 unsigned int
1777 pure_scalable_type_info::num_pr () const
1778 {
1779 unsigned int res = 0;
1780 for (unsigned int i = 0; i < pieces.length (); ++i)
1781 res += pieces[i].num_pr;
1782 return res;
1783 }
1784
1785 /* Return the location of a PST that is known to be passed or returned
1786 in registers. FIRST_ZR is the first unused vector argument register
1787 and FIRST_PR is the first unused predicate argument register. */
1788
1789 rtx
1790 pure_scalable_type_info::get_rtx (machine_mode mode,
1791 unsigned int first_zr,
1792 unsigned int first_pr) const
1793 {
1794 /* Try to return a single REG if possible. This leads to better
1795 code generation; it isn't required for correctness. */
1796 if (mode == pieces[0].mode)
1797 {
1798 gcc_assert (pieces.length () == 1);
1799 return pieces[0].get_rtx (first_zr, first_pr);
1800 }
1801
1802 /* Build up a PARALLEL that contains the individual pieces. */
1803 rtvec rtxes = rtvec_alloc (pieces.length ());
1804 for (unsigned int i = 0; i < pieces.length (); ++i)
1805 {
1806 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1807 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1808 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1809 first_zr += pieces[i].num_zr;
1810 first_pr += pieces[i].num_pr;
1811 }
1812 return gen_rtx_PARALLEL (mode, rtxes);
1813 }
1814
1815 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1816 in the AAPCS64. */
1817
1818 pure_scalable_type_info::analysis_result
1819 pure_scalable_type_info::analyze (const_tree type)
1820 {
1821 /* Prevent accidental reuse. */
1822 gcc_assert (pieces.is_empty ());
1823
1824 /* No code will be generated for erroneous types, so we won't establish
1825 an ABI mapping. */
1826 if (type == error_mark_node)
1827 return NO_ABI_IDENTITY;
1828
1829 /* Zero-sized types disappear in the language->ABI mapping. */
1830 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1831 return NO_ABI_IDENTITY;
1832
1833 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1834 piece p = {};
1835 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1836 {
1837 machine_mode mode = TYPE_MODE_RAW (type);
1838 gcc_assert (VECTOR_MODE_P (mode)
1839 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1840
1841 p.mode = p.orig_mode = mode;
1842 add_piece (p);
1843 return IS_PST;
1844 }
1845
1846 /* Check for user-defined PSTs. */
1847 if (TREE_CODE (type) == ARRAY_TYPE)
1848 return analyze_array (type);
1849 if (TREE_CODE (type) == RECORD_TYPE)
1850 return analyze_record (type);
1851
1852 return ISNT_PST;
1853 }
1854
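/* A rough summary of the analysis above for common cases, using the ACLE
   type names purely as illustrations:

     svfloat32_t (an SVT)		 -> IS_PST, 1 vector register
     svbool_t (an SPT)			 -> IS_PST, 1 predicate register
     svfloat32x3_t (a built-in tuple)	 -> IS_PST, 3 vector registers
     an aggregate whose members all have
       such types			 -> IS_PST
     an aggregate that also contains,
       say, an int64_t member		 -> ISNT_PST
     a zero-sized type			 -> NO_ABI_IDENTITY

   The authoritative rules are the AAPCS64 ones; this table is only a
   sketch.  */
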
1855 /* Analyze a type that is known not to be passed or returned in memory.
1856 Return true if it has an ABI identity and is a Pure Scalable Type. */
1857
1858 bool
1859 pure_scalable_type_info::analyze_registers (const_tree type)
1860 {
1861 analysis_result result = analyze (type);
1862 gcc_assert (result != DOESNT_MATTER);
1863 return result == IS_PST;
1864 }
1865
1866 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1867
1868 pure_scalable_type_info::analysis_result
1869 pure_scalable_type_info::analyze_array (const_tree type)
1870 {
1871 /* Analyze the element type. */
1872 pure_scalable_type_info element_info;
1873 analysis_result result = element_info.analyze (TREE_TYPE (type));
1874 if (result != IS_PST)
1875 return result;
1876
1877 /* An array of unknown, flexible or variable length will be passed and
1878 returned by reference whatever we do. */
1879 tree nelts_minus_one = array_type_nelts (type);
1880 if (!tree_fits_uhwi_p (nelts_minus_one))
1881 return DOESNT_MATTER;
1882
1883 /* Likewise if the array is constant-sized but too big to be interesting.
1884 The double checks against MAX_PIECES are to protect against overflow. */
1885 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1886 if (count > MAX_PIECES)
1887 return DOESNT_MATTER;
1888 count += 1;
1889 if (count * element_info.pieces.length () > MAX_PIECES)
1890 return DOESNT_MATTER;
1891
1892 /* The above checks should have weeded out elements of unknown size. */
1893 poly_uint64 element_bytes;
1894 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1895 gcc_unreachable ();
1896
1897 /* Build up the list of individual vectors and predicates. */
1898 gcc_assert (!element_info.pieces.is_empty ());
1899 for (unsigned int i = 0; i < count; ++i)
1900 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1901 {
1902 piece p = element_info.pieces[j];
1903 p.offset += i * element_bytes;
1904 add_piece (p);
1905 }
1906 return IS_PST;
1907 }
1908
1909 /* Subroutine of analyze for handling RECORD_TYPEs. */
1910
1911 pure_scalable_type_info::analysis_result
1912 pure_scalable_type_info::analyze_record (const_tree type)
1913 {
1914 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1915 {
1916 if (TREE_CODE (field) != FIELD_DECL)
1917 continue;
1918
1919 /* Zero-sized fields disappear in the language->ABI mapping. */
1920 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1921 continue;
1922
1923 /* All fields with an ABI identity must be PSTs for the record as
1924 a whole to be a PST. If any individual field is too big to be
1925 interesting then the record is too. */
1926 pure_scalable_type_info field_info;
1927 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1928 if (subresult == NO_ABI_IDENTITY)
1929 continue;
1930 if (subresult != IS_PST)
1931 return subresult;
1932
1933 /* Since all previous fields are PSTs, we ought to be able to track
1934 the field offset using poly_ints. */
1935 tree bitpos = bit_position (field);
1936 gcc_assert (poly_int_tree_p (bitpos));
1937
1938 /* For the same reason, it shouldn't be possible to create a PST field
1939 whose offset isn't byte-aligned. */
1940 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1941 BITS_PER_UNIT);
1942
1943 /* Punt if the record is too big to be interesting. */
1944 poly_uint64 bytepos;
1945 if (!wide_bytepos.to_uhwi (&bytepos)
1946 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1947 return DOESNT_MATTER;
1948
1949 /* Add the individual vectors and predicates in the field to the
1950 record's list. */
1951 gcc_assert (!field_info.pieces.is_empty ());
1952 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1953 {
1954 piece p = field_info.pieces[i];
1955 p.offset += bytepos;
1956 add_piece (p);
1957 }
1958 }
1959 /* Empty structures disappear in the language->ABI mapping. */
1960 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1961 }
1962
1963 /* Add P to the list of pieces in the type. */
1964
1965 void
1966 pure_scalable_type_info::add_piece (const piece &p)
1967 {
1968 /* Try to fold the new piece into the previous one to form a
1969 single-mode PST. For example, if we see three consecutive vectors
1970 of the same mode, we can represent them using the corresponding
1971 3-tuple mode.
1972
1973 This is purely an optimization. */
1974 if (!pieces.is_empty ())
1975 {
1976 piece &prev = pieces.last ();
1977 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1978 unsigned int nelems1, nelems2;
1979 if (prev.orig_mode == p.orig_mode
1980 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1981 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1982 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1983 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1984 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1985 && targetm.array_mode (p.orig_mode,
1986 nelems1 + nelems2).exists (&prev.mode))
1987 {
1988 prev.num_zr += p.num_zr;
1989 prev.num_pr += p.num_pr;
1990 return;
1991 }
1992 }
1993 pieces.quick_push (p);
1994 }
1995
1996 /* Return true if at least one possible value of type TYPE includes at
1997 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1998
1999 This is a relatively expensive test for some types, so it should
2000 generally be made as late as possible. */
2001
2002 static bool
2003 aarch64_some_values_include_pst_objects_p (const_tree type)
2004 {
2005 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2006 return false;
2007
2008 if (aarch64_sve::builtin_type_p (type))
2009 return true;
2010
2011 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2012 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2013
2014 if (RECORD_OR_UNION_TYPE_P (type))
2015 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2016 if (TREE_CODE (field) == FIELD_DECL
2017 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2018 return true;
2019
2020 return false;
2021 }
2022
2023 /* Return the descriptor of the SIMD ABI. */
2024
2025 static const predefined_function_abi &
2026 aarch64_simd_abi (void)
2027 {
2028 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2029 if (!simd_abi.initialized_p ())
2030 {
2031 HARD_REG_SET full_reg_clobbers
2032 = default_function_abi.full_reg_clobbers ();
2033 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2034 if (FP_SIMD_SAVED_REGNUM_P (regno))
2035 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2036 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2037 }
2038 return simd_abi;
2039 }
2040
2041 /* Return the descriptor of the SVE PCS. */
2042
2043 static const predefined_function_abi &
2044 aarch64_sve_abi (void)
2045 {
2046 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2047 if (!sve_abi.initialized_p ())
2048 {
2049 HARD_REG_SET full_reg_clobbers
2050 = default_function_abi.full_reg_clobbers ();
2051 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2052 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2053 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
2054 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2055 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2056 }
2057 return sve_abi;
2058 }
2059
2060 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2061 wraps, otherwise return X itself. */
2062
2063 static rtx
2064 strip_salt (rtx x)
2065 {
2066 rtx search = x;
2067 if (GET_CODE (search) == CONST)
2068 search = XEXP (search, 0);
2069 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2070 x = XVECEXP (search, 0, 0);
2071 return x;
2072 }
2073
2074 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2075 expression. */
2076
2077 static rtx
2078 strip_offset_and_salt (rtx addr, poly_int64 *offset)
2079 {
2080 return strip_salt (strip_offset (addr, offset));
2081 }
2082
2083 /* Generate code to enable conditional branches in functions larger than 1 MiB. */
2084 const char *
2085 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2086 const char * branch_format)
2087 {
2088 rtx_code_label * tmp_label = gen_label_rtx ();
2089 char label_buf[256];
2090 char buffer[128];
2091 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
2092 CODE_LABEL_NUMBER (tmp_label));
2093 const char *label_ptr = targetm.strip_name_encoding (label_buf);
2094 rtx dest_label = operands[pos_label];
2095 operands[pos_label] = tmp_label;
2096
2097 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
2098 output_asm_insn (buffer, operands);
2099
2100 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
2101 operands[pos_label] = dest_label;
2102 output_asm_insn (buffer, operands);
2103 return "";
2104 }
2105
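/* Roughly speaking, the function above emits a sequence of the form
   (illustrative assembly only):

	<BRANCH_FORMAT>	.Ltmp			// caller-supplied inverted branch
	b	<original destination>		// unconditional, +/-128 MiB range
     .Ltmp:

   i.e. the caller-supplied short conditional branch skips over an
   unconditional branch to the far destination, where .Ltmp stands for the
   internal label built from DEST.  */
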
2106 void
2107 aarch64_err_no_fpadvsimd (machine_mode mode)
2108 {
2109 if (TARGET_GENERAL_REGS_ONLY)
2110 if (FLOAT_MODE_P (mode))
2111 error ("%qs is incompatible with the use of floating-point types",
2112 "-mgeneral-regs-only");
2113 else
2114 error ("%qs is incompatible with the use of vector types",
2115 "-mgeneral-regs-only");
2116 else
2117 if (FLOAT_MODE_P (mode))
2118 error ("%qs feature modifier is incompatible with the use of"
2119 " floating-point types", "+nofp");
2120 else
2121 error ("%qs feature modifier is incompatible with the use of"
2122 " vector types", "+nofp");
2123 }
2124
2125 /* Report when we try to do something that requires SVE when SVE is disabled.
2126 This is an error of last resort and isn't very high-quality. It usually
2127 involves attempts to measure the vector length in some way. */
2128 static void
2129 aarch64_report_sve_required (void)
2130 {
2131 static bool reported_p = false;
2132
2133 /* Avoid reporting a slew of messages for a single oversight. */
2134 if (reported_p)
2135 return;
2136
2137 error ("this operation requires the SVE ISA extension");
2138 inform (input_location, "you can enable SVE using the command-line"
2139 " option %<-march%>, or by using the %<target%>"
2140 " attribute or pragma");
2141 reported_p = true;
2142 }
2143
2144 /* Return true if REGNO is P0-P15 or one of the special FFR-related
2145 registers. */
2146 inline bool
2147 pr_or_ffr_regnum_p (unsigned int regno)
2148 {
2149 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2150 }
2151
2152 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2153 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2154 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2155 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2156 and GENERAL_REGS is lower than the memory cost (in this case the best class
2157 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
2158 cost results in bad allocations with many redundant int<->FP moves, which
2159 are expensive on various cores.
2160 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2161 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
2162 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
2163 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
2164 The result of this is that it is no longer inefficient to have a higher
2165 memory move cost than the register move cost.
2166 */
2167
2168 static reg_class_t
2169 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2170 reg_class_t best_class)
2171 {
2172 machine_mode mode;
2173
2174 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2175 || !reg_class_subset_p (FP_REGS, allocno_class))
2176 return allocno_class;
2177
2178 if (!reg_class_subset_p (GENERAL_REGS, best_class)
2179 || !reg_class_subset_p (FP_REGS, best_class))
2180 return best_class;
2181
2182 mode = PSEUDO_REGNO_MODE (regno);
2183 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2184 }
2185
2186 static unsigned int
2187 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2188 {
2189 if (GET_MODE_UNIT_SIZE (mode) == 4)
2190 return aarch64_tune_params.min_div_recip_mul_sf;
2191 return aarch64_tune_params.min_div_recip_mul_df;
2192 }
2193
2194 /* Return the reassociation width of treeop OPC with mode MODE. */
2195 static int
2196 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2197 {
2198 if (VECTOR_MODE_P (mode))
2199 return aarch64_tune_params.vec_reassoc_width;
2200 if (INTEGRAL_MODE_P (mode))
2201 return aarch64_tune_params.int_reassoc_width;
2202 /* Avoid reassociating floating point addition so we emit more FMAs. */
2203 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2204 return aarch64_tune_params.fp_reassoc_width;
2205 return 1;
2206 }
2207
2208 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
2209 unsigned
2210 aarch64_dbx_register_number (unsigned regno)
2211 {
2212 if (GP_REGNUM_P (regno))
2213 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2214 else if (regno == SP_REGNUM)
2215 return AARCH64_DWARF_SP;
2216 else if (FP_REGNUM_P (regno))
2217 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2218 else if (PR_REGNUM_P (regno))
2219 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2220 else if (regno == VG_REGNUM)
2221 return AARCH64_DWARF_VG;
2222
2223 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2224 equivalent DWARF register. */
2225 return DWARF_FRAME_REGISTERS;
2226 }
2227
2228 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2229 integer, otherwise return X unmodified. */
2230 static rtx
2231 aarch64_bit_representation (rtx x)
2232 {
2233 if (CONST_DOUBLE_P (x))
2234 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2235 return x;
2236 }
2237
2238 /* Return true if MODE is any of the Advanced SIMD structure modes. */
2239 static bool
2240 aarch64_advsimd_struct_mode_p (machine_mode mode)
2241 {
2242 return (TARGET_SIMD
2243 && (mode == OImode || mode == CImode || mode == XImode));
2244 }
2245
2246 /* Return true if MODE is an SVE predicate mode. */
2247 static bool
2248 aarch64_sve_pred_mode_p (machine_mode mode)
2249 {
2250 return (TARGET_SVE
2251 && (mode == VNx16BImode
2252 || mode == VNx8BImode
2253 || mode == VNx4BImode
2254 || mode == VNx2BImode));
2255 }
2256
2257 /* Three mutually-exclusive flags describing a vector or predicate type. */
2258 const unsigned int VEC_ADVSIMD = 1;
2259 const unsigned int VEC_SVE_DATA = 2;
2260 const unsigned int VEC_SVE_PRED = 4;
2261 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2262 a structure of 2, 3 or 4 vectors. */
2263 const unsigned int VEC_STRUCT = 8;
2264 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2265 vector has fewer significant bytes than a full SVE vector. */
2266 const unsigned int VEC_PARTIAL = 16;
2267 /* Useful combinations of the above. */
2268 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2269 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2270
2271 /* Return a set of flags describing the vector properties of mode MODE.
2272 Ignore modes that are not supported by the current target. */
2273 static unsigned int
2274 aarch64_classify_vector_mode (machine_mode mode)
2275 {
2276 if (aarch64_advsimd_struct_mode_p (mode))
2277 return VEC_ADVSIMD | VEC_STRUCT;
2278
2279 if (aarch64_sve_pred_mode_p (mode))
2280 return VEC_SVE_PRED;
2281
2282 /* Make the decision based on the mode's enum value rather than its
2283 properties, so that we keep the correct classification regardless
2284 of -msve-vector-bits. */
2285 switch (mode)
2286 {
2287 /* Partial SVE QI vectors. */
2288 case E_VNx2QImode:
2289 case E_VNx4QImode:
2290 case E_VNx8QImode:
2291 /* Partial SVE HI vectors. */
2292 case E_VNx2HImode:
2293 case E_VNx4HImode:
2294 /* Partial SVE SI vector. */
2295 case E_VNx2SImode:
2296 /* Partial SVE HF vectors. */
2297 case E_VNx2HFmode:
2298 case E_VNx4HFmode:
2299 /* Partial SVE BF vectors. */
2300 case E_VNx2BFmode:
2301 case E_VNx4BFmode:
2302 /* Partial SVE SF vector. */
2303 case E_VNx2SFmode:
2304 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2305
2306 case E_VNx16QImode:
2307 case E_VNx8HImode:
2308 case E_VNx4SImode:
2309 case E_VNx2DImode:
2310 case E_VNx8BFmode:
2311 case E_VNx8HFmode:
2312 case E_VNx4SFmode:
2313 case E_VNx2DFmode:
2314 return TARGET_SVE ? VEC_SVE_DATA : 0;
2315
2316 /* x2 SVE vectors. */
2317 case E_VNx32QImode:
2318 case E_VNx16HImode:
2319 case E_VNx8SImode:
2320 case E_VNx4DImode:
2321 case E_VNx16BFmode:
2322 case E_VNx16HFmode:
2323 case E_VNx8SFmode:
2324 case E_VNx4DFmode:
2325 /* x3 SVE vectors. */
2326 case E_VNx48QImode:
2327 case E_VNx24HImode:
2328 case E_VNx12SImode:
2329 case E_VNx6DImode:
2330 case E_VNx24BFmode:
2331 case E_VNx24HFmode:
2332 case E_VNx12SFmode:
2333 case E_VNx6DFmode:
2334 /* x4 SVE vectors. */
2335 case E_VNx64QImode:
2336 case E_VNx32HImode:
2337 case E_VNx16SImode:
2338 case E_VNx8DImode:
2339 case E_VNx32BFmode:
2340 case E_VNx32HFmode:
2341 case E_VNx16SFmode:
2342 case E_VNx8DFmode:
2343 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2344
2345 /* 64-bit Advanced SIMD vectors. */
2346 case E_V8QImode:
2347 case E_V4HImode:
2348 case E_V2SImode:
2349 /* ...E_V1DImode doesn't exist. */
2350 case E_V4HFmode:
2351 case E_V4BFmode:
2352 case E_V2SFmode:
2353 case E_V1DFmode:
2354 /* 128-bit Advanced SIMD vectors. */
2355 case E_V16QImode:
2356 case E_V8HImode:
2357 case E_V4SImode:
2358 case E_V2DImode:
2359 case E_V8HFmode:
2360 case E_V8BFmode:
2361 case E_V4SFmode:
2362 case E_V2DFmode:
2363 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2364
2365 default:
2366 return 0;
2367 }
2368 }
2369
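/* Some example classifications, assuming the relevant target features
   are enabled:

     V4SImode	 -> VEC_ADVSIMD
     OImode	 -> VEC_ADVSIMD | VEC_STRUCT
     VNx4SImode	 -> VEC_SVE_DATA
     VNx2SImode	 -> VEC_SVE_DATA | VEC_PARTIAL
     VNx8SImode	 -> VEC_SVE_DATA | VEC_STRUCT
     VNx4BImode	 -> VEC_SVE_PRED
     DImode	 -> 0 (not a vector mode)  */
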
2370 /* Return true if MODE is any of the data vector modes, including
2371 structure modes. */
2372 static bool
2373 aarch64_vector_data_mode_p (machine_mode mode)
2374 {
2375 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2376 }
2377
2378 /* Return true if MODE is any form of SVE mode, including predicates,
2379 vectors and structures. */
2380 bool
2381 aarch64_sve_mode_p (machine_mode mode)
2382 {
2383 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2384 }
2385
2386 /* Return true if MODE is an SVE data vector mode; either a single vector
2387 or a structure of vectors. */
2388 static bool
2389 aarch64_sve_data_mode_p (machine_mode mode)
2390 {
2391 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2392 }
2393
2394 /* Return the number of defined bytes in one constituent vector of
2395 SVE mode MODE, which has vector flags VEC_FLAGS. */
2396 static poly_int64
2397 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2398 {
2399 if (vec_flags & VEC_PARTIAL)
2400 /* A single partial vector. */
2401 return GET_MODE_SIZE (mode);
2402
2403 if (vec_flags & VEC_SVE_DATA)
2404 /* A single vector or a tuple. */
2405 return BYTES_PER_SVE_VECTOR;
2406
2407 /* A single predicate. */
2408 gcc_assert (vec_flags & VEC_SVE_PRED);
2409 return BYTES_PER_SVE_PRED;
2410 }
2411
2412 /* Implement target hook TARGET_ARRAY_MODE. */
2413 static opt_machine_mode
2414 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2415 {
2416 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2417 && IN_RANGE (nelems, 2, 4))
2418 return mode_for_vector (GET_MODE_INNER (mode),
2419 GET_MODE_NUNITS (mode) * nelems);
2420
2421 return opt_machine_mode ();
2422 }
2423
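/* For example, with SVE enabled, aarch64_array_mode (VNx4SImode, 3) returns
   VNx12SImode, the mode used for a tuple of three SVE vectors of 32-bit
   elements; for other modes or out-of-range NELEMS the hook declines and
   the default handling is used.  */
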
2424 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2425 static bool
2426 aarch64_array_mode_supported_p (machine_mode mode,
2427 unsigned HOST_WIDE_INT nelems)
2428 {
2429 if (TARGET_SIMD
2430 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2431 || AARCH64_VALID_SIMD_DREG_MODE (mode))
2432 && (nelems >= 2 && nelems <= 4))
2433 return true;
2434
2435 return false;
2436 }
2437
2438 /* MODE is some form of SVE vector mode. For data modes, return the number
2439 of vector register bits that each element of MODE occupies, such as 64
2440 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2441 in a 64-bit container). For predicate modes, return the number of
2442 data bits controlled by each significant predicate bit. */
2443
2444 static unsigned int
2445 aarch64_sve_container_bits (machine_mode mode)
2446 {
2447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2449 ? BITS_PER_SVE_VECTOR
2450 : GET_MODE_BITSIZE (mode));
2451 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2452 }
2453
2454 /* Return the SVE predicate mode to use for elements that have
2455 ELEM_NBYTES bytes, if such a mode exists. */
2456
2457 opt_machine_mode
2458 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2459 {
2460 if (TARGET_SVE)
2461 {
2462 if (elem_nbytes == 1)
2463 return VNx16BImode;
2464 if (elem_nbytes == 2)
2465 return VNx8BImode;
2466 if (elem_nbytes == 4)
2467 return VNx4BImode;
2468 if (elem_nbytes == 8)
2469 return VNx2BImode;
2470 }
2471 return opt_machine_mode ();
2472 }
2473
2474 /* Return the SVE predicate mode that should be used to control
2475 SVE mode MODE. */
2476
2477 machine_mode
2478 aarch64_sve_pred_mode (machine_mode mode)
2479 {
2480 unsigned int bits = aarch64_sve_container_bits (mode);
2481 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2482 }
2483
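/* For example, VNx4SImode (32-bit elements in 32-bit containers) is
   controlled by VNx4BImode, whereas the partial vector mode VNx2SImode
   (32-bit elements stored in 64-bit containers) is controlled by
   VNx2BImode, matching its 64-bit container size.  */
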
2484 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2485
2486 static opt_machine_mode
2487 aarch64_get_mask_mode (machine_mode mode)
2488 {
2489 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2490 if (vec_flags & VEC_SVE_DATA)
2491 return aarch64_sve_pred_mode (mode);
2492
2493 return default_get_mask_mode (mode);
2494 }
2495
2496 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2497
2498 opt_machine_mode
2499 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2500 {
2501 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2502 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2503 machine_mode mode;
2504 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2505 if (inner_mode == GET_MODE_INNER (mode)
2506 && known_eq (nunits, GET_MODE_NUNITS (mode))
2507 && aarch64_sve_data_mode_p (mode))
2508 return mode;
2509 return opt_machine_mode ();
2510 }
2511
2512 /* Return the integer element mode associated with SVE mode MODE. */
2513
2514 static scalar_int_mode
2515 aarch64_sve_element_int_mode (machine_mode mode)
2516 {
2517 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2518 ? BITS_PER_SVE_VECTOR
2519 : GET_MODE_BITSIZE (mode));
2520 unsigned int elt_bits = vector_element_size (vector_bits,
2521 GET_MODE_NUNITS (mode));
2522 return int_mode_for_size (elt_bits, 0).require ();
2523 }
2524
2525 /* Return an integer element mode that contains exactly
2526 aarch64_sve_container_bits (MODE) bits. This is wider than
2527 aarch64_sve_element_int_mode if MODE is a partial vector,
2528 otherwise it's the same. */
2529
2530 static scalar_int_mode
2531 aarch64_sve_container_int_mode (machine_mode mode)
2532 {
2533 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2534 }
2535
2536 /* Return the integer vector mode associated with SVE mode MODE.
2537 Unlike related_int_vector_mode, this can handle the case in which
2538 MODE is a predicate (and thus has a different total size). */
2539
2540 machine_mode
2541 aarch64_sve_int_mode (machine_mode mode)
2542 {
2543 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2544 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2545 }
2546
2547 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2548
2549 static opt_machine_mode
2550 aarch64_vectorize_related_mode (machine_mode vector_mode,
2551 scalar_mode element_mode,
2552 poly_uint64 nunits)
2553 {
2554 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2555
2556 /* If we're operating on SVE vectors, try to return an SVE mode. */
2557 poly_uint64 sve_nunits;
2558 if ((vec_flags & VEC_SVE_DATA)
2559 && multiple_p (BYTES_PER_SVE_VECTOR,
2560 GET_MODE_SIZE (element_mode), &sve_nunits))
2561 {
2562 machine_mode sve_mode;
2563 if (maybe_ne (nunits, 0U))
2564 {
2565 /* Try to find a full or partial SVE mode with exactly
2566 NUNITS units. */
2567 if (multiple_p (sve_nunits, nunits)
2568 && aarch64_sve_data_mode (element_mode,
2569 nunits).exists (&sve_mode))
2570 return sve_mode;
2571 }
2572 else
2573 {
2574 /* Take the preferred number of units from the number of bytes
2575 that fit in VECTOR_MODE. We always start by "autodetecting"
2576 a full vector mode with preferred_simd_mode, so vectors
2577 chosen here will also be full vector modes. Then
2578 autovectorize_vector_modes tries smaller starting modes
2579 and thus smaller preferred numbers of units. */
2580 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2581 if (aarch64_sve_data_mode (element_mode,
2582 sve_nunits).exists (&sve_mode))
2583 return sve_mode;
2584 }
2585 }
2586
2587 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2588 if ((vec_flags & VEC_ADVSIMD)
2589 && known_eq (nunits, 0U)
2590 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2591 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2592 * GET_MODE_NUNITS (vector_mode), 128U))
2593 {
2594 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2595 if (VECTOR_MODE_P (res))
2596 return res;
2597 }
2598
2599 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2600 }
2601
2602 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2603 prefer to use the first arithmetic operand as the else value if
2604 the else value doesn't matter, since that exactly matches the SVE
2605 destructive merging form. For ternary operations we could either
2606 pick the first operand and use FMAD-like instructions or the last
2607 operand and use FMLA-like instructions; the latter seems more
2608 natural. */
2609
2610 static tree
2611 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2612 {
2613 return nops == 3 ? ops[2] : ops[0];
2614 }
2615
2616 /* Implement TARGET_HARD_REGNO_NREGS. */
2617
2618 static unsigned int
2619 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2620 {
2621 /* ??? Logically we should only need to provide a value when
2622 HARD_REGNO_MODE_OK says that the combination is valid,
2623 but at the moment we need to handle all modes. Just ignore
2624 any runtime parts for registers that can't store them. */
2625 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2626 switch (aarch64_regno_regclass (regno))
2627 {
2628 case FP_REGS:
2629 case FP_LO_REGS:
2630 case FP_LO8_REGS:
2631 {
2632 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2633 if (vec_flags & VEC_SVE_DATA)
2634 return exact_div (GET_MODE_SIZE (mode),
2635 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2636 return CEIL (lowest_size, UNITS_PER_VREG);
2637 }
2638 case PR_REGS:
2639 case PR_LO_REGS:
2640 case PR_HI_REGS:
2641 case FFR_REGS:
2642 case PR_AND_FFR_REGS:
2643 return 1;
2644 default:
2645 return CEIL (lowest_size, UNITS_PER_WORD);
2646 }
2647 gcc_unreachable ();
2648 }
2649
2650 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2651
2652 static bool
2653 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2654 {
2655 if (GET_MODE_CLASS (mode) == MODE_CC)
2656 return regno == CC_REGNUM;
2657
2658 if (regno == VG_REGNUM)
2659 /* This must have the same size as _Unwind_Word. */
2660 return mode == DImode;
2661
2662 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2663 if (vec_flags & VEC_SVE_PRED)
2664 return pr_or_ffr_regnum_p (regno);
2665
2666 if (pr_or_ffr_regnum_p (regno))
2667 return false;
2668
2669 if (regno == SP_REGNUM)
2670 /* The purpose of comparing with ptr_mode is to support the
2671 global register variable associated with the stack pointer
2672 register via the syntax of asm ("wsp") in ILP32. */
2673 return mode == Pmode || mode == ptr_mode;
2674
2675 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2676 return mode == Pmode;
2677
2678 if (GP_REGNUM_P (regno))
2679 {
2680 if (vec_flags & VEC_ANY_SVE)
2681 return false;
2682 if (known_le (GET_MODE_SIZE (mode), 8))
2683 return true;
2684 if (known_le (GET_MODE_SIZE (mode), 16))
2685 return (regno & 1) == 0;
2686 }
2687 else if (FP_REGNUM_P (regno))
2688 {
2689 if (vec_flags & VEC_STRUCT)
2690 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2691 else
2692 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2693 }
2694
2695 return false;
2696 }
2697
2698 /* Return true if a function with type FNTYPE returns its value in
2699 SVE vector or predicate registers. */
2700
2701 static bool
2702 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2703 {
2704 tree return_type = TREE_TYPE (fntype);
2705
2706 pure_scalable_type_info pst_info;
2707 switch (pst_info.analyze (return_type))
2708 {
2709 case pure_scalable_type_info::IS_PST:
2710 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2711 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2712
2713 case pure_scalable_type_info::DOESNT_MATTER:
2714 gcc_assert (aarch64_return_in_memory_1 (return_type));
2715 return false;
2716
2717 case pure_scalable_type_info::NO_ABI_IDENTITY:
2718 case pure_scalable_type_info::ISNT_PST:
2719 return false;
2720 }
2721 gcc_unreachable ();
2722 }
2723
2724 /* Return true if a function with type FNTYPE takes arguments in
2725 SVE vector or predicate registers. */
2726
2727 static bool
2728 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2729 {
2730 CUMULATIVE_ARGS args_so_far_v;
2731 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2732 NULL_TREE, 0, true);
2733 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2734
2735 for (tree chain = TYPE_ARG_TYPES (fntype);
2736 chain && chain != void_list_node;
2737 chain = TREE_CHAIN (chain))
2738 {
2739 tree arg_type = TREE_VALUE (chain);
2740 if (arg_type == error_mark_node)
2741 return false;
2742
2743 function_arg_info arg (arg_type, /*named=*/true);
2744 apply_pass_by_reference_rules (&args_so_far_v, arg);
2745 pure_scalable_type_info pst_info;
2746 if (pst_info.analyze_registers (arg.type))
2747 {
2748 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2749 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2750 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2751 return true;
2752 }
2753
2754 targetm.calls.function_arg_advance (args_so_far, arg);
2755 }
2756 return false;
2757 }
2758
2759 /* Implement TARGET_FNTYPE_ABI. */
2760
2761 static const predefined_function_abi &
2762 aarch64_fntype_abi (const_tree fntype)
2763 {
2764 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2765 return aarch64_simd_abi ();
2766
2767 if (aarch64_returns_value_in_sve_regs_p (fntype)
2768 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2769 return aarch64_sve_abi ();
2770
2771 return default_function_abi;
2772 }
2773
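/* For example (illustrative only): a function type that carries the
   "aarch64_vector_pcs" attribute gets the SIMD ABI; one that takes an
   svint32_t argument or returns an svbool_t gets the SVE PCS; anything
   else gets the base AAPCS64 ABI.  */
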
2774 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2775
2776 static bool
2777 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2778 {
2779 return (aarch64_sve::builtin_type_p (type1)
2780 == aarch64_sve::builtin_type_p (type2));
2781 }
2782
2783 /* Return true if we should emit CFI for register REGNO. */
2784
2785 static bool
2786 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2787 {
2788 return (GP_REGNUM_P (regno)
2789 || !default_function_abi.clobbers_full_reg_p (regno));
2790 }
2791
2792 /* Return the mode we should use to save and restore register REGNO. */
2793
2794 static machine_mode
2795 aarch64_reg_save_mode (unsigned int regno)
2796 {
2797 if (GP_REGNUM_P (regno))
2798 return DImode;
2799
2800 if (FP_REGNUM_P (regno))
2801 switch (crtl->abi->id ())
2802 {
2803 case ARM_PCS_AAPCS64:
2804 /* Only the low 64 bits are saved by the base PCS. */
2805 return DFmode;
2806
2807 case ARM_PCS_SIMD:
2808 /* The vector PCS saves the low 128 bits (which is the full
2809 register on non-SVE targets). */
2810 return TFmode;
2811
2812 case ARM_PCS_SVE:
2813 /* Use vectors of DImode for registers that need frame
2814 information, so that the first 64 bits of the save slot
2815 are always the equivalent of what storing D<n> would give. */
2816 if (aarch64_emit_cfi_for_reg_p (regno))
2817 return VNx2DImode;
2818
2819 /* Use vectors of bytes otherwise, so that the layout is
2820 endian-agnostic, and so that we can use LDR and STR for
2821 big-endian targets. */
2822 return VNx16QImode;
2823
2824 case ARM_PCS_TLSDESC:
2825 case ARM_PCS_UNKNOWN:
2826 break;
2827 }
2828
2829 if (PR_REGNUM_P (regno))
2830 /* Save the full predicate register. */
2831 return VNx16BImode;
2832
2833 gcc_unreachable ();
2834 }
2835
2836 /* Implement TARGET_INSN_CALLEE_ABI. */
2837
2838 const predefined_function_abi &
2839 aarch64_insn_callee_abi (const rtx_insn *insn)
2840 {
2841 rtx pat = PATTERN (insn);
2842 gcc_assert (GET_CODE (pat) == PARALLEL);
2843 rtx unspec = XVECEXP (pat, 0, 1);
2844 gcc_assert (GET_CODE (unspec) == UNSPEC
2845 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2846 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2847 }
2848
2849 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2850 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2851 clobbers the top 64 bits when restoring the bottom 64 bits. */
2852
2853 static bool
2854 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2855 unsigned int regno,
2856 machine_mode mode)
2857 {
2858 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2859 {
2860 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2861 unsigned int nregs = hard_regno_nregs (regno, mode);
2862 if (nregs > 1)
2863 per_register_size = exact_div (per_register_size, nregs);
2864 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2865 return maybe_gt (per_register_size, 16);
2866 return maybe_gt (per_register_size, 8);
2867 }
2868 return false;
2869 }
2870
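/* For example, a call to a base-PCS (AAPCS64) function part-clobbers V8
   when V8 holds a TFmode value, since only the low 64 bits of V8-V15 are
   preserved there, whereas a call to an "aarch64_vector_pcs" function does
   not, since that PCS preserves the low 128 bits.  */
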
2871 /* Implement REGMODE_NATURAL_SIZE. */
2872 poly_uint64
2873 aarch64_regmode_natural_size (machine_mode mode)
2874 {
2875 /* The natural size for SVE data modes is one SVE data vector,
2876 and similarly for predicates. We can't independently modify
2877 anything smaller than that. */
2878 /* ??? For now, only do this for variable-width SVE registers.
2879 Doing it for constant-sized registers breaks lower-subreg.c. */
2880 /* ??? And once that's fixed, we should probably have similar
2881 code for Advanced SIMD. */
2882 if (!aarch64_sve_vg.is_constant ())
2883 {
2884 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2885 if (vec_flags & VEC_SVE_PRED)
2886 return BYTES_PER_SVE_PRED;
2887 if (vec_flags & VEC_SVE_DATA)
2888 return BYTES_PER_SVE_VECTOR;
2889 }
2890 return UNITS_PER_WORD;
2891 }
2892
2893 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2894 machine_mode
2895 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2896 machine_mode mode)
2897 {
2898 /* The predicate mode determines which bits are significant and
2899 which are "don't care". Decreasing the number of lanes would
2900 lose data while increasing the number of lanes would make bits
2901 unnecessarily significant. */
2902 if (PR_REGNUM_P (regno))
2903 return mode;
2904 if (known_ge (GET_MODE_SIZE (mode), 4))
2905 return mode;
2906 else
2907 return SImode;
2908 }
2909
2910 /* Return true if I's bits are consecutive ones from the MSB. */
2911 bool
2912 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2913 {
2914 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2915 }
2916
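/* For example:

     0xffffffffffff0000  -> true   (top 48 bits are consecutive ones)
     0xffffffffffffffff  -> true   (all bits set)
     0x0000ffffffff0000  -> false  (the ones do not start at the MSB)
     0x0000000000000000  -> false

   relying on -I being a power of two exactly when I has this form.  */
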
2917 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2918 that strcpy from constants will be faster. */
2919
2920 static HOST_WIDE_INT
2921 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2922 {
2923 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2924 return MAX (align, BITS_PER_WORD);
2925 return align;
2926 }
2927
2928 /* Return true if calls to DECL should be treated as
2929 long-calls (i.e. called via a register). */
2930 static bool
2931 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2932 {
2933 return false;
2934 }
2935
2936 /* Return true if calls to symbol-ref SYM should be treated as
2937 long-calls (i.e. called via a register). */
2938 bool
2939 aarch64_is_long_call_p (rtx sym)
2940 {
2941 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2942 }
2943
2944 /* Return true if calls to symbol-ref SYM should not go through
2945 plt stubs. */
2946
2947 bool
2948 aarch64_is_noplt_call_p (rtx sym)
2949 {
2950 const_tree decl = SYMBOL_REF_DECL (sym);
2951
2952 if (flag_pic
2953 && decl
2954 && (!flag_plt
2955 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2956 && !targetm.binds_local_p (decl))
2957 return true;
2958
2959 return false;
2960 }
2961
2962 /* Emit an insn that's a simple single-set. Both the operands must be
2963 known to be valid. */
2964 inline static rtx_insn *
2965 emit_set_insn (rtx x, rtx y)
2966 {
2967 return emit_insn (gen_rtx_SET (x, y));
2968 }
2969
2970 /* X and Y are two things to compare using CODE. Emit the compare insn and
2971 return the rtx for register 0 in the proper mode. */
2972 rtx
2973 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2974 {
2975 machine_mode cmp_mode = GET_MODE (x);
2976 machine_mode cc_mode;
2977 rtx cc_reg;
2978
2979 if (cmp_mode == TImode)
2980 {
2981 gcc_assert (code == NE);
2982
2983 cc_mode = CCmode;
2984 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2985
2986 rtx x_lo = operand_subword (x, 0, 0, TImode);
2987 rtx y_lo = operand_subword (y, 0, 0, TImode);
2988 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2989
2990 rtx x_hi = operand_subword (x, 1, 0, TImode);
2991 rtx y_hi = operand_subword (y, 1, 0, TImode);
2992 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2993 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2994 GEN_INT (AARCH64_EQ)));
2995 }
2996 else
2997 {
2998 cc_mode = SELECT_CC_MODE (code, x, y);
2999 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3000 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3001 }
3002 return cc_reg;
3003 }
3004
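/* For the TImode case above, the emitted rtl corresponds roughly to an
   assembly sequence of the form (register names and NZCV immediate are
   illustrative only):

	cmp	x0, x2			// compare the low halves
	ccmp	x1, x3, #<nzcv>, eq	// if equal, also compare the high halves

   so that the flags read as "equal" only when all 128 bits of X and Y
   match.  */
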
3005 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
3006
3007 static rtx
3008 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3009 machine_mode y_mode)
3010 {
3011 if (y_mode == E_QImode || y_mode == E_HImode)
3012 {
3013 if (CONST_INT_P (y))
3014 {
3015 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3016 y_mode = SImode;
3017 }
3018 else
3019 {
3020 rtx t, cc_reg;
3021 machine_mode cc_mode;
3022
3023 t = gen_rtx_ZERO_EXTEND (SImode, y);
3024 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3025 cc_mode = CC_SWPmode;
3026 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3027 emit_set_insn (cc_reg, t);
3028 return cc_reg;
3029 }
3030 }
3031
3032 if (!aarch64_plus_operand (y, y_mode))
3033 y = force_reg (y_mode, y);
3034
3035 return aarch64_gen_compare_reg (code, x, y);
3036 }
3037
3038 /* Build the SYMBOL_REF for __tls_get_addr. */
3039
3040 static GTY(()) rtx tls_get_addr_libfunc;
3041
3042 rtx
3043 aarch64_tls_get_addr (void)
3044 {
3045 if (!tls_get_addr_libfunc)
3046 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3047 return tls_get_addr_libfunc;
3048 }
3049
3050 /* Return the TLS model to use for ADDR. */
3051
3052 static enum tls_model
3053 tls_symbolic_operand_type (rtx addr)
3054 {
3055 enum tls_model tls_kind = TLS_MODEL_NONE;
3056 poly_int64 offset;
3057 addr = strip_offset_and_salt (addr, &offset);
3058 if (SYMBOL_REF_P (addr))
3059 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3060
3061 return tls_kind;
3062 }
3063
3064 /* We allow LO_SUMs in our legitimate addresses so that combine
3065 can take care of combining addresses where necessary, but for
3066 generation purposes we generate the address
3067 as:
3068 RTL Absolute
3069 tmp = hi (symbol_ref); adrp x1, foo
3070 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
3071 nop
3072
3073 PIC TLS
3074 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3075 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3076 bl __tls_get_addr
3077 nop
3078
3079 Load TLS symbol, depending on TLS mechanism and TLS access model.
3080
3081 Global Dynamic - Traditional TLS:
3082 adrp tmp, :tlsgd:imm
3083 add dest, tmp, #:tlsgd_lo12:imm
3084 bl __tls_get_addr
3085
3086 Global Dynamic - TLS Descriptors:
3087 adrp dest, :tlsdesc:imm
3088 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3089 add dest, dest, #:tlsdesc_lo12:imm
3090 blr tmp
3091 mrs tp, tpidr_el0
3092 add dest, dest, tp
3093
3094 Initial Exec:
3095 mrs tp, tpidr_el0
3096 adrp tmp, :gottprel:imm
3097 ldr dest, [tmp, #:gottprel_lo12:imm]
3098 add dest, dest, tp
3099
3100 Local Exec:
3101 mrs tp, tpidr_el0
3102 add t0, tp, #:tprel_hi12:imm, lsl #12
3103 add t0, t0, #:tprel_lo12_nc:imm
3104 */
3105
3106 static void
3107 aarch64_load_symref_appropriately (rtx dest, rtx imm,
3108 enum aarch64_symbol_type type)
3109 {
3110 switch (type)
3111 {
3112 case SYMBOL_SMALL_ABSOLUTE:
3113 {
3114 /* In ILP32, the mode of dest can be either SImode or DImode. */
3115 rtx tmp_reg = dest;
3116 machine_mode mode = GET_MODE (dest);
3117
3118 gcc_assert (mode == Pmode || mode == ptr_mode);
3119
3120 if (can_create_pseudo_p ())
3121 tmp_reg = gen_reg_rtx (mode);
3122
3123 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3124 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3125 return;
3126 }
3127
3128 case SYMBOL_TINY_ABSOLUTE:
3129 emit_insn (gen_rtx_SET (dest, imm));
3130 return;
3131
3132 case SYMBOL_SMALL_GOT_28K:
3133 {
3134 machine_mode mode = GET_MODE (dest);
3135 rtx gp_rtx = pic_offset_table_rtx;
3136 rtx insn;
3137 rtx mem;
3138
3139 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3140 here before rtl expansion. Tree IVOPTs will generate rtl patterns
3141 to decide rtx costs, in which case pic_offset_table_rtx is not
3142 initialized. In that case there is no need to generate the first
3143 adrp instruction, as the final cost of a global variable access
3144 is one instruction.
3145 if (gp_rtx != NULL)
3146 {
3147 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
3148 use the page base as the GOT base, the first page may be wasted;
3149 in the worst case only 28K of GOT space is left).
3150
3151 The generated instruction sequence for accessing a global variable
3152 is:
3153
3154 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3155
3156 Only one instruction is needed. But we must initialize
3157 pic_offset_table_rtx properly. We generate an initialization insn
3158 for every global access and rely on CSE to remove the redundant ones.
3159
3160 The final instruction sequence will look like the following
3161 for multiple global variable accesses.
3162
3163 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3164
3165 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3166 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3167 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3168 ... */
3169
3170 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3171 crtl->uses_pic_offset_table = 1;
3172 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3173
3174 if (mode != GET_MODE (gp_rtx))
3175 gp_rtx = gen_lowpart (mode, gp_rtx);
3176
3177 }
3178
3179 if (mode == ptr_mode)
3180 {
3181 if (mode == DImode)
3182 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3183 else
3184 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3185
3186 mem = XVECEXP (SET_SRC (insn), 0, 0);
3187 }
3188 else
3189 {
3190 gcc_assert (mode == Pmode);
3191
3192 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3193 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3194 }
3195
3196 /* The operand is expected to be a MEM. Whenever the related insn
3197 pattern changes, the code above that computes MEM should be
3198 updated. */
3199 gcc_assert (MEM_P (mem));
3200 MEM_READONLY_P (mem) = 1;
3201 MEM_NOTRAP_P (mem) = 1;
3202 emit_insn (insn);
3203 return;
3204 }
3205
3206 case SYMBOL_SMALL_GOT_4G:
3207 {
3208 /* In ILP32, the mode of dest can be either SImode or DImode,
3209 while the got entry is always of SImode size. The mode of
3210 dest depends on how dest is used: if dest is assigned to a
3211 pointer (e.g. in memory), it has SImode; it may have
3212 DImode if dest is dereferenced to access the memory.
3213 This is why we have to handle three different ldr_got_small
3214 patterns here (two patterns for ILP32). */
3215
3216 rtx insn;
3217 rtx mem;
3218 rtx tmp_reg = dest;
3219 machine_mode mode = GET_MODE (dest);
3220
3221 if (can_create_pseudo_p ())
3222 tmp_reg = gen_reg_rtx (mode);
3223
3224 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3225 if (mode == ptr_mode)
3226 {
3227 if (mode == DImode)
3228 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3229 else
3230 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3231
3232 mem = XVECEXP (SET_SRC (insn), 0, 0);
3233 }
3234 else
3235 {
3236 gcc_assert (mode == Pmode);
3237
3238 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3239 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3240 }
3241
3242 gcc_assert (MEM_P (mem));
3243 MEM_READONLY_P (mem) = 1;
3244 MEM_NOTRAP_P (mem) = 1;
3245 emit_insn (insn);
3246 return;
3247 }
3248
3249 case SYMBOL_SMALL_TLSGD:
3250 {
3251 rtx_insn *insns;
3252 /* The return type of __tls_get_addr is the C pointer type
3253 so use ptr_mode. */
3254 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3255 rtx tmp_reg = dest;
3256
3257 if (GET_MODE (dest) != ptr_mode)
3258 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3259
3260 start_sequence ();
3261 if (ptr_mode == SImode)
3262 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3263 else
3264 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3265 insns = get_insns ();
3266 end_sequence ();
3267
3268 RTL_CONST_CALL_P (insns) = 1;
3269 emit_libcall_block (insns, tmp_reg, result, imm);
3270 /* Convert back to the mode of the dest, adding a zero_extend
3271 from SImode (ptr_mode) to DImode (Pmode). */
3272 if (dest != tmp_reg)
3273 convert_move (dest, tmp_reg, true);
3274 return;
3275 }
3276
3277 case SYMBOL_SMALL_TLSDESC:
3278 {
3279 machine_mode mode = GET_MODE (dest);
3280 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3281 rtx tp;
3282
3283 gcc_assert (mode == Pmode || mode == ptr_mode);
3284
3285 /* In ILP32, the got entry is always of SImode size. Unlike
3286 small GOT, the dest is fixed at reg 0. */
3287 if (TARGET_ILP32)
3288 emit_insn (gen_tlsdesc_small_si (imm));
3289 else
3290 emit_insn (gen_tlsdesc_small_di (imm));
3291 tp = aarch64_load_tp (NULL);
3292
3293 if (mode != Pmode)
3294 tp = gen_lowpart (mode, tp);
3295
3296 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3297 if (REG_P (dest))
3298 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3299 return;
3300 }
3301
3302 case SYMBOL_SMALL_TLSIE:
3303 {
3304 /* In ILP32, the mode of dest can be either SImode or DImode,
3305 while the got entry is always of SImode size. The mode of
3306 dest depends on how dest is used: if dest is assigned to a
3307 pointer (e.g. in memory), it has SImode; it may have
3308 DImode if dest is dereferenced to access the memory.
3309 This is why we have to handle three different tlsie_small
3310 patterns here (two patterns for ILP32). */
3311 machine_mode mode = GET_MODE (dest);
3312 rtx tmp_reg = gen_reg_rtx (mode);
3313 rtx tp = aarch64_load_tp (NULL);
3314
3315 if (mode == ptr_mode)
3316 {
3317 if (mode == DImode)
3318 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3319 else
3320 {
3321 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3322 tp = gen_lowpart (mode, tp);
3323 }
3324 }
3325 else
3326 {
3327 gcc_assert (mode == Pmode);
3328 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3329 }
3330
3331 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3332 if (REG_P (dest))
3333 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3334 return;
3335 }
3336
3337 case SYMBOL_TLSLE12:
3338 case SYMBOL_TLSLE24:
3339 case SYMBOL_TLSLE32:
3340 case SYMBOL_TLSLE48:
3341 {
3342 machine_mode mode = GET_MODE (dest);
3343 rtx tp = aarch64_load_tp (NULL);
3344
3345 if (mode != Pmode)
3346 tp = gen_lowpart (mode, tp);
3347
3348 switch (type)
3349 {
3350 case SYMBOL_TLSLE12:
3351 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3352 (dest, tp, imm));
3353 break;
3354 case SYMBOL_TLSLE24:
3355 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3356 (dest, tp, imm));
3357 break;
3358 case SYMBOL_TLSLE32:
3359 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3360 (dest, imm));
3361 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3362 (dest, dest, tp));
3363 break;
3364 case SYMBOL_TLSLE48:
3365 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3366 (dest, imm));
3367 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3368 (dest, dest, tp));
3369 break;
3370 default:
3371 gcc_unreachable ();
3372 }
3373
3374 if (REG_P (dest))
3375 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3376 return;
3377 }
3378
3379 case SYMBOL_TINY_GOT:
3380 {
3381 rtx insn;
3382 machine_mode mode = GET_MODE (dest);
3383
3384 if (mode == ptr_mode)
3385 insn = gen_ldr_got_tiny (mode, dest, imm);
3386 else
3387 {
3388 gcc_assert (mode == Pmode);
3389 insn = gen_ldr_got_tiny_sidi (dest, imm);
3390 }
3391
3392 emit_insn (insn);
3393 return;
3394 }
3395
3396 case SYMBOL_TINY_TLSIE:
3397 {
3398 machine_mode mode = GET_MODE (dest);
3399 rtx tp = aarch64_load_tp (NULL);
3400
3401 if (mode == ptr_mode)
3402 {
3403 if (mode == DImode)
3404 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3405 else
3406 {
3407 tp = gen_lowpart (mode, tp);
3408 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3409 }
3410 }
3411 else
3412 {
3413 gcc_assert (mode == Pmode);
3414 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3415 }
3416
3417 if (REG_P (dest))
3418 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3419 return;
3420 }
3421
3422 default:
3423 gcc_unreachable ();
3424 }
3425 }
3426
3427 /* Emit a move from SRC to DEST. Assume that the move expanders can
3428 handle all moves if !can_create_pseudo_p (). The distinction is
3429 important because, unlike emit_move_insn, the move expanders know
3430 how to force Pmode objects into the constant pool even when the
3431 constant pool address is not itself legitimate. */
3432 static rtx
3433 aarch64_emit_move (rtx dest, rtx src)
3434 {
3435 return (can_create_pseudo_p ()
3436 ? emit_move_insn (dest, src)
3437 : emit_move_insn_1 (dest, src));
3438 }
3439
3440 /* Apply UNOPTAB to OP and store the result in DEST. */
3441
3442 static void
3443 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3444 {
3445 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3446 if (dest != tmp)
3447 emit_move_insn (dest, tmp);
3448 }
3449
3450 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3451
3452 static void
3453 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3454 {
3455 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3456 OPTAB_DIRECT);
3457 if (dest != tmp)
3458 emit_move_insn (dest, tmp);
3459 }
3460
3461 /* Split a 128-bit move operation into two 64-bit move operations,
3462 taking care to handle partial overlap of register to register
3463 copies. Special cases are needed when moving between GP regs and
3464 FP regs. SRC can be a register, constant or memory; DST a register
3465 or memory. If either operand is memory it must not have any side
3466 effects. */
3467 void
3468 aarch64_split_128bit_move (rtx dst, rtx src)
3469 {
3470 rtx dst_lo, dst_hi;
3471 rtx src_lo, src_hi;
3472
3473 machine_mode mode = GET_MODE (dst);
3474
3475 gcc_assert (mode == TImode || mode == TFmode);
3476 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3477 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3478
3479 if (REG_P (dst) && REG_P (src))
3480 {
3481 int src_regno = REGNO (src);
3482 int dst_regno = REGNO (dst);
3483
3484 /* Handle FP <-> GP regs. */
3485 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3486 {
3487 src_lo = gen_lowpart (word_mode, src);
3488 src_hi = gen_highpart (word_mode, src);
3489
3490 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3491 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3492 return;
3493 }
3494 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3495 {
3496 dst_lo = gen_lowpart (word_mode, dst);
3497 dst_hi = gen_highpart (word_mode, dst);
3498
3499 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3500 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3501 return;
3502 }
3503 }
3504
3505 dst_lo = gen_lowpart (word_mode, dst);
3506 dst_hi = gen_highpart (word_mode, dst);
3507 src_lo = gen_lowpart (word_mode, src);
3508 src_hi = gen_highpart_mode (word_mode, mode, src);
3509
3510 /* At most one pairing may overlap. */
3511 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3512 {
3513 aarch64_emit_move (dst_hi, src_hi);
3514 aarch64_emit_move (dst_lo, src_lo);
3515 }
3516 else
3517 {
3518 aarch64_emit_move (dst_lo, src_lo);
3519 aarch64_emit_move (dst_hi, src_hi);
3520 }
3521 }
3522
3523 /* Return true if we should split a move from 128-bit value SRC
3524 to 128-bit register DST. */
3525
3526 bool
3527 aarch64_split_128bit_move_p (rtx dst, rtx src)
3528 {
3529 if (FP_REGNUM_P (REGNO (dst)))
3530 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3531 /* All moves to GPRs need to be split. */
3532 return true;
3533 }
3534
3535 /* Split a complex SIMD combine. */
3536
3537 void
3538 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3539 {
3540 machine_mode src_mode = GET_MODE (src1);
3541 machine_mode dst_mode = GET_MODE (dst);
3542
3543 gcc_assert (VECTOR_MODE_P (dst_mode));
3544 gcc_assert (register_operand (dst, dst_mode)
3545 && register_operand (src1, src_mode)
3546 && register_operand (src2, src_mode));
3547
3548 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3549 return;
3550 }
3551
3552 /* Split a complex SIMD move. */
3553
3554 void
3555 aarch64_split_simd_move (rtx dst, rtx src)
3556 {
3557 machine_mode src_mode = GET_MODE (src);
3558 machine_mode dst_mode = GET_MODE (dst);
3559
3560 gcc_assert (VECTOR_MODE_P (dst_mode));
3561
3562 if (REG_P (dst) && REG_P (src))
3563 {
3564 gcc_assert (VECTOR_MODE_P (src_mode));
3565 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3566 }
3567 }
3568
3569 bool
3570 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3571 machine_mode ymode, rtx y)
3572 {
3573 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3574 gcc_assert (r != NULL);
3575 return rtx_equal_p (x, r);
3576 }
3577
3578 /* Return TARGET if it is nonnull and a register of mode MODE.
3579 Otherwise, return a fresh register of mode MODE if we can,
3580 or TARGET reinterpreted as MODE if we can't. */
3581
3582 static rtx
3583 aarch64_target_reg (rtx target, machine_mode mode)
3584 {
3585 if (target && REG_P (target) && GET_MODE (target) == mode)
3586 return target;
3587 if (!can_create_pseudo_p ())
3588 {
3589 gcc_assert (target);
3590 return gen_lowpart (mode, target);
3591 }
3592 return gen_reg_rtx (mode);
3593 }
3594
3595 /* Return a register that contains the constant in BUILDER, given that
3596 the constant is a legitimate move operand. Use TARGET as the register
3597 if it is nonnull and convenient. */
3598
3599 static rtx
3600 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3601 {
3602 rtx src = builder.build ();
3603 target = aarch64_target_reg (target, GET_MODE (src));
3604 emit_insn (gen_rtx_SET (target, src));
3605 return target;
3606 }
3607
3608 static rtx
3609 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3610 {
3611 if (can_create_pseudo_p ())
3612 return force_reg (mode, value);
3613 else
3614 {
3615 gcc_assert (x);
3616 aarch64_emit_move (x, value);
3617 return x;
3618 }
3619 }
3620
3621 /* Return true if predicate value X is a constant in which every element
3622 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3623 value, i.e. as a predicate in which all bits are significant. */
3624
3625 static bool
3626 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3627 {
3628 if (GET_CODE (x) != CONST_VECTOR)
3629 return false;
3630
3631 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3632 GET_MODE_NUNITS (GET_MODE (x)));
3633 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3634 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3635 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3636
3637 unsigned int nelts = const_vector_encoded_nelts (x);
3638 for (unsigned int i = 0; i < nelts; ++i)
3639 {
3640 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3641 if (!CONST_INT_P (elt))
3642 return false;
3643
3644 builder.quick_push (elt);
3645 for (unsigned int j = 1; j < factor; ++j)
3646 builder.quick_push (const0_rtx);
3647 }
3648 builder.finalize ();
3649 return true;
3650 }
3651
3652 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3653 widest predicate element size it can have (that is, the largest size
3654 for which each element would still be 0 or 1). */
3655
3656 unsigned int
3657 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3658 {
3659 /* Start with the most optimistic assumption: that we only need
3660 one bit per pattern. This is what we will use if only the first
3661 bit in each pattern is ever set. */
3662 unsigned int mask = GET_MODE_SIZE (DImode);
3663 mask |= builder.npatterns ();
3664
3665 /* Look for set bits. */
3666 unsigned int nelts = builder.encoded_nelts ();
3667 for (unsigned int i = 1; i < nelts; ++i)
3668 if (INTVAL (builder.elt (i)) != 0)
3669 {
3670 if (i & 1)
3671 return 1;
3672 mask |= i;
3673 }
3674 return mask & -mask;
3675 }
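/* For example, with npatterns () == 4 and set bits only at indices
   0, 4, 8, ..., MASK ends up as 0b1100, so the function returns 4:
   one significant bit per .S element.  A set bit at any odd index
   immediately forces a result of 1 (.B).  */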
3676
3677 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3678 return that predicate mode, otherwise return opt_machine_mode (). */
3679
3680 opt_machine_mode
3681 aarch64_ptrue_all_mode (rtx x)
3682 {
3683 gcc_assert (GET_MODE (x) == VNx16BImode);
3684 if (GET_CODE (x) != CONST_VECTOR
3685 || !CONST_VECTOR_DUPLICATE_P (x)
3686 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3687 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3688 return opt_machine_mode ();
3689
3690 unsigned int nelts = const_vector_encoded_nelts (x);
3691 for (unsigned int i = 1; i < nelts; ++i)
3692 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3693 return opt_machine_mode ();
3694
3695 return aarch64_sve_pred_mode (nelts);
3696 }
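/* For example, the duplicated VNx16BI constant { 1, 0, 0, 0, 1, 0, 0, 0, ... }
   is the canonical PTRUE for 32-bit elements and so yields VNx4BI;
   any other form yields no mode.  */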
3697
3698 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3699 that the constant would have with predicate element size ELT_SIZE
3700 (ignoring the upper bits in each element) and return:
3701
3702 * -1 if all bits are set
3703 * N if the predicate has N leading set bits followed by all clear bits
3704 * 0 if the predicate does not have any of these forms. */
3705
3706 int
3707 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3708 unsigned int elt_size)
3709 {
3710 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3711 followed by set bits. */
3712 if (builder.nelts_per_pattern () == 3)
3713 return 0;
3714
3715 /* Skip over leading set bits. */
3716 unsigned int nelts = builder.encoded_nelts ();
3717 unsigned int i = 0;
3718 for (; i < nelts; i += elt_size)
3719 if (INTVAL (builder.elt (i)) == 0)
3720 break;
3721 unsigned int vl = i / elt_size;
3722
3723 /* Check for the all-true case. */
3724 if (i == nelts)
3725 return -1;
3726
3727 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3728 repeating pattern of set bits followed by clear bits. */
3729 if (builder.nelts_per_pattern () != 2)
3730 return 0;
3731
3732 /* We have a "foreground" value and a duplicated "background" value.
3733 If the background might repeat and the last set bit belongs to it,
3734 we might have set bits followed by clear bits followed by set bits. */
3735 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3736 return 0;
3737
3738 /* Make sure that the rest are all clear. */
3739 for (; i < nelts; i += elt_size)
3740 if (INTVAL (builder.elt (i)) != 0)
3741 return 0;
3742
3743 return vl;
3744 }
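/* For example, with ELT_SIZE == 2, a constant whose VNx16BI image is
   { 1, 0, 1, 0, 1, 0, 0, 0, ... } (three .H elements set, then all clear)
   gives 3, an all-set constant gives -1, and a constant with a set bit
   after the first clear element gives 0.  */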
3745
3746 /* See if there is an svpattern that encodes an SVE predicate of mode
3747 PRED_MODE in which the first VL bits are set and the rest are clear.
3748 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3749 A VL of -1 indicates an all-true vector. */
3750
3751 aarch64_svpattern
3752 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3753 {
3754 if (vl < 0)
3755 return AARCH64_SV_ALL;
3756
3757 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3758 return AARCH64_NUM_SVPATTERNS;
3759
3760 if (vl >= 1 && vl <= 8)
3761 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3762
3763 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3764 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3765
3766 int max_vl;
3767 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3768 {
3769 if (vl == (max_vl / 3) * 3)
3770 return AARCH64_SV_MUL3;
3771 /* These would only trigger for non-power-of-2 lengths. */
3772 if (vl == (max_vl & -4))
3773 return AARCH64_SV_MUL4;
3774 if (vl == (1 << floor_log2 (max_vl)))
3775 return AARCH64_SV_POW2;
3776 if (vl == max_vl)
3777 return AARCH64_SV_ALL;
3778 }
3779 return AARCH64_NUM_SVPATTERNS;
3780 }
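/* For example, with a VNx16BI predicate, VL == 3 gives AARCH64_SV_VL3 and
   VL == 16 gives AARCH64_SV_VL16, while VL == -1 gives AARCH64_SV_ALL.
   VL == 9 has no dedicated pattern, so for variable-length vectors it
   gives AARCH64_NUM_SVPATTERNS and callers fall back to e.g. a WHILE.  */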
3781
3782 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3783 bits has the lowest bit set and the upper bits clear. This is the
3784 VNx16BImode equivalent of a PTRUE for controlling elements of
3785 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3786 all bits are significant, even the upper zeros. */
3787
3788 rtx
3789 aarch64_ptrue_all (unsigned int elt_size)
3790 {
3791 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3792 builder.quick_push (const1_rtx);
3793 for (unsigned int i = 1; i < elt_size; ++i)
3794 builder.quick_push (const0_rtx);
3795 return builder.build ();
3796 }
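/* For example, aarch64_ptrue_all (4) builds { 1, 0, 0, 0, 1, 0, 0, 0, ... },
   i.e. the VNx16BImode image of a PTRUE for .S elements, with the three
   padding bits of each 32-bit element explicitly zero.  */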
3797
3798 /* Return an all-true predicate register of mode MODE. */
3799
3800 rtx
3801 aarch64_ptrue_reg (machine_mode mode)
3802 {
3803 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3804 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3805 return gen_lowpart (mode, reg);
3806 }
3807
3808 /* Return an all-false predicate register of mode MODE. */
3809
3810 rtx
3811 aarch64_pfalse_reg (machine_mode mode)
3812 {
3813 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3814 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3815 return gen_lowpart (mode, reg);
3816 }
3817
3818 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3819 for it. PRED2[0] is the predicate for the instruction whose result
3820 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3821 for it. Return true if we can prove that the two predicates are
3822 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3823 with PRED1[0] without changing behavior. */
3824
3825 bool
3826 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3827 {
3828 machine_mode mode = GET_MODE (pred1[0]);
3829 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3830 && mode == GET_MODE (pred2[0])
3831 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3832 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3833
3834 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3835 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3836 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3837 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3838 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3839 }
3840
3841 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3842 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3843 Use TARGET as the target register if nonnull and convenient. */
3844
3845 static rtx
3846 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3847 machine_mode data_mode, rtx op1, rtx op2)
3848 {
3849 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3850 expand_operand ops[5];
3851 create_output_operand (&ops[0], target, pred_mode);
3852 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3853 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3854 create_input_operand (&ops[3], op1, data_mode);
3855 create_input_operand (&ops[4], op2, data_mode);
3856 expand_insn (icode, 5, ops);
3857 return ops[0].value;
3858 }
3859
3860 /* Use a comparison to convert integer vector SRC into MODE, which is
3861 the corresponding SVE predicate mode. Use TARGET for the result
3862 if it's nonnull and convenient. */
3863
3864 rtx
3865 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3866 {
3867 machine_mode src_mode = GET_MODE (src);
3868 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3869 src, CONST0_RTX (src_mode));
3870 }
3871
3872 /* Return the assembly token for svprfop value PRFOP. */
3873
3874 static const char *
3875 svprfop_token (enum aarch64_svprfop prfop)
3876 {
3877 switch (prfop)
3878 {
3879 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3880 AARCH64_FOR_SVPRFOP (CASE)
3881 #undef CASE
3882 case AARCH64_NUM_SVPRFOPS:
3883 break;
3884 }
3885 gcc_unreachable ();
3886 }
3887
3888 /* Return the assembly string for an SVE prefetch operation with
3889 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3890 and that SUFFIX is the format for the remaining operands. */
3891
3892 char *
3893 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3894 const char *suffix)
3895 {
3896 static char buffer[128];
3897 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3898 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3899 mnemonic, svprfop_token (prfop), suffix);
3900 gcc_assert (written < sizeof (buffer));
3901 return buffer;
3902 }
3903
3904 /* Check whether we can calculate the number of elements in PATTERN
3905 at compile time, given that there are NELTS_PER_VQ elements per
3906 128-bit block. Return the value if so, otherwise return -1. */
3907
3908 HOST_WIDE_INT
3909 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3910 {
3911 unsigned int vl, const_vg;
3912 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3913 vl = 1 + (pattern - AARCH64_SV_VL1);
3914 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3915 vl = 16 << (pattern - AARCH64_SV_VL16);
3916 else if (aarch64_sve_vg.is_constant (&const_vg))
3917 {
3918 /* There are two vector granules per quadword. */
3919 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3920 switch (pattern)
3921 {
3922 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3923 case AARCH64_SV_MUL4: return nelts & -4;
3924 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3925 case AARCH64_SV_ALL: return nelts;
3926 default: gcc_unreachable ();
3927 }
3928 }
3929 else
3930 return -1;
3931
3932 /* There are two vector granules per quadword. */
3933 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3934 if (known_le (vl, nelts_all))
3935 return vl;
3936
3937 /* Requesting more elements than are available results in a PFALSE. */
3938 if (known_gt (vl, nelts_all))
3939 return 0;
3940
3941 return -1;
3942 }
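/* For example, with .S elements (NELTS_PER_VQ == 4):

   - AARCH64_SV_VL3 folds to 3 for any vector length.
   - With -msve-vector-bits=512 (aarch64_sve_vg == 8), AARCH64_SV_ALL and
     AARCH64_SV_POW2 fold to 16 and AARCH64_SV_MUL3 folds to 15.
   - AARCH64_SV_VL32 folds to 0 for 512-bit vectors (more elements than
     are available) but to -1 when the vector length is unknown.  */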
3943
3944 /* Return true if we can move VALUE into a register using a single
3945 CNT[BHWD] instruction. */
3946
3947 static bool
3948 aarch64_sve_cnt_immediate_p (poly_int64 value)
3949 {
3950 HOST_WIDE_INT factor = value.coeffs[0];
3951 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3952 return (value.coeffs[1] == factor
3953 && IN_RANGE (factor, 2, 16 * 16)
3954 && (factor & 1) == 0
3955 && factor <= 16 * (factor & -factor));
3956 }
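/* For example, the poly_int64s (2, 2), (16, 16) and (48, 48) are all
   valid (CNTD, CNTB and "CNTB ..., ALL, MUL #3" respectively), whereas
   (3, 3) is rejected for being odd and (34, 34) is rejected because it
   would need a multiplier of 17.  */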
3957
3958 /* Likewise for rtx X. */
3959
3960 bool
3961 aarch64_sve_cnt_immediate_p (rtx x)
3962 {
3963 poly_int64 value;
3964 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3965 }
3966
3967 /* Return the asm string for an instruction with a CNT-like vector size
3968 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3969 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3970 first part of the operands template (the part that comes before the
3971 vector size itself). PATTERN is the pattern to use. FACTOR is the
3972 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3973 in each quadword. If it is zero, we can use any element size. */
3974
3975 static char *
3976 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3977 aarch64_svpattern pattern,
3978 unsigned int factor,
3979 unsigned int nelts_per_vq)
3980 {
3981 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3982
3983 if (nelts_per_vq == 0)
3984 /* There is some overlap in the ranges of the four CNT instructions.
3985 Here we always use the smallest possible element size, so that the
3986 multiplier is 1 wherever possible. */
3987 nelts_per_vq = factor & -factor;
3988 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3989 gcc_assert (IN_RANGE (shift, 1, 4));
3990 char suffix = "dwhb"[shift - 1];
3991
3992 factor >>= shift;
3993 unsigned int written;
3994 if (pattern == AARCH64_SV_ALL && factor == 1)
3995 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3996 prefix, suffix, operands);
3997 else if (factor == 1)
3998 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3999 prefix, suffix, operands, svpattern_token (pattern));
4000 else
4001 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4002 prefix, suffix, operands, svpattern_token (pattern),
4003 factor);
4004 gcc_assert (written < sizeof (buffer));
4005 return buffer;
4006 }
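/* Illustrative examples of the helper above:

     ("cnt", "%x0", AARCH64_SV_ALL, 16, 16)  -> "cntb\t%x0"
     ("cnt", "%x0", AARCH64_SV_VL32, 16, 8)  -> "cnth\t%x0, vl32, mul #2"
     ("inc", "%x0", AARCH64_SV_ALL, 8, 0)    -> "inch\t%x0"  */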
4007
4008 /* Return the asm string for an instruction with a CNT-like vector size
4009 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4010 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4011 first part of the operands template (the part that comes before the
4012 vector size itself). X is the value of the vector size operand,
4013 as a polynomial integer rtx; we need to convert this into an "all"
4014 pattern with a multiplier. */
4015
4016 char *
4017 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4018 rtx x)
4019 {
4020 poly_int64 value = rtx_to_poly_int64 (x);
4021 gcc_assert (aarch64_sve_cnt_immediate_p (value));
4022 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4023 value.coeffs[1], 0);
4024 }
4025
4026 /* Return the asm string for an instruction with a CNT-like vector size
4027 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4028 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4029 first part of the operands template (the part that comes before the
4030 vector size itself). CNT_PAT[0..2] are the operands of the
4031 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4032
4033 char *
4034 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4035 const char *operands, rtx *cnt_pat)
4036 {
4037 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4038 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4039 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4040 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4041 factor, nelts_per_vq);
4042 }
4043
4044 /* Return true if we can add X using a single SVE INC or DEC instruction. */
4045
4046 bool
4047 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4048 {
4049 poly_int64 value;
4050 return (poly_int_rtx_p (x, &value)
4051 && (aarch64_sve_cnt_immediate_p (value)
4052 || aarch64_sve_cnt_immediate_p (-value)));
4053 }
4054
4055 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4056 operand 0. */
4057
4058 char *
4059 aarch64_output_sve_scalar_inc_dec (rtx offset)
4060 {
4061 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4062 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4063 if (offset_value.coeffs[1] > 0)
4064 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4065 offset_value.coeffs[1], 0);
4066 else
4067 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4068 -offset_value.coeffs[1], 0);
4069 }
4070
4071 /* Return true if we can add VALUE to a register using a single ADDVL
4072 or ADDPL instruction. */
4073
4074 static bool
4075 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4076 {
4077 HOST_WIDE_INT factor = value.coeffs[0];
4078 if (factor == 0 || value.coeffs[1] != factor)
4079 return false;
4080 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4081 and a value of 16 is one vector width. */
4082 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4083 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
4084 }
4085
4086 /* Likewise for rtx X. */
4087
4088 bool
4089 aarch64_sve_addvl_addpl_immediate_p (rtx x)
4090 {
4091 poly_int64 value;
4092 return (poly_int_rtx_p (x, &value)
4093 && aarch64_sve_addvl_addpl_immediate_p (value));
4094 }
4095
4096 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4097 to operand 1 and storing the result in operand 0. */
4098
4099 char *
4100 aarch64_output_sve_addvl_addpl (rtx offset)
4101 {
4102 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4103 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4104 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4105
4106 int factor = offset_value.coeffs[1];
4107 if ((factor & 15) == 0)
4108 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4109 else
4110 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4111 return buffer;
4112 }
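/* For example, an offset of (32, 32) prints as "addvl\t%x0, %x1, #2"
   (two full vectors) and (10, 10) prints as "addpl\t%x0, %x1, #5"
   (five predicate widths); odd factors such as (17, 17) are rejected
   by aarch64_sve_addvl_addpl_immediate_p.  */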
4113
4114 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4115 instruction. If it is, store the number of elements in each vector
4116 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4117 factor in *FACTOR_OUT (if nonnull). */
4118
4119 bool
4120 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4121 unsigned int *nelts_per_vq_out)
4122 {
4123 rtx elt;
4124 poly_int64 value;
4125
4126 if (!const_vec_duplicate_p (x, &elt)
4127 || !poly_int_rtx_p (elt, &value))
4128 return false;
4129
4130 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4131 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4132 /* There's no vector INCB. */
4133 return false;
4134
4135 HOST_WIDE_INT factor = value.coeffs[0];
4136 if (value.coeffs[1] != factor)
4137 return false;
4138
4139 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4140 if ((factor % nelts_per_vq) != 0
4141 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4142 return false;
4143
4144 if (factor_out)
4145 *factor_out = factor;
4146 if (nelts_per_vq_out)
4147 *nelts_per_vq_out = nelts_per_vq;
4148 return true;
4149 }
4150
4151 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4152 instruction. */
4153
4154 bool
4155 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4156 {
4157 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4158 }
4159
4160 /* Return the asm template for an SVE vector INC or DEC instruction.
4161 OPERANDS gives the operands before the vector count and X is the
4162 value of the vector count operand itself. */
4163
4164 char *
4165 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4166 {
4167 int factor;
4168 unsigned int nelts_per_vq;
4169 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4170 gcc_unreachable ();
4171 if (factor < 0)
4172 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4173 -factor, nelts_per_vq);
4174 else
4175 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4176 factor, nelts_per_vq);
4177 }
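/* For example, a VNx4SI count that duplicates (8, 8) prints as
   "incw\tOPERANDS, all, mul #2" (four .S elements per quadword, so the
   multiplier is 2), while a duplicated (-4, -4) prints as
   "decw\tOPERANDS".  */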
4178
4179 static int
4180 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4181 scalar_int_mode mode)
4182 {
4183 int i;
4184 unsigned HOST_WIDE_INT val, val2, mask;
4185 int one_match, zero_match;
4186 int num_insns;
4187
4188 val = INTVAL (imm);
4189
4190 if (aarch64_move_imm (val, mode))
4191 {
4192 if (generate)
4193 emit_insn (gen_rtx_SET (dest, imm));
4194 return 1;
4195 }
4196
4197 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4198 (with XXXX non-zero). In that case check to see if the move can be done in
4199 a smaller mode. */
4200 val2 = val & 0xffffffff;
4201 if (mode == DImode
4202 && aarch64_move_imm (val2, SImode)
4203 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4204 {
4205 if (generate)
4206 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4207
4208 /* Check if we have to emit a second instruction by checking to see
4209 if any of the upper 32 bits of the original DI mode value is set. */
4210 if (val == val2)
4211 return 1;
4212
4213 i = (val >> 48) ? 48 : 32;
4214
4215 if (generate)
4216 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4217 GEN_INT ((val >> i) & 0xffff)));
4218
4219 return 2;
4220 }
4221
4222 if ((val >> 32) == 0 || mode == SImode)
4223 {
4224 if (generate)
4225 {
4226 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4227 if (mode == SImode)
4228 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4229 GEN_INT ((val >> 16) & 0xffff)));
4230 else
4231 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4232 GEN_INT ((val >> 16) & 0xffff)));
4233 }
4234 return 2;
4235 }
4236
4237 /* Remaining cases are all for DImode. */
4238
4239 mask = 0xffff;
4240 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4241 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4242 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4243 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4244
4245 if (zero_match != 2 && one_match != 2)
4246 {
4247 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4248 For a 64-bit bitmask try whether changing 16 bits to all ones or
4249 zeroes creates a valid bitmask. To check any repeated bitmask,
4250 try using 16 bits from the other 32-bit half of val. */
4251
4252 for (i = 0; i < 64; i += 16, mask <<= 16)
4253 {
4254 val2 = val & ~mask;
4255 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4256 break;
4257 val2 = val | mask;
4258 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4259 break;
4260 val2 = val2 & ~mask;
4261 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4262 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4263 break;
4264 }
4265 if (i != 64)
4266 {
4267 if (generate)
4268 {
4269 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4270 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4271 GEN_INT ((val >> i) & 0xffff)));
4272 }
4273 return 2;
4274 }
4275 }
4276
4277 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4278 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4279 otherwise skip zero bits. */
4280
4281 num_insns = 1;
4282 mask = 0xffff;
4283 val2 = one_match > zero_match ? ~val : val;
4284 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4285
4286 if (generate)
4287 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4288 ? (val | ~(mask << i))
4289 : (val & (mask << i)))));
4290 for (i += 16; i < 64; i += 16)
4291 {
4292 if ((val2 & (mask << i)) == 0)
4293 continue;
4294 if (generate)
4295 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4296 GEN_INT ((val >> i) & 0xffff)));
4297 num_insns ++;
4298 }
4299
4300 return num_insns;
4301 }
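/* For example, 0x0000123400005678 in DImode is handled by the 32-bit
   sub-case above: a MOV of 0x5678 followed by a MOVK of 0x1234 into bits
   [47:32], i.e. 2 instructions.  A constant with no all-zero or all-one
   16-bit chunk and no usable bitmask falls through to the final loop,
   which costs at most 4 instructions (one MOVZ/MOVN plus up to three
   MOVKs).  */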
4302
4303 /* Return whether imm is a 128-bit immediate which is simple enough to
4304 expand inline. */
4305 bool
4306 aarch64_mov128_immediate (rtx imm)
4307 {
4308 if (CONST_INT_P (imm))
4309 return true;
4310
4311 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4312
4313 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4314 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4315
4316 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4317 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4318 }
4319
4320
4321 /* Return the number of temporary registers that aarch64_add_offset_1
4322 would need to add OFFSET to a register. */
4323
4324 static unsigned int
4325 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4326 {
4327 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4328 }
4329
4330 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4331 a non-polynomial OFFSET. MODE is the mode of the addition.
4332 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4333 be set and CFA adjustments added to the generated instructions.
4334
4335 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4336 temporary if register allocation is already complete. This temporary
4337 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4338 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4339 the immediate again.
4340
4341 Since this function may be used to adjust the stack pointer, we must
4342 ensure that it cannot cause transient stack deallocation (for example
4343 by first incrementing SP and then decrementing when adjusting by a
4344 large immediate). */
4345
4346 static void
4347 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4348 rtx src, HOST_WIDE_INT offset, rtx temp1,
4349 bool frame_related_p, bool emit_move_imm)
4350 {
4351 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4352 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4353
4354 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4355 rtx_insn *insn;
4356
4357 if (!moffset)
4358 {
4359 if (!rtx_equal_p (dest, src))
4360 {
4361 insn = emit_insn (gen_rtx_SET (dest, src));
4362 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4363 }
4364 return;
4365 }
4366
4367 /* Single instruction adjustment. */
4368 if (aarch64_uimm12_shift (moffset))
4369 {
4370 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4371 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4372 return;
4373 }
4374
4375 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4376 and either:
4377
4378 a) the offset cannot be loaded by a 16-bit move or
4379 b) there is no spare register into which we can move it. */
4380 if (moffset < 0x1000000
4381 && ((!temp1 && !can_create_pseudo_p ())
4382 || !aarch64_move_imm (moffset, mode)))
4383 {
4384 HOST_WIDE_INT low_off = moffset & 0xfff;
4385
4386 low_off = offset < 0 ? -low_off : low_off;
4387 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4388 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4389 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4390 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4391 return;
4392 }
4393
4394 /* Emit a move immediate if required and an addition/subtraction. */
4395 if (emit_move_imm)
4396 {
4397 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4398 temp1 = aarch64_force_temporary (mode, temp1,
4399 gen_int_mode (moffset, mode));
4400 }
4401 insn = emit_insn (offset < 0
4402 ? gen_sub3_insn (dest, src, temp1)
4403 : gen_add3_insn (dest, src, temp1));
4404 if (frame_related_p)
4405 {
4406 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4407 rtx adj = plus_constant (mode, src, offset);
4408 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4409 }
4410 }
4411
4412 /* Return the number of temporary registers that aarch64_add_offset
4413 would need to move OFFSET into a register or add OFFSET to a register;
4414 ADD_P is true if we want the latter rather than the former. */
4415
4416 static unsigned int
4417 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4418 {
4419 /* This follows the same structure as aarch64_add_offset. */
4420 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4421 return 0;
4422
4423 unsigned int count = 0;
4424 HOST_WIDE_INT factor = offset.coeffs[1];
4425 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4426 poly_int64 poly_offset (factor, factor);
4427 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4428 /* Need one register for the ADDVL/ADDPL result. */
4429 count += 1;
4430 else if (factor != 0)
4431 {
4432 factor = abs (factor);
4433 if (factor > 16 * (factor & -factor))
4434 /* Need one register for the CNT result and one for the multiplication
4435 factor. If necessary, the second temporary can be reused for the
4436 constant part of the offset. */
4437 return 2;
4438 /* Need one register for the CNT result (which might then
4439 be shifted). */
4440 count += 1;
4441 }
4442 return count + aarch64_add_offset_1_temporaries (constant);
4443 }
4444
4445 /* If X can be represented as a poly_int64, return the number
4446 of temporaries that are required to add it to a register.
4447 Return -1 otherwise. */
4448
4449 int
4450 aarch64_add_offset_temporaries (rtx x)
4451 {
4452 poly_int64 offset;
4453 if (!poly_int_rtx_p (x, &offset))
4454 return -1;
4455 return aarch64_offset_temporaries (true, offset);
4456 }
4457
4458 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4459 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4460 be set and CFA adjustments added to the generated instructions.
4461
4462 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4463 temporary if register allocation is already complete. This temporary
4464 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4465 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4466 false to avoid emitting the immediate again.
4467
4468 TEMP2, if nonnull, is a second temporary register that doesn't
4469 overlap either DEST or SRC.
4470
4471 Since this function may be used to adjust the stack pointer, we must
4472 ensure that it cannot cause transient stack deallocation (for example
4473 by first incrementing SP and then decrementing when adjusting by a
4474 large immediate). */
4475
4476 static void
4477 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4478 poly_int64 offset, rtx temp1, rtx temp2,
4479 bool frame_related_p, bool emit_move_imm = true)
4480 {
4481 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4482 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4483 gcc_assert (temp1 == NULL_RTX
4484 || !frame_related_p
4485 || !reg_overlap_mentioned_p (temp1, dest));
4486 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4487
4488 /* Try using ADDVL or ADDPL to add the whole value. */
4489 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4490 {
4491 rtx offset_rtx = gen_int_mode (offset, mode);
4492 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4493 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4494 return;
4495 }
4496
4497 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4498 SVE vector register, over and above the minimum size of 128 bits.
4499 This is equivalent to half the value returned by CNTD with a
4500 vector shape of ALL. */
4501 HOST_WIDE_INT factor = offset.coeffs[1];
4502 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4503
4504 /* Try using ADDVL or ADDPL to add the VG-based part. */
4505 poly_int64 poly_offset (factor, factor);
4506 if (src != const0_rtx
4507 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4508 {
4509 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4510 if (frame_related_p)
4511 {
4512 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4513 RTX_FRAME_RELATED_P (insn) = true;
4514 src = dest;
4515 }
4516 else
4517 {
4518 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4519 src = aarch64_force_temporary (mode, temp1, addr);
4520 temp1 = temp2;
4521 temp2 = NULL_RTX;
4522 }
4523 }
4524 /* Otherwise use a CNT-based sequence. */
4525 else if (factor != 0)
4526 {
4527 /* Use a subtraction if we have a negative factor. */
4528 rtx_code code = PLUS;
4529 if (factor < 0)
4530 {
4531 factor = -factor;
4532 code = MINUS;
4533 }
4534
4535 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4536 into the multiplication. */
4537 rtx val;
4538 int shift = 0;
4539 if (factor & 1)
4540 /* Use a right shift by 1. */
4541 shift = -1;
4542 else
4543 factor /= 2;
4544 HOST_WIDE_INT low_bit = factor & -factor;
4545 if (factor <= 16 * low_bit)
4546 {
4547 if (factor > 16 * 8)
4548 {
4549 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4550 the value with the minimum multiplier and shift it into
4551 position. */
4552 int extra_shift = exact_log2 (low_bit);
4553 shift += extra_shift;
4554 factor >>= extra_shift;
4555 }
4556 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4557 }
4558 else
4559 {
4560 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4561 directly, since that should increase the chances of being
4562 able to use a shift and add sequence. If LOW_BIT itself
4563 is out of range, just use CNTD. */
4564 if (low_bit <= 16 * 8)
4565 factor /= low_bit;
4566 else
4567 low_bit = 1;
4568
4569 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4570 val = aarch64_force_temporary (mode, temp1, val);
4571
4572 if (can_create_pseudo_p ())
4573 {
4574 rtx coeff1 = gen_int_mode (factor, mode);
4575 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4576 }
4577 else
4578 {
4579 /* Go back to using a negative multiplication factor if we have
4580 no register from which to subtract. */
4581 if (code == MINUS && src == const0_rtx)
4582 {
4583 factor = -factor;
4584 code = PLUS;
4585 }
4586 rtx coeff1 = gen_int_mode (factor, mode);
4587 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4588 val = gen_rtx_MULT (mode, val, coeff1);
4589 }
4590 }
4591
4592 if (shift > 0)
4593 {
4594 /* Multiply by 1 << SHIFT. */
4595 val = aarch64_force_temporary (mode, temp1, val);
4596 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4597 }
4598 else if (shift == -1)
4599 {
4600 /* Divide by 2. */
4601 val = aarch64_force_temporary (mode, temp1, val);
4602 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4603 }
4604
4605 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4606 if (src != const0_rtx)
4607 {
4608 val = aarch64_force_temporary (mode, temp1, val);
4609 val = gen_rtx_fmt_ee (code, mode, src, val);
4610 }
4611 else if (code == MINUS)
4612 {
4613 val = aarch64_force_temporary (mode, temp1, val);
4614 val = gen_rtx_NEG (mode, val);
4615 }
4616
4617 if (constant == 0 || frame_related_p)
4618 {
4619 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4620 if (frame_related_p)
4621 {
4622 RTX_FRAME_RELATED_P (insn) = true;
4623 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4624 gen_rtx_SET (dest, plus_constant (Pmode, src,
4625 poly_offset)));
4626 }
4627 src = dest;
4628 if (constant == 0)
4629 return;
4630 }
4631 else
4632 {
4633 src = aarch64_force_temporary (mode, temp1, val);
4634 temp1 = temp2;
4635 temp2 = NULL_RTX;
4636 }
4637
4638 emit_move_imm = true;
4639 }
4640
4641 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4642 frame_related_p, emit_move_imm);
4643 }
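/* For example, adding the poly_int64 (100, 2) to a register splits into
   an ADDPL #1 for the VG-based part (2, 2) followed by an ADD of the
   remaining constant 98; adding (48, 48) is a single ADDVL #3; an odd
   factor cannot use ADDVL/ADDPL and goes through the CNT-based sequence
   above instead.  */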
4644
4645 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4646 than a poly_int64. */
4647
4648 void
4649 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4650 rtx offset_rtx, rtx temp1, rtx temp2)
4651 {
4652 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4653 temp1, temp2, false);
4654 }
4655
4656 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4657 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4658 if TEMP1 already contains abs (DELTA). */
4659
4660 static inline void
4661 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4662 {
4663 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4664 temp1, temp2, true, emit_move_imm);
4665 }
4666
4667 /* Subtract DELTA from the stack pointer, marking the instructions
4668 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4669 if nonnull. */
4670
4671 static inline void
4672 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4673 bool emit_move_imm = true)
4674 {
4675 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4676 temp1, temp2, frame_related_p, emit_move_imm);
4677 }
4678
4679 /* Set DEST to (vec_series BASE STEP). */
4680
4681 static void
4682 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4683 {
4684 machine_mode mode = GET_MODE (dest);
4685 scalar_mode inner = GET_MODE_INNER (mode);
4686
4687 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4688 if (!aarch64_sve_index_immediate_p (base))
4689 base = force_reg (inner, base);
4690 if (!aarch64_sve_index_immediate_p (step))
4691 step = force_reg (inner, step);
4692
4693 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4694 }
4695
4696 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4697 register of mode MODE. Use TARGET for the result if it's nonnull
4698 and convenient.
4699
4700 The two vector modes must have the same element mode. The behavior
4701 is to duplicate architectural lane N of SRC into architectural lanes
4702 N + I * STEP of the result. On big-endian targets, architectural
4703 lane 0 of an Advanced SIMD vector is the last element of the vector
4704 in memory layout, so for big-endian targets this operation has the
4705 effect of reversing SRC before duplicating it. Callers need to
4706 account for this. */
4707
4708 rtx
4709 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4710 {
4711 machine_mode src_mode = GET_MODE (src);
4712 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4713 insn_code icode = (BYTES_BIG_ENDIAN
4714 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4715 : code_for_aarch64_vec_duplicate_vq_le (mode));
4716
4717 unsigned int i = 0;
4718 expand_operand ops[3];
4719 create_output_operand (&ops[i++], target, mode);
4720 create_output_operand (&ops[i++], src, src_mode);
4721 if (BYTES_BIG_ENDIAN)
4722 {
4723 /* Create a PARALLEL describing the reversal of SRC. */
4724 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4725 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4726 nelts_per_vq - 1, -1);
4727 create_fixed_operand (&ops[i++], sel);
4728 }
4729 expand_insn (icode, i, ops);
4730 return ops[0].value;
4731 }
4732
4733 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4734 the memory image into DEST. Return true on success. */
4735
4736 static bool
4737 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4738 {
4739 src = force_const_mem (GET_MODE (src), src);
4740 if (!src)
4741 return false;
4742
4743 /* Make sure that the address is legitimate. */
4744 if (!aarch64_sve_ld1rq_operand_p (src))
4745 {
4746 rtx addr = force_reg (Pmode, XEXP (src, 0));
4747 src = replace_equiv_address (src, addr);
4748 }
4749
4750 machine_mode mode = GET_MODE (dest);
4751 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4752 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4753 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4754 return true;
4755 }
4756
4757 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4758 SVE data mode and isn't a legitimate constant. Use TARGET for the
4759 result if convenient.
4760
4761 The returned register can have whatever mode seems most natural
4762 given the contents of SRC. */
4763
4764 static rtx
4765 aarch64_expand_sve_const_vector (rtx target, rtx src)
4766 {
4767 machine_mode mode = GET_MODE (src);
4768 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4769 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4770 scalar_mode elt_mode = GET_MODE_INNER (mode);
4771 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4772 unsigned int container_bits = aarch64_sve_container_bits (mode);
4773 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4774
4775 if (nelts_per_pattern == 1
4776 && encoded_bits <= 128
4777 && container_bits != elt_bits)
4778 {
4779 /* We have a partial vector mode and a constant whose full-vector
4780 equivalent would occupy a repeating 128-bit sequence. Build that
4781 full-vector equivalent instead, so that we have the option of
4782 using LD1RQ and Advanced SIMD operations. */
4783 unsigned int repeat = container_bits / elt_bits;
4784 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4785 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4786 for (unsigned int i = 0; i < npatterns; ++i)
4787 for (unsigned int j = 0; j < repeat; ++j)
4788 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4789 target = aarch64_target_reg (target, full_mode);
4790 return aarch64_expand_sve_const_vector (target, builder.build ());
4791 }
4792
4793 if (nelts_per_pattern == 1 && encoded_bits == 128)
4794 {
4795 /* The constant is a duplicated quadword but can't be narrowed
4796 beyond a quadword. Get the memory image of the first quadword
4797 as a 128-bit vector and try using LD1RQ to load it from memory.
4798
4799 The effect for both endiannesses is to load memory lane N into
4800 architectural lanes N + I * STEP of the result. On big-endian
4801 targets, the layout of the 128-bit vector in an Advanced SIMD
4802 register would be different from its layout in an SVE register,
4803 but this 128-bit vector is a memory value only. */
4804 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4805 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4806 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4807 return target;
4808 }
4809
4810 if (nelts_per_pattern == 1 && encoded_bits < 128)
4811 {
4812 /* The vector is a repeating sequence of 64 bits or fewer.
4813 See if we can load them using an Advanced SIMD move and then
4814 duplicate it to fill a vector. This is better than using a GPR
4815 move because it keeps everything in the same register file. */
4816 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4817 rtx_vector_builder builder (vq_mode, npatterns, 1);
4818 for (unsigned int i = 0; i < npatterns; ++i)
4819 {
4820 /* We want memory lane N to go into architectural lane N,
4821 so reverse for big-endian targets. The DUP .Q pattern
4822 has a compensating reverse built-in. */
4823 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4824 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4825 }
4826 rtx vq_src = builder.build ();
4827 if (aarch64_simd_valid_immediate (vq_src, NULL))
4828 {
4829 vq_src = force_reg (vq_mode, vq_src);
4830 return aarch64_expand_sve_dupq (target, mode, vq_src);
4831 }
4832
4833 /* Get an integer representation of the repeating part of Advanced
4834 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4835 which for big-endian targets is lane-swapped wrt a normal
4836 Advanced SIMD vector. This means that for both endiannesses,
4837 memory lane N of SVE vector SRC corresponds to architectural
4838 lane N of a register holding VQ_SRC. This in turn means that
4839 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4840 as a single 128-bit value) and thus that memory lane 0 of SRC is
4841 in the lsb of the integer. Duplicating the integer therefore
4842 ensures that memory lane N of SRC goes into architectural lane
4843 N + I * INDEX of the SVE register. */
4844 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4845 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4846 if (elt_value)
4847 {
4848 /* Pretend that we had a vector of INT_MODE to start with. */
4849 elt_mode = int_mode;
4850 mode = aarch64_full_sve_mode (int_mode).require ();
4851
4852 /* If the integer can be moved into a general register by a
4853 single instruction, do that and duplicate the result. */
4854 if (CONST_INT_P (elt_value)
4855 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4856 {
4857 elt_value = force_reg (elt_mode, elt_value);
4858 return expand_vector_broadcast (mode, elt_value);
4859 }
4860 }
4861 else if (npatterns == 1)
4862 /* We're duplicating a single value, but can't do better than
4863 force it to memory and load from there. This handles things
4864 like symbolic constants. */
4865 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4866
4867 if (elt_value)
4868 {
4869 /* Load the element from memory if we can, otherwise move it into
4870 a register and use a DUP. */
4871 rtx op = force_const_mem (elt_mode, elt_value);
4872 if (!op)
4873 op = force_reg (elt_mode, elt_value);
4874 return expand_vector_broadcast (mode, op);
4875 }
4876 }
4877
4878 /* Try using INDEX. */
4879 rtx base, step;
4880 if (const_vec_series_p (src, &base, &step))
4881 {
4882 aarch64_expand_vec_series (target, base, step);
4883 return target;
4884 }
4885
4886 /* From here on, it's better to force the whole constant to memory
4887 if we can. */
4888 if (GET_MODE_NUNITS (mode).is_constant ())
4889 return NULL_RTX;
4890
4891 /* Expand each pattern individually. */
4892 gcc_assert (npatterns > 1);
4893 rtx_vector_builder builder;
4894 auto_vec<rtx, 16> vectors (npatterns);
4895 for (unsigned int i = 0; i < npatterns; ++i)
4896 {
4897 builder.new_vector (mode, 1, nelts_per_pattern);
4898 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4899 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4900 vectors.quick_push (force_reg (mode, builder.build ()));
4901 }
4902
4903 /* Use permutes to interleave the separate vectors. */
4904 while (npatterns > 1)
4905 {
4906 npatterns /= 2;
4907 for (unsigned int i = 0; i < npatterns; ++i)
4908 {
4909 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4910 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4911 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4912 vectors[i] = tmp;
4913 }
4914 }
4915 gcc_assert (vectors[0] == target);
4916 return target;
4917 }
4918
4919 /* Use WHILE to set a predicate register of mode MODE in which the first
4920 VL bits are set and the rest are clear. Use TARGET for the register
4921 if it's nonnull and convenient. */
4922
4923 static rtx
4924 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4925 unsigned int vl)
4926 {
4927 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4928 target = aarch64_target_reg (target, mode);
4929 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4930 target, const0_rtx, limit));
4931 return target;
4932 }
4933
4934 static rtx
4935 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4936
4937 /* BUILDER is a constant predicate in which the index of every set bit
4938 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4939 by inverting every element at a multiple of ELT_SIZE and EORing the
4940 result with an ELT_SIZE PTRUE.
4941
4942 Return a register that contains the constant on success, otherwise
4943 return null. Use TARGET as the register if it is nonnull and
4944 convenient. */
4945
4946 static rtx
4947 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4948 unsigned int elt_size)
4949 {
4950 /* Invert every element at a multiple of ELT_SIZE, keeping the
4951 other bits zero. */
4952 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4953 builder.nelts_per_pattern ());
4954 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4955 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4956 inv_builder.quick_push (const1_rtx);
4957 else
4958 inv_builder.quick_push (const0_rtx);
4959 inv_builder.finalize ();
4960
4961 /* See if we can load the constant cheaply. */
4962 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4963 if (!inv)
4964 return NULL_RTX;
4965
4966 /* EOR the result with an ELT_SIZE PTRUE. */
4967 rtx mask = aarch64_ptrue_all (elt_size);
4968 mask = force_reg (VNx16BImode, mask);
4969 inv = gen_lowpart (VNx16BImode, inv);
4970 target = aarch64_target_reg (target, VNx16BImode);
4971 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4972 return target;
4973 }
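/* For example, a .B predicate that is all-true except for element 0 has
   an inverse in which only element 0 is set; that inverse is just a VL1
   PTRUE, and EORing it with a .B PTRUE ALL recreates the original
   constant.  */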
4974
4975 /* BUILDER is a constant predicate in which the index of every set bit
4976 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4977 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4978 register on success, otherwise return null. Use TARGET as the register
4979 if nonnull and convenient. */
4980
4981 static rtx
4982 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4983 unsigned int elt_size,
4984 unsigned int permute_size)
4985 {
4986 /* We're going to split the constant into two new constants A and B,
4987 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4988 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4989
4990 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4991 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4992
4993 where _ indicates elements that will be discarded by the permute.
4994
4995 First calculate the ELT_SIZEs for A and B. */
4996 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4997 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4998 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4999 if (INTVAL (builder.elt (i)) != 0)
5000 {
5001 if (i & permute_size)
5002 b_elt_size |= i - permute_size;
5003 else
5004 a_elt_size |= i;
5005 }
5006 a_elt_size &= -a_elt_size;
5007 b_elt_size &= -b_elt_size;
5008
5009 /* Now construct the vectors themselves. */
5010 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5011 builder.nelts_per_pattern ());
5012 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5013 builder.nelts_per_pattern ());
5014 unsigned int nelts = builder.encoded_nelts ();
5015 for (unsigned int i = 0; i < nelts; ++i)
5016 if (i & (elt_size - 1))
5017 {
5018 a_builder.quick_push (const0_rtx);
5019 b_builder.quick_push (const0_rtx);
5020 }
5021 else if ((i & permute_size) == 0)
5022 {
5023 /* The A and B elements are significant. */
5024 a_builder.quick_push (builder.elt (i));
5025 b_builder.quick_push (builder.elt (i + permute_size));
5026 }
5027 else
5028 {
5029 /* The A and B elements are going to be discarded, so pick whatever
5030 is likely to give a nice constant. We are targeting element
5031 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5032 with the aim of each being a sequence of ones followed by
5033 a sequence of zeros. So:
5034
5035 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5036 duplicate the last X_ELT_SIZE element, to extend the
5037 current sequence of ones or zeros.
5038
5039 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5040 zero, so that the constant really does have X_ELT_SIZE and
5041 not a smaller size. */
5042 if (a_elt_size > permute_size)
5043 a_builder.quick_push (const0_rtx);
5044 else
5045 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5046 if (b_elt_size > permute_size)
5047 b_builder.quick_push (const0_rtx);
5048 else
5049 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5050 }
5051 a_builder.finalize ();
5052 b_builder.finalize ();
5053
5054 /* Try loading A into a register. */
5055 rtx_insn *last = get_last_insn ();
5056 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5057 if (!a)
5058 return NULL_RTX;
5059
5060 /* Try loading B into a register. */
5061 rtx b = a;
5062 if (a_builder != b_builder)
5063 {
5064 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5065 if (!b)
5066 {
5067 delete_insns_since (last);
5068 return NULL_RTX;
5069 }
5070 }
5071
5072 /* Emit the TRN1 itself. */
5073 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5074 target = aarch64_target_reg (target, mode);
5075 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
5076 gen_lowpart (mode, a),
5077 gen_lowpart (mode, b)));
5078 return target;
5079 }
5080
5081 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5082 constant in BUILDER into an SVE predicate register. Return the register
5083 on success, otherwise return null. Use TARGET for the register if
5084 nonnull and convenient.
5085
5086 ALLOW_RECURSE_P is true if we can use methods that would call this
5087 function recursively. */
5088
5089 static rtx
5090 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5091 bool allow_recurse_p)
5092 {
5093 if (builder.encoded_nelts () == 1)
5094 /* A PFALSE or a PTRUE .B ALL. */
5095 return aarch64_emit_set_immediate (target, builder);
5096
5097 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5098 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5099 {
5100 /* If we can load the constant using PTRUE, use it as-is. */
5101 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5102 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5103 return aarch64_emit_set_immediate (target, builder);
5104
5105 /* Otherwise use WHILE to set the first VL bits. */
5106 return aarch64_sve_move_pred_via_while (target, mode, vl);
5107 }
5108
5109 if (!allow_recurse_p)
5110 return NULL_RTX;
5111
5112 /* Try inverting the vector in element size ELT_SIZE and then EORing
5113 the result with an ELT_SIZE PTRUE. */
5114 if (INTVAL (builder.elt (0)) == 0)
5115 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5116 elt_size))
5117 return res;
5118
5119 /* Try using TRN1 to permute two simpler constants. */
5120 for (unsigned int i = elt_size; i <= 8; i *= 2)
5121 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5122 elt_size, i))
5123 return res;
5124
5125 return NULL_RTX;
5126 }
5127
5128 /* Return an SVE predicate register that contains the VNx16BImode
5129 constant in BUILDER, without going through the move expanders.
5130
5131 The returned register can have whatever mode seems most natural
5132 given the contents of BUILDER. Use TARGET for the result if
5133 convenient. */
5134
5135 static rtx
5136 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5137 {
5138 /* Try loading the constant using pure predicate operations. */
5139 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5140 return res;
5141
5142 /* Try forcing the constant to memory. */
5143 if (builder.full_nelts ().is_constant ())
5144 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5145 {
5146 target = aarch64_target_reg (target, VNx16BImode);
5147 emit_move_insn (target, mem);
5148 return target;
5149 }
5150
5151 /* The last resort is to load the constant as an integer and then
5152 compare it against zero. Use -1 for set bits in order to increase
5153 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5154 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5155 builder.nelts_per_pattern ());
5156 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5157 int_builder.quick_push (INTVAL (builder.elt (i))
5158 ? constm1_rtx : const0_rtx);
5159 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5160 int_builder.build ());
5161 }
5162
5163 /* Set DEST to immediate IMM. */
5164
5165 void
5166 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5167 {
5168 machine_mode mode = GET_MODE (dest);
5169
5170 /* Check on what type of symbol it is. */
5171 scalar_int_mode int_mode;
5172 if ((SYMBOL_REF_P (imm)
5173 || LABEL_REF_P (imm)
5174 || GET_CODE (imm) == CONST
5175 || GET_CODE (imm) == CONST_POLY_INT)
5176 && is_a <scalar_int_mode> (mode, &int_mode))
5177 {
5178 rtx mem;
5179 poly_int64 offset;
5180 HOST_WIDE_INT const_offset;
5181 enum aarch64_symbol_type sty;
5182
5183 /* If we have (const (plus symbol offset)), separate out the offset
5184 before we start classifying the symbol. */
5185 rtx base = strip_offset (imm, &offset);
5186
5187 /* We must always add an offset involving VL separately, rather than
5188 folding it into the relocation. */
5189 if (!offset.is_constant (&const_offset))
5190 {
5191 if (!TARGET_SVE)
5192 {
5193 aarch64_report_sve_required ();
5194 return;
5195 }
5196 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5197 emit_insn (gen_rtx_SET (dest, imm));
5198 else
5199 {
5200 /* Do arithmetic on 32-bit values if the result is smaller
5201 than that. */
5202 if (partial_subreg_p (int_mode, SImode))
5203 {
5204 /* It is invalid to do symbol calculations in modes
5205 narrower than SImode. */
5206 gcc_assert (base == const0_rtx);
5207 dest = gen_lowpart (SImode, dest);
5208 int_mode = SImode;
5209 }
5210 if (base != const0_rtx)
5211 {
5212 base = aarch64_force_temporary (int_mode, dest, base);
5213 aarch64_add_offset (int_mode, dest, base, offset,
5214 NULL_RTX, NULL_RTX, false);
5215 }
5216 else
5217 aarch64_add_offset (int_mode, dest, base, offset,
5218 dest, NULL_RTX, false);
5219 }
5220 return;
5221 }
5222
5223 sty = aarch64_classify_symbol (base, const_offset);
5224 switch (sty)
5225 {
5226 case SYMBOL_FORCE_TO_MEM:
5227 if (const_offset != 0
5228 && targetm.cannot_force_const_mem (int_mode, imm))
5229 {
5230 gcc_assert (can_create_pseudo_p ());
5231 base = aarch64_force_temporary (int_mode, dest, base);
5232 aarch64_add_offset (int_mode, dest, base, const_offset,
5233 NULL_RTX, NULL_RTX, false);
5234 return;
5235 }
5236
5237 mem = force_const_mem (ptr_mode, imm);
5238 gcc_assert (mem);
5239
5240 /* If we aren't generating PC relative literals, then
5241 we need to expand the literal pool access carefully.
5242 This is something that needs to be done in a number
5243 of places, so could well live as a separate function. */
5244 if (!aarch64_pcrelative_literal_loads)
5245 {
5246 gcc_assert (can_create_pseudo_p ());
5247 base = gen_reg_rtx (ptr_mode);
5248 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5249 if (ptr_mode != Pmode)
5250 base = convert_memory_address (Pmode, base);
5251 mem = gen_rtx_MEM (ptr_mode, base);
5252 }
5253
5254 if (int_mode != ptr_mode)
5255 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5256
5257 emit_insn (gen_rtx_SET (dest, mem));
5258
5259 return;
5260
5261 case SYMBOL_SMALL_TLSGD:
5262 case SYMBOL_SMALL_TLSDESC:
5263 case SYMBOL_SMALL_TLSIE:
5264 case SYMBOL_SMALL_GOT_28K:
5265 case SYMBOL_SMALL_GOT_4G:
5266 case SYMBOL_TINY_GOT:
5267 case SYMBOL_TINY_TLSIE:
5268 if (const_offset != 0)
5269 {
5270 	    gcc_assert (can_create_pseudo_p ());
5271 base = aarch64_force_temporary (int_mode, dest, base);
5272 aarch64_add_offset (int_mode, dest, base, const_offset,
5273 NULL_RTX, NULL_RTX, false);
5274 return;
5275 }
5276 /* FALLTHRU */
5277
5278 case SYMBOL_SMALL_ABSOLUTE:
5279 case SYMBOL_TINY_ABSOLUTE:
5280 case SYMBOL_TLSLE12:
5281 case SYMBOL_TLSLE24:
5282 case SYMBOL_TLSLE32:
5283 case SYMBOL_TLSLE48:
5284 aarch64_load_symref_appropriately (dest, imm, sty);
5285 return;
5286
5287 default:
5288 gcc_unreachable ();
5289 }
5290 }
5291
5292 if (!CONST_INT_P (imm))
5293 {
5294 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5295 {
5296 /* Only the low bit of each .H, .S and .D element is defined,
5297 so we can set the upper bits to whatever we like. If the
5298 predicate is all-true in MODE, prefer to set all the undefined
5299 bits as well, so that we can share a single .B predicate for
5300 all modes. */
5301 if (imm == CONSTM1_RTX (mode))
5302 imm = CONSTM1_RTX (VNx16BImode);
5303
5304 /* All methods for constructing predicate modes wider than VNx16BI
5305 will set the upper bits of each element to zero. Expose this
5306 by moving such constants as a VNx16BI, so that all bits are
5307 significant and so that constants for different modes can be
5308 shared. The wider constant will still be available as a
5309 REG_EQUAL note. */
5310 rtx_vector_builder builder;
5311 if (aarch64_get_sve_pred_bits (builder, imm))
5312 {
5313 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5314 if (dest != res)
5315 emit_move_insn (dest, gen_lowpart (mode, res));
5316 return;
5317 }
5318 }
5319
5320 if (GET_CODE (imm) == HIGH
5321 || aarch64_simd_valid_immediate (imm, NULL))
5322 {
5323 emit_insn (gen_rtx_SET (dest, imm));
5324 return;
5325 }
5326
5327 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5328 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5329 {
5330 if (dest != res)
5331 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5332 return;
5333 }
5334
5335 rtx mem = force_const_mem (mode, imm);
5336 gcc_assert (mem);
5337 emit_move_insn (dest, mem);
5338 return;
5339 }
5340
5341 aarch64_internal_mov_immediate (dest, imm, true,
5342 as_a <scalar_int_mode> (mode));
5343 }
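/* As an illustration of the SYMBOL_SMALL_ABSOLUTE case above: moving the
   address of a symbol "foo" into a DImode register under the small code
   model is typically emitted as an ADRP of the 4k page followed by an ADD
   of the low 12 bits, roughly:

	adrp	x0, foo
	add	x0, x0, :lo12:foo

   (the register choice and relocation syntax here are illustrative only;
   the exact sequence comes from aarch64_load_symref_appropriately).  */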
5344
5345 /* Return the MEM rtx that provides the canary value that should be used
5346 for stack-smashing protection. MODE is the mode of the memory.
5347 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5348 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
5349 indicates whether the caller is performing a SET or a TEST operation. */
5350
5351 rtx
5352 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
5353 aarch64_salt_type salt_type)
5354 {
5355 rtx addr;
5356 if (aarch64_stack_protector_guard == SSP_GLOBAL)
5357 {
5358 gcc_assert (MEM_P (decl_rtl));
5359 addr = XEXP (decl_rtl, 0);
5360 poly_int64 offset;
5361 rtx base = strip_offset_and_salt (addr, &offset);
5362 if (!SYMBOL_REF_P (base))
5363 return decl_rtl;
5364
5365 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
5366 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
5367 addr = gen_rtx_CONST (Pmode, addr);
5368 addr = plus_constant (Pmode, addr, offset);
5369 }
5370 else
5371 {
5372 /* Calculate the address from the system register. */
5373 rtx salt = GEN_INT (salt_type);
5374 addr = gen_reg_rtx (mode);
5375 if (mode == DImode)
5376 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
5377 else
5378 {
5379 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
5380 addr = convert_memory_address (Pmode, addr);
5381 }
5382 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
5383 }
5384 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
5385 }
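/* For example, with the system-register form of stack protection
   (the -mstack-protector-guard=sysreg family of options, together with
   -mstack-protector-guard-reg= and -mstack-protector-guard-offset=),
   the canary above is read from the given offset of the address held in
   the named system register (e.g. SP_EL0 for a per-task canary), rather
   than from the global __stack_chk_guard; this description is a sketch
   based on the code above rather than a statement of the option set.  */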
5386
5387 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5388 that is known to contain PTRUE. */
5389
5390 void
5391 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5392 {
5393 expand_operand ops[3];
5394 machine_mode mode = GET_MODE (dest);
5395 create_output_operand (&ops[0], dest, mode);
5396   create_input_operand (&ops[1], pred, GET_MODE (pred));
5397 create_input_operand (&ops[2], src, mode);
5398 temporary_volatile_ok v (true);
5399 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5400 }
5401
5402 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5403 operand is in memory. In this case we need to use the predicated LD1
5404 and ST1 instead of LDR and STR, both for correctness on big-endian
5405 targets and because LD1 and ST1 support a wider range of addressing modes.
5406 PRED_MODE is the mode of the predicate.
5407
5408 See the comment at the head of aarch64-sve.md for details about the
5409 big-endian handling. */
5410
5411 void
5412 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5413 {
5414 machine_mode mode = GET_MODE (dest);
5415 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5416 if (!register_operand (src, mode)
5417 && !register_operand (dest, mode))
5418 {
5419 rtx tmp = gen_reg_rtx (mode);
5420 if (MEM_P (src))
5421 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5422 else
5423 emit_move_insn (tmp, src);
5424 src = tmp;
5425 }
5426 aarch64_emit_sve_pred_move (dest, ptrue, src);
5427 }
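/* A sketch of the expansion above for a VNx16QI copy whose source and
   destination are both in memory (register numbers are illustrative only):

	ptrue	p0.b, all
	ld1b	z0.b, p0/z, [x0]
	st1b	z0.b, p0, [x1]

   i.e. the load goes through a temporary vector register and both memory
   accesses are predicated on an all-true predicate.  */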
5428
5429 /* Called only on big-endian targets. See whether an SVE vector move
5430 from SRC to DEST is effectively a REV[BHW] instruction, because at
5431 least one operand is a subreg of an SVE vector that has wider or
5432 narrower elements. Return true and emit the instruction if so.
5433
5434 For example:
5435
5436 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5437
5438 represents a VIEW_CONVERT between the following vectors, viewed
5439 in memory order:
5440
5441 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5442 R1: { [0], [1], [2], [3], ... }
5443
5444 The high part of lane X in R2 should therefore correspond to lane X*2
5445 of R1, but the register representations are:
5446
5447 msb lsb
5448 R2: ...... [1].high [1].low [0].high [0].low
5449 R1: ...... [3] [2] [1] [0]
5450
5451 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5452 We therefore need a reverse operation to swap the high and low values
5453 around.
5454
5455 This is purely an optimization. Without it we would spill the
5456 subreg operand to the stack in one mode and reload it in the
5457 other mode, which has the same effect as the REV. */
5458
5459 bool
5460 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5461 {
5462 gcc_assert (BYTES_BIG_ENDIAN);
5463
5464 /* Do not try to optimize subregs that LRA has created for matched
5465 reloads. These subregs only exist as a temporary measure to make
5466 the RTL well-formed, but they are exempt from the usual
5467 TARGET_CAN_CHANGE_MODE_CLASS rules.
5468
5469 For example, if we have:
5470
5471 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
5472
5473 and the constraints require R1 and R2 to be in the same register,
5474 LRA may need to create RTL such as:
5475
5476 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
5477 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
5478 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
5479
5480 which forces both the input and output of the original instruction
5481 to use the same hard register. But for this to work, the normal
5482 rules have to be suppressed on the subreg input, otherwise LRA
5483 would need to reload that input too, meaning that the process
5484 would never terminate. To compensate for this, the normal rules
5485 are also suppressed for the subreg output of the first move.
5486 Ignoring the special case and handling the first move normally
5487 would therefore generate wrong code: we would reverse the elements
5488 for the first subreg but not reverse them back for the second subreg. */
5489 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
5490 dest = SUBREG_REG (dest);
5491 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
5492 src = SUBREG_REG (src);
5493
5494 /* The optimization handles two single SVE REGs with different element
5495 sizes. */
5496 if (!REG_P (dest)
5497 || !REG_P (src)
5498 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5499 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5500 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5501 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5502 return false;
5503
5504 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
5505 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5506 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5507 UNSPEC_REV_SUBREG);
5508 emit_insn (gen_rtx_SET (dest, unspec));
5509 return true;
5510 }
5511
5512 /* Return a copy of X with mode MODE, without changing its other
5513 attributes. Unlike gen_lowpart, this doesn't care whether the
5514 mode change is valid. */
5515
5516 rtx
5517 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5518 {
5519 if (GET_MODE (x) == mode)
5520 return x;
5521
5522 x = shallow_copy_rtx (x);
5523 set_mode_and_regno (x, mode, REGNO (x));
5524 return x;
5525 }
5526
5527 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5528 stored in wider integer containers. */
5529
5530 static unsigned int
5531 aarch64_sve_rev_unspec (machine_mode mode)
5532 {
5533 switch (GET_MODE_UNIT_SIZE (mode))
5534 {
5535 case 1: return UNSPEC_REVB;
5536 case 2: return UNSPEC_REVH;
5537 case 4: return UNSPEC_REVW;
5538 }
5539 gcc_unreachable ();
5540 }
5541
5542 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5543 operands. */
5544
5545 void
5546 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5547 {
5548 /* Decide which REV operation we need. The mode with wider elements
5549 determines the mode of the operands and the mode with the narrower
5550 elements determines the reverse width. */
5551 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5552 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5553 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5554 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5555 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5556
5557 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5558 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5559
5560 /* Get the operands in the appropriate modes and emit the instruction. */
5561 ptrue = gen_lowpart (pred_mode, ptrue);
5562 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5563 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5564 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5565 dest, ptrue, src));
5566 }
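/* For example, on big-endian targets a move such as

	(set (reg:VNx8HI X) (subreg:VNx8HI (reg:VNx16QI Y) 0))

   is rewritten by the caller into the UNSPEC_REV_SUBREG form and then
   split here into a byte reversal within each halfword element, roughly:

	revb	z0.h, p0/m, z1.h

   where the governing predicate is the all-true PTRUE operand of the
   pattern (register numbers are illustrative only).  */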
5567
5568 static bool
5569 aarch64_function_ok_for_sibcall (tree, tree exp)
5570 {
5571 if (crtl->abi->id () != expr_callee_abi (exp).id ())
5572 return false;
5573
5574 return true;
5575 }
5576
5577 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5578 passed in SVE registers. */
5579
5580 static bool
5581 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5582 const function_arg_info &arg)
5583 {
5584 HOST_WIDE_INT size;
5585 machine_mode dummymode;
5586 int nregs;
5587
5588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
5589 if (arg.mode == BLKmode && arg.type)
5590 size = int_size_in_bytes (arg.type);
5591 else
5592 /* No frontends can create types with variable-sized modes, so we
5593 shouldn't be asked to pass or return them. */
5594 size = GET_MODE_SIZE (arg.mode).to_constant ();
5595
5596 /* Aggregates are passed by reference based on their size. */
5597 if (arg.aggregate_type_p ())
5598 size = int_size_in_bytes (arg.type);
5599
5600   /* Variable-sized arguments are always passed by reference. */
5601 if (size < 0)
5602 return true;
5603
5604 /* Can this be a candidate to be passed in fp/simd register(s)? */
5605 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5606 &dummymode, &nregs, NULL,
5607 !pcum || pcum->silent_p))
5608 return false;
5609
5610   /* Arguments which are variable-sized or larger than 2 registers are
5611      passed by reference unless they are a homogeneous floating-point
5612      aggregate. */
5613 return size > 2 * UNITS_PER_WORD;
5614 }
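/* Under the size rule above, a plain 16-byte structure of two 64-bit
   integers is still passed by value (in a register pair when one is
   available), whereas a 24-byte structure is passed by reference, i.e.
   as a pointer to a caller-allocated copy.  */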
5615
5616 /* Implement TARGET_PASS_BY_REFERENCE. */
5617
5618 static bool
5619 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5620 const function_arg_info &arg)
5621 {
5622 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5623
5624 if (!arg.type)
5625 return aarch64_pass_by_reference_1 (pcum, arg);
5626
5627 pure_scalable_type_info pst_info;
5628 switch (pst_info.analyze (arg.type))
5629 {
5630 case pure_scalable_type_info::IS_PST:
5631 if (pcum && !pcum->silent_p && !TARGET_SVE)
5632 /* We can't gracefully recover at this point, so make this a
5633 fatal error. */
5634 fatal_error (input_location, "arguments of type %qT require"
5635 " the SVE ISA extension", arg.type);
5636
5637 /* Variadic SVE types are passed by reference. Normal non-variadic
5638 arguments are too if we've run out of registers. */
5639 return (!arg.named
5640 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5641 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5642
5643 case pure_scalable_type_info::DOESNT_MATTER:
5644 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
5645 return true;
5646
5647 case pure_scalable_type_info::NO_ABI_IDENTITY:
5648 case pure_scalable_type_info::ISNT_PST:
5649 return aarch64_pass_by_reference_1 (pcum, arg);
5650 }
5651 gcc_unreachable ();
5652 }
5653
5654 /* Return TRUE if VALTYPE is padded to its least significant bits. */
5655 static bool
5656 aarch64_return_in_msb (const_tree valtype)
5657 {
5658 machine_mode dummy_mode;
5659 int dummy_int;
5660
5661 /* Never happens in little-endian mode. */
5662 if (!BYTES_BIG_ENDIAN)
5663 return false;
5664
5665 /* Only composite types smaller than or equal to 16 bytes can
5666 be potentially returned in registers. */
5667 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5668 || int_size_in_bytes (valtype) <= 0
5669 || int_size_in_bytes (valtype) > 16)
5670 return false;
5671
5672 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5673 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5674 is always passed/returned in the least significant bits of fp/simd
5675 register(s). */
5676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5677 &dummy_mode, &dummy_int, NULL,
5678 false))
5679 return false;
5680
5681 /* Likewise pure scalable types for SVE vector and predicate registers. */
5682 pure_scalable_type_info pst_info;
5683 if (pst_info.analyze_registers (valtype))
5684 return false;
5685
5686 return true;
5687 }
5688
5689 /* Implement TARGET_FUNCTION_VALUE.
5690 Define how to find the value returned by a function. */
5691
5692 static rtx
5693 aarch64_function_value (const_tree type, const_tree func,
5694 bool outgoing ATTRIBUTE_UNUSED)
5695 {
5696 machine_mode mode;
5697 int unsignedp;
5698
5699 mode = TYPE_MODE (type);
5700 if (INTEGRAL_TYPE_P (type))
5701 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5702
5703 pure_scalable_type_info pst_info;
5704 if (type && pst_info.analyze_registers (type))
5705 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5706
5707 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5708 are returned in memory, not by value. */
5709 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5710 bool sve_p = (vec_flags & VEC_ANY_SVE);
5711
5712 if (aarch64_return_in_msb (type))
5713 {
5714 HOST_WIDE_INT size = int_size_in_bytes (type);
5715
5716 if (size % UNITS_PER_WORD != 0)
5717 {
5718 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5719 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5720 }
5721 }
5722
5723 int count;
5724 machine_mode ag_mode;
5725 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5726 NULL, false))
5727 {
5728 gcc_assert (!sve_p);
5729 if (!aarch64_composite_type_p (type, mode))
5730 {
5731 gcc_assert (count == 1 && mode == ag_mode);
5732 return gen_rtx_REG (mode, V0_REGNUM);
5733 }
5734 else
5735 {
5736 int i;
5737 rtx par;
5738
5739 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5740 for (i = 0; i < count; i++)
5741 {
5742 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5743 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5744 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5745 XVECEXP (par, 0, i) = tmp;
5746 }
5747 return par;
5748 }
5749 }
5750 else
5751 {
5752 if (sve_p)
5753 {
5754 /* Vector types can acquire a partial SVE mode using things like
5755 __attribute__((vector_size(N))), and this is potentially useful.
5756 However, the choice of mode doesn't affect the type's ABI
5757 identity, so we should treat the types as though they had
5758 the associated integer mode, just like they did before SVE
5759 was introduced.
5760
5761 We know that the vector must be 128 bits or smaller,
5762 otherwise we'd have returned it in memory instead. */
5763 gcc_assert (type
5764 && (aarch64_some_values_include_pst_objects_p (type)
5765 || (vec_flags & VEC_PARTIAL)));
5766
5767 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5768 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5769 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5770 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5771 }
5772 return gen_rtx_REG (mode, R0_REGNUM);
5773 }
5774 }
5775
5776 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
5777    Return true if REGNO is the number of a hard register in which the value
5778    of a called function may come back. */
5779
5780 static bool
5781 aarch64_function_value_regno_p (const unsigned int regno)
5782 {
5783   /* A maximum of 16 bytes can be returned in the general registers. Examples
5784 of 16-byte return values are: 128-bit integers and 16-byte small
5785 structures (excluding homogeneous floating-point aggregates). */
5786 if (regno == R0_REGNUM || regno == R1_REGNUM)
5787 return true;
5788
5789 /* Up to four fp/simd registers can return a function value, e.g. a
5790 homogeneous floating-point aggregate having four members. */
5791 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5792 return TARGET_FLOAT;
5793
5794 return false;
5795 }
5796
5797 /* Subroutine for aarch64_return_in_memory for types that are not returned
5798 in SVE registers. */
5799
5800 static bool
5801 aarch64_return_in_memory_1 (const_tree type)
5802 {
5803 HOST_WIDE_INT size;
5804 machine_mode ag_mode;
5805 int count;
5806
5807 if (!AGGREGATE_TYPE_P (type)
5808 && TREE_CODE (type) != COMPLEX_TYPE
5809 && TREE_CODE (type) != VECTOR_TYPE)
5810     /* Simple scalar types are always returned in registers. */
5811 return false;
5812
5813 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5814 &ag_mode, &count, NULL, false))
5815 return false;
5816
5817   /* Types larger than 2 registers are returned in memory. */
5818 size = int_size_in_bytes (type);
5819 return (size < 0 || size > 2 * UNITS_PER_WORD);
5820 }
5821
5822 /* Implement TARGET_RETURN_IN_MEMORY.
5823
5824 If the type T of the result of a function is such that
5825 void func (T arg)
5826 would require that arg be passed as a value in a register (or set of
5827 registers) according to the parameter passing rules, then the result
5828 is returned in the same registers as would be used for such an
5829 argument. */
5830
5831 static bool
5832 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5833 {
5834 pure_scalable_type_info pst_info;
5835 switch (pst_info.analyze (type))
5836 {
5837 case pure_scalable_type_info::IS_PST:
5838 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5839 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5840
5841 case pure_scalable_type_info::DOESNT_MATTER:
5842 gcc_assert (aarch64_return_in_memory_1 (type));
5843 return true;
5844
5845 case pure_scalable_type_info::NO_ABI_IDENTITY:
5846 case pure_scalable_type_info::ISNT_PST:
5847 return aarch64_return_in_memory_1 (type);
5848 }
5849 gcc_unreachable ();
5850 }
5851
5852 static bool
5853 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5854 const_tree type, int *nregs)
5855 {
5856 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5857 return aarch64_vfp_is_call_or_return_candidate (mode, type,
5858 &pcum->aapcs_vfp_rmode,
5859 nregs, NULL, pcum->silent_p);
5860 }
5861
5862 /* Given MODE and TYPE of a function argument, return the alignment in
5863 bits. The idea is to suppress any stronger alignment requested by
5864 the user and opt for the natural alignment (specified in AAPCS64 \S
5865 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5866 calculated in versions of GCC prior to GCC-9. This is a helper
5867 function for local use only. */
5868
5869 static unsigned int
5870 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5871 bool *abi_break)
5872 {
5873 *abi_break = false;
5874 if (!type)
5875 return GET_MODE_ALIGNMENT (mode);
5876
5877 if (integer_zerop (TYPE_SIZE (type)))
5878 return 0;
5879
5880 gcc_assert (TYPE_MODE (type) == mode);
5881
5882 if (!AGGREGATE_TYPE_P (type))
5883 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5884
5885 if (TREE_CODE (type) == ARRAY_TYPE)
5886 return TYPE_ALIGN (TREE_TYPE (type));
5887
5888 unsigned int alignment = 0;
5889 unsigned int bitfield_alignment = 0;
5890 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5891 if (TREE_CODE (field) == FIELD_DECL)
5892 {
5893 /* Note that we explicitly consider zero-sized fields here,
5894 even though they don't map to AAPCS64 machine types.
5895 For example, in:
5896
5897 struct __attribute__((aligned(8))) empty {};
5898
5899 struct s {
5900 [[no_unique_address]] empty e;
5901 int x;
5902 };
5903
5904 "s" contains only one Fundamental Data Type (the int field)
5905 but gains 8-byte alignment and size thanks to "e". */
5906 alignment = std::max (alignment, DECL_ALIGN (field));
5907 if (DECL_BIT_FIELD_TYPE (field))
5908 bitfield_alignment
5909 = std::max (bitfield_alignment,
5910 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5911 }
5912
5913 if (bitfield_alignment > alignment)
5914 {
5915 *abi_break = true;
5916 return bitfield_alignment;
5917 }
5918
5919 return alignment;
5920 }
5921
5922 /* Layout a function argument according to the AAPCS64 rules. The rule
5923 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5924 mode that was originally given to us by the target hook, whereas the
5925 mode in ARG might be the result of replacing partial SVE modes with
5926 the equivalent integer mode. */
5927
5928 static void
5929 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5930 {
5931 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5932 tree type = arg.type;
5933 machine_mode mode = arg.mode;
5934 int ncrn, nvrn, nregs;
5935 bool allocate_ncrn, allocate_nvrn;
5936 HOST_WIDE_INT size;
5937 bool abi_break;
5938
5939 /* We need to do this once per argument. */
5940 if (pcum->aapcs_arg_processed)
5941 return;
5942
5943 pcum->aapcs_arg_processed = true;
5944
5945 pure_scalable_type_info pst_info;
5946 if (type && pst_info.analyze_registers (type))
5947 {
5948 /* The PCS says that it is invalid to pass an SVE value to an
5949 unprototyped function. There is no ABI-defined location we
5950 can return in this case, so we have no real choice but to raise
5951 an error immediately, even though this is only a query function. */
5952 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5953 {
5954 gcc_assert (!pcum->silent_p);
5955 error ("SVE type %qT cannot be passed to an unprototyped function",
5956 arg.type);
5957 /* Avoid repeating the message, and avoid tripping the assert
5958 below. */
5959 pcum->pcs_variant = ARM_PCS_SVE;
5960 }
5961
5962 /* We would have converted the argument into pass-by-reference
5963 form if it didn't fit in registers. */
5964 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5965 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5966 gcc_assert (arg.named
5967 && pcum->pcs_variant == ARM_PCS_SVE
5968 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5969 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5970 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5971 P0_REGNUM + pcum->aapcs_nprn);
5972 return;
5973 }
5974
5975 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5976 are passed by reference, not by value. */
5977 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5978 bool sve_p = (vec_flags & VEC_ANY_SVE);
5979 if (sve_p)
5980 /* Vector types can acquire a partial SVE mode using things like
5981 __attribute__((vector_size(N))), and this is potentially useful.
5982 However, the choice of mode doesn't affect the type's ABI
5983 identity, so we should treat the types as though they had
5984 the associated integer mode, just like they did before SVE
5985 was introduced.
5986
5987 We know that the vector must be 128 bits or smaller,
5988 otherwise we'd have passed it in memory instead. */
5989 gcc_assert (type
5990 && (aarch64_some_values_include_pst_objects_p (type)
5991 || (vec_flags & VEC_PARTIAL)));
5992
5993   /* Size in bytes, rounded up to a multiple of 8 bytes. */
5994 if (type)
5995 size = int_size_in_bytes (type);
5996 else
5997 /* No frontends can create types with variable-sized modes, so we
5998 shouldn't be asked to pass or return them. */
5999 size = GET_MODE_SIZE (mode).to_constant ();
6000 size = ROUND_UP (size, UNITS_PER_WORD);
6001
6002 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6003 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6004 mode,
6005 type,
6006 &nregs);
6007 gcc_assert (!sve_p || !allocate_nvrn);
6008
6009   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6010 The following code thus handles passing by SIMD/FP registers first. */
6011
6012 nvrn = pcum->aapcs_nvrn;
6013
6014   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6015      and homogeneous short-vector aggregates (HVA). */
6016 if (allocate_nvrn)
6017 {
6018 if (!pcum->silent_p && !TARGET_FLOAT)
6019 aarch64_err_no_fpadvsimd (mode);
6020
6021 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6022 {
6023 pcum->aapcs_nextnvrn = nvrn + nregs;
6024 if (!aarch64_composite_type_p (type, mode))
6025 {
6026 gcc_assert (nregs == 1);
6027 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6028 }
6029 else
6030 {
6031 rtx par;
6032 int i;
6033 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6034 for (i = 0; i < nregs; i++)
6035 {
6036 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6037 V0_REGNUM + nvrn + i);
6038 rtx offset = gen_int_mode
6039 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6040 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6041 XVECEXP (par, 0, i) = tmp;
6042 }
6043 pcum->aapcs_reg = par;
6044 }
6045 return;
6046 }
6047 else
6048 {
6049 /* C.3 NSRN is set to 8. */
6050 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6051 goto on_stack;
6052 }
6053 }
6054
6055 ncrn = pcum->aapcs_ncrn;
6056 nregs = size / UNITS_PER_WORD;
6057
6058   /* C6 - C9, though the sign and zero extension semantics are
6059      handled elsewhere. This is the case where the argument fits
6060      entirely in general registers. */
6061 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6062 {
6063 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6064
6065 /* C.8 if the argument has an alignment of 16 then the NGRN is
6066 rounded up to the next even number. */
6067 if (nregs == 2
6068 && ncrn % 2
6069 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6070 comparison is there because for > 16 * BITS_PER_UNIT
6071 alignment nregs should be > 2 and therefore it should be
6072 passed by reference rather than value. */
6073 && (aarch64_function_arg_alignment (mode, type, &abi_break)
6074 == 16 * BITS_PER_UNIT))
6075 {
6076 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6077 inform (input_location, "parameter passing for argument of type "
6078 "%qT changed in GCC 9.1", type);
6079 ++ncrn;
6080 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
6081 }
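      /* For example, for a function taking (int, __int128), the int is
	 passed in w0 and the __int128, which has 16-byte alignment and
	 needs two registers, is passed in x2 and x3, leaving x1 unused.  */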
6082
6083 /* If an argument with an SVE mode needs to be shifted up to the
6084 high part of the register, treat it as though it had an integer mode.
6085 Using the normal (parallel [...]) would suppress the shifting. */
6086 if (sve_p
6087 && BYTES_BIG_ENDIAN
6088 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6089 && aarch64_pad_reg_upward (mode, type, false))
6090 {
6091 mode = int_mode_for_mode (mode).require ();
6092 sve_p = false;
6093 }
6094
6095 /* NREGS can be 0 when e.g. an empty structure is to be passed.
6096 A reg is still generated for it, but the caller should be smart
6097 enough not to use it. */
6098 if (nregs == 0
6099 || (nregs == 1 && !sve_p)
6100 || GET_MODE_CLASS (mode) == MODE_INT)
6101 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6102 else
6103 {
6104 rtx par;
6105 int i;
6106
6107 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6108 for (i = 0; i < nregs; i++)
6109 {
6110 scalar_int_mode reg_mode = word_mode;
6111 if (nregs == 1)
6112 reg_mode = int_mode_for_mode (mode).require ();
6113 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6114 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6115 GEN_INT (i * UNITS_PER_WORD));
6116 XVECEXP (par, 0, i) = tmp;
6117 }
6118 pcum->aapcs_reg = par;
6119 }
6120
6121 pcum->aapcs_nextncrn = ncrn + nregs;
6122 return;
6123 }
6124
6125 /* C.11 */
6126 pcum->aapcs_nextncrn = NUM_ARG_REGS;
6127
6128   /* The argument is passed on the stack; record the number of words needed
6129      for this argument and align the total size if necessary. */
6130 on_stack:
6131 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
6132
6133 if (aarch64_function_arg_alignment (mode, type, &abi_break)
6134 == 16 * BITS_PER_UNIT)
6135 {
6136 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6137 if (pcum->aapcs_stack_size != new_size)
6138 {
6139 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6140 inform (input_location, "parameter passing for argument of type "
6141 "%qT changed in GCC 9.1", type);
6142 pcum->aapcs_stack_size = new_size;
6143 }
6144 }
6145 return;
6146 }
6147
6148 /* Implement TARGET_FUNCTION_ARG. */
6149
6150 static rtx
6151 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6152 {
6153 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6154 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
6155 || pcum->pcs_variant == ARM_PCS_SIMD
6156 || pcum->pcs_variant == ARM_PCS_SVE);
6157
6158 if (arg.end_marker_p ())
6159 return gen_int_mode (pcum->pcs_variant, DImode);
6160
6161 aarch64_layout_arg (pcum_v, arg);
6162 return pcum->aapcs_reg;
6163 }
6164
6165 void
6166 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
6167 const_tree fntype,
6168 rtx libname ATTRIBUTE_UNUSED,
6169 const_tree fndecl ATTRIBUTE_UNUSED,
6170 unsigned n_named ATTRIBUTE_UNUSED,
6171 bool silent_p)
6172 {
6173 pcum->aapcs_ncrn = 0;
6174 pcum->aapcs_nvrn = 0;
6175 pcum->aapcs_nprn = 0;
6176 pcum->aapcs_nextncrn = 0;
6177 pcum->aapcs_nextnvrn = 0;
6178 pcum->aapcs_nextnprn = 0;
6179 if (fntype)
6180 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6181 else
6182 pcum->pcs_variant = ARM_PCS_AAPCS64;
6183 pcum->aapcs_reg = NULL_RTX;
6184 pcum->aapcs_arg_processed = false;
6185 pcum->aapcs_stack_words = 0;
6186 pcum->aapcs_stack_size = 0;
6187 pcum->silent_p = silent_p;
6188
6189 if (!silent_p
6190 && !TARGET_FLOAT
6191 && fntype && fntype != error_mark_node)
6192 {
6193 const_tree type = TREE_TYPE (fntype);
6194 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
6195 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
6196 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6197 &mode, &nregs, NULL, false))
6198 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
6199 }
6200
6201 if (!silent_p
6202 && !TARGET_SVE
6203 && pcum->pcs_variant == ARM_PCS_SVE)
6204 {
6205 /* We can't gracefully recover at this point, so make this a
6206 fatal error. */
6207 if (fndecl)
6208 fatal_error (input_location, "%qE requires the SVE ISA extension",
6209 fndecl);
6210 else
6211 fatal_error (input_location, "calls to functions of type %qT require"
6212 " the SVE ISA extension", fntype);
6213 }
6214 }
6215
6216 static void
6217 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6218 const function_arg_info &arg)
6219 {
6220 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6221 if (pcum->pcs_variant == ARM_PCS_AAPCS64
6222 || pcum->pcs_variant == ARM_PCS_SIMD
6223 || pcum->pcs_variant == ARM_PCS_SVE)
6224 {
6225 aarch64_layout_arg (pcum_v, arg);
6226 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6227 != (pcum->aapcs_stack_words != 0));
6228 pcum->aapcs_arg_processed = false;
6229 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6230 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6231 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6232 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6233 pcum->aapcs_stack_words = 0;
6234 pcum->aapcs_reg = NULL_RTX;
6235 }
6236 }
6237
6238 bool
6239 aarch64_function_arg_regno_p (unsigned regno)
6240 {
6241 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6242 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6243 }
6244
6245 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
6246 PARM_BOUNDARY bits of alignment, but will be given anything up
6247 to STACK_BOUNDARY bits if the type requires it. This makes sure
6248 that both before and after the layout of each argument, the Next
6249 Stacked Argument Address (NSAA) will have a minimum alignment of
6250 8 bytes. */
6251
6252 static unsigned int
6253 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6254 {
6255 bool abi_break;
6256 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6257 &abi_break);
6258   if (abi_break && warn_psabi)
6259 inform (input_location, "parameter passing for argument of type "
6260 "%qT changed in GCC 9.1", type);
6261
6262 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6263 }
6264
6265 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
6266
6267 static fixed_size_mode
6268 aarch64_get_reg_raw_mode (int regno)
6269 {
6270 if (TARGET_SVE && FP_REGNUM_P (regno))
6271 /* Don't use the SVE part of the register for __builtin_apply and
6272 __builtin_return. The SVE registers aren't used by the normal PCS,
6273 so using them there would be a waste of time. The PCS extensions
6274 for SVE types are fundamentally incompatible with the
6275 __builtin_return/__builtin_apply interface. */
6276 return as_a <fixed_size_mode> (V16QImode);
6277 return default_get_reg_raw_mode (regno);
6278 }
6279
6280 /* Implement TARGET_FUNCTION_ARG_PADDING.
6281
6282    Small aggregate types are placed at the lowest memory address.
6283
6284 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
6285
6286 static pad_direction
6287 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6288 {
6289 /* On little-endian targets, the least significant byte of every stack
6290 argument is passed at the lowest byte address of the stack slot. */
6291 if (!BYTES_BIG_ENDIAN)
6292 return PAD_UPWARD;
6293
6294 /* Otherwise, integral, floating-point and pointer types are padded downward:
6295 the least significant byte of a stack argument is passed at the highest
6296 byte address of the stack slot. */
6297 if (type
6298 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6299 || POINTER_TYPE_P (type))
6300 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6301 return PAD_DOWNWARD;
6302
6303 /* Everything else padded upward, i.e. data in first byte of stack slot. */
6304 return PAD_UPWARD;
6305 }
6306
6307 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6308
6309    It specifies padding for the last (and possibly the only)
6310    element of a block move between registers and memory. Viewing
6311    the block as it sits in memory, padding upward means that the
6312    last element is padded after its most significant byte, while
6313    downward padding means that the last element is padded on its
6314    least significant byte side.
6315
6316 Small aggregates and small complex types are always padded
6317 upwards.
6318
6319 We don't need to worry about homogeneous floating-point or
6320 short-vector aggregates; their move is not affected by the
6321 padding direction determined here. Regardless of endianness,
6322 each element of such an aggregate is put in the least
6323 significant bits of a fp/simd register.
6324
6325 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6326 register has useful data, and return the opposite if the most
6327 significant byte does. */
6328
6329 bool
6330 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6331 bool first ATTRIBUTE_UNUSED)
6332 {
6333
6334 /* Aside from pure scalable types, small composite types are always
6335 padded upward. */
6336 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6337 {
6338 HOST_WIDE_INT size;
6339 if (type)
6340 size = int_size_in_bytes (type);
6341 else
6342 /* No frontends can create types with variable-sized modes, so we
6343 shouldn't be asked to pass or return them. */
6344 size = GET_MODE_SIZE (mode).to_constant ();
6345 if (size < 2 * UNITS_PER_WORD)
6346 {
6347 pure_scalable_type_info pst_info;
6348 if (pst_info.analyze_registers (type))
6349 return false;
6350 return true;
6351 }
6352 }
6353
6354 /* Otherwise, use the default padding. */
6355 return !BYTES_BIG_ENDIAN;
6356 }
6357
6358 static scalar_int_mode
6359 aarch64_libgcc_cmp_return_mode (void)
6360 {
6361 return SImode;
6362 }
6363
6364 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6365
6366 /* We use the 12-bit shifted immediate arithmetic instructions so values
6367    must be a multiple of (1 << 12), i.e. 4096. */
6368 #define ARITH_FACTOR 4096
6369
6370 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6371 #error Cannot use simple address calculation for stack probing
6372 #endif
6373
6374 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6375 inclusive. These are offsets from the current stack pointer. */
6376
6377 static void
6378 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6379 {
6380 HOST_WIDE_INT size;
6381 if (!poly_size.is_constant (&size))
6382 {
6383 sorry ("stack probes for SVE frames");
6384 return;
6385 }
6386
6387 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
6388
6389 /* See the same assertion on PROBE_INTERVAL above. */
6390 gcc_assert ((first % ARITH_FACTOR) == 0);
6391
6392 /* See if we have a constant small number of probes to generate. If so,
6393 that's the easy case. */
6394 if (size <= PROBE_INTERVAL)
6395 {
6396 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6397
6398 emit_set_insn (reg1,
6399 plus_constant (Pmode,
6400 stack_pointer_rtx, -(first + base)));
6401 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6402 }
6403
6404 /* The run-time loop is made up of 8 insns in the generic case while the
6405      compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
6406 else if (size <= 4 * PROBE_INTERVAL)
6407 {
6408 HOST_WIDE_INT i, rem;
6409
6410 emit_set_insn (reg1,
6411 plus_constant (Pmode,
6412 stack_pointer_rtx,
6413 -(first + PROBE_INTERVAL)));
6414 emit_stack_probe (reg1);
6415
6416 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6417 it exceeds SIZE. If only two probes are needed, this will not
6418 generate any code. Then probe at FIRST + SIZE. */
6419 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6420 {
6421 emit_set_insn (reg1,
6422 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6423 emit_stack_probe (reg1);
6424 }
6425
6426 rem = size - (i - PROBE_INTERVAL);
6427 if (rem > 256)
6428 {
6429 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6430
6431 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6432 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6433 }
6434 else
6435 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6436 }
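  /* As a worked example of the case above: with FIRST == 0, SIZE == 10000
     and PROBE_INTERVAL == 4096, probes are emitted at SP - 4096 and
     SP - 8192; the remaining 1808 bytes exceed 256, so REG1 is stepped
     down by a further 4096 bytes and the final probe is emitted at a
     positive offset of 2288 from it, i.e. at SP - 10000.  */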
6437
6438 /* Otherwise, do the same as above, but in a loop. Note that we must be
6439 extra careful with variables wrapping around because we might be at
6440 the very top (or the very bottom) of the address space and we have
6441 to be able to handle this case properly; in particular, we use an
6442 equality test for the loop condition. */
6443 else
6444 {
6445 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
6446
6447 /* Step 1: round SIZE to the previous multiple of the interval. */
6448
6449 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6450
6451
6452 /* Step 2: compute initial and final value of the loop counter. */
6453
6454 /* TEST_ADDR = SP + FIRST. */
6455 emit_set_insn (reg1,
6456 plus_constant (Pmode, stack_pointer_rtx, -first));
6457
6458 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6459 HOST_WIDE_INT adjustment = - (first + rounded_size);
6460 if (! aarch64_uimm12_shift (adjustment))
6461 {
6462 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6463 true, Pmode);
6464 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6465 }
6466 else
6467 emit_set_insn (reg2,
6468 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6469
6470 /* Step 3: the loop
6471
6472 do
6473 {
6474 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6475 probe at TEST_ADDR
6476 }
6477 while (TEST_ADDR != LAST_ADDR)
6478
6479 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6480 until it is equal to ROUNDED_SIZE. */
6481
6482 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6483
6484
6485 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6486 that SIZE is equal to ROUNDED_SIZE. */
6487
6488 if (size != rounded_size)
6489 {
6490 HOST_WIDE_INT rem = size - rounded_size;
6491
6492 if (rem > 256)
6493 {
6494 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6495
6496 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6497 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6498 }
6499 else
6500 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6501 }
6502 }
6503
6504 /* Make sure nothing is scheduled before we are done. */
6505 emit_insn (gen_blockage ());
6506 }
6507
6508 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6509 absolute addresses. */
6510
6511 const char *
6512 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6513 {
6514 static int labelno = 0;
6515 char loop_lab[32];
6516 rtx xops[2];
6517
6518 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6519
6520 /* Loop. */
6521 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6522
6523 HOST_WIDE_INT stack_clash_probe_interval
6524 = 1 << param_stack_clash_protection_guard_size;
6525
6526 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6527 xops[0] = reg1;
6528 HOST_WIDE_INT interval;
6529 if (flag_stack_clash_protection)
6530 interval = stack_clash_probe_interval;
6531 else
6532 interval = PROBE_INTERVAL;
6533
6534 gcc_assert (aarch64_uimm12_shift (interval));
6535 xops[1] = GEN_INT (interval);
6536
6537 output_asm_insn ("sub\t%0, %0, %1", xops);
6538
6539 /* If doing stack clash protection then we probe up by the ABI specified
6540 amount. We do this because we're dropping full pages at a time in the
6541 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6542 if (flag_stack_clash_protection)
6543 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6544 else
6545 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6546
6547 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6548 by this amount for each iteration. */
6549 output_asm_insn ("str\txzr, [%0, %1]", xops);
6550
6551 /* Test if TEST_ADDR == LAST_ADDR. */
6552 xops[1] = reg2;
6553 output_asm_insn ("cmp\t%0, %1", xops);
6554
6555 /* Branch. */
6556 fputs ("\tb.ne\t", asm_out_file);
6557 assemble_name_raw (asm_out_file, loop_lab);
6558 fputc ('\n', asm_out_file);
6559
6560 return "";
6561 }
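/* The loop printed above therefore has the rough shape below, shown here
   without stack-clash protection and with the default 4 KiB probe interval;
   <r1> and <r2> stand for the two probe registers passed in:

	.LPSRL0:
	sub	<r1>, <r1>, #4096
	str	xzr, [<r1>]
	cmp	<r1>, <r2>
	b.ne	.LPSRL0
 */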
6562
6563 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6564 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6565 of GUARD_SIZE. When a probe is emitted it is done at most
6566 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6567 at most MIN_PROBE_THRESHOLD. By the end of this function
6568 BASE = BASE - ADJUSTMENT. */
6569
6570 const char *
6571 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6572 rtx min_probe_threshold, rtx guard_size)
6573 {
6574 /* This function is not allowed to use any instruction generation function
6575 like gen_ and friends. If you do you'll likely ICE during CFG validation,
6576 so instead emit the code you want using output_asm_insn. */
6577 gcc_assert (flag_stack_clash_protection);
6578 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6579 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6580
6581 /* The minimum required allocation before the residual requires probing. */
6582 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6583
6584 /* Clamp the value down to the nearest value that can be used with a cmp. */
6585 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6586 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6587
6588 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6589 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6590
6591 static int labelno = 0;
6592 char loop_start_lab[32];
6593 char loop_end_lab[32];
6594 rtx xops[2];
6595
6596 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6597 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6598
6599 /* Emit loop start label. */
6600 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6601
6602 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6603 xops[0] = adjustment;
6604 xops[1] = probe_offset_value_rtx;
6605 output_asm_insn ("cmp\t%0, %1", xops);
6606
6607 /* Branch to end if not enough adjustment to probe. */
6608 fputs ("\tb.lt\t", asm_out_file);
6609 assemble_name_raw (asm_out_file, loop_end_lab);
6610 fputc ('\n', asm_out_file);
6611
6612 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6613 xops[0] = base;
6614 xops[1] = probe_offset_value_rtx;
6615 output_asm_insn ("sub\t%0, %0, %1", xops);
6616
6617 /* Probe at BASE. */
6618 xops[1] = const0_rtx;
6619 output_asm_insn ("str\txzr, [%0, %1]", xops);
6620
6621 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6622 xops[0] = adjustment;
6623 xops[1] = probe_offset_value_rtx;
6624 output_asm_insn ("sub\t%0, %0, %1", xops);
6625
6626 /* Branch to start if still more bytes to allocate. */
6627 fputs ("\tb\t", asm_out_file);
6628 assemble_name_raw (asm_out_file, loop_start_lab);
6629 fputc ('\n', asm_out_file);
6630
6631   /* Nothing left to probe; leave the loop. */
6632 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6633
6634 /* BASE = BASE - ADJUSTMENT. */
6635 xops[0] = base;
6636 xops[1] = adjustment;
6637 output_asm_insn ("sub\t%0, %0, %1", xops);
6638 return "";
6639 }
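/* Putting the pieces above together, the emitted sequence has the rough
   shape below, where <adj>, <base> and <guard> stand for ADJUSTMENT, BASE
   and the clamped residual probe guard:

	.SVLPSPL0:
	cmp	<adj>, <guard>
	b.lt	.SVLPEND0
	sub	<base>, <base>, <guard>
	str	xzr, [<base>]
	sub	<adj>, <adj>, <guard>
	b	.SVLPSPL0
	.SVLPEND0:
	sub	<base>, <base>, <adj>
 */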
6640
6641 /* Determine whether a frame chain needs to be generated. */
6642 static bool
6643 aarch64_needs_frame_chain (void)
6644 {
6645 /* Force a frame chain for EH returns so the return address is at FP+8. */
6646 if (frame_pointer_needed || crtl->calls_eh_return)
6647 return true;
6648
6649 /* A leaf function cannot have calls or write LR. */
6650 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6651
6652 /* Don't use a frame chain in leaf functions if leaf frame pointers
6653 are disabled. */
6654 if (flag_omit_leaf_frame_pointer && is_leaf)
6655 return false;
6656
6657 return aarch64_use_frame_pointer;
6658 }
6659
6660 /* Mark the registers that need to be saved by the callee and calculate
6661 the size of the callee-saved registers area and frame record (both FP
6662 and LR may be omitted). */
6663 static void
6664 aarch64_layout_frame (void)
6665 {
6666 poly_int64 offset = 0;
6667 int regno, last_fp_reg = INVALID_REGNUM;
6668 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6669 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6670 bool frame_related_fp_reg_p = false;
6671 aarch64_frame &frame = cfun->machine->frame;
6672
6673 frame.emit_frame_chain = aarch64_needs_frame_chain ();
6674
6675 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6676 the mid-end is doing. */
6677 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6678
6679 #define SLOT_NOT_REQUIRED (-2)
6680 #define SLOT_REQUIRED (-1)
6681
6682 frame.wb_candidate1 = INVALID_REGNUM;
6683 frame.wb_candidate2 = INVALID_REGNUM;
6684 frame.spare_pred_reg = INVALID_REGNUM;
6685
6686 /* First mark all the registers that really need to be saved... */
6687 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6688 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6689
6690 /* ... that includes the eh data registers (if needed)... */
6691 if (crtl->calls_eh_return)
6692 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6693 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6694
6695 /* ... and any callee saved register that dataflow says is live. */
6696 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6697 if (df_regs_ever_live_p (regno)
6698 && !fixed_regs[regno]
6699 && (regno == R30_REGNUM
6700 || !crtl->abi->clobbers_full_reg_p (regno)))
6701 frame.reg_offset[regno] = SLOT_REQUIRED;
6702
6703 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6704 if (df_regs_ever_live_p (regno)
6705 && !fixed_regs[regno]
6706 && !crtl->abi->clobbers_full_reg_p (regno))
6707 {
6708 frame.reg_offset[regno] = SLOT_REQUIRED;
6709 last_fp_reg = regno;
6710 if (aarch64_emit_cfi_for_reg_p (regno))
6711 frame_related_fp_reg_p = true;
6712 }
6713
6714 /* Big-endian SVE frames need a spare predicate register in order
6715 to save Z8-Z15. Decide which register they should use. Prefer
6716 an unused argument register if possible, so that we don't force P4
6717 to be saved unnecessarily. */
6718 if (frame_related_fp_reg_p
6719 && crtl->abi->id () == ARM_PCS_SVE
6720 && BYTES_BIG_ENDIAN)
6721 {
6722 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6723 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6724 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6725 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6726 break;
6727 gcc_assert (regno <= P7_REGNUM);
6728 frame.spare_pred_reg = regno;
6729 df_set_regs_ever_live (regno, true);
6730 }
6731
6732 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6733 if (df_regs_ever_live_p (regno)
6734 && !fixed_regs[regno]
6735 && !crtl->abi->clobbers_full_reg_p (regno))
6736 frame.reg_offset[regno] = SLOT_REQUIRED;
6737
6738 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
6739 LR counts as an implicit probe which allows us to maintain the invariant
6740 described in the comment at expand_prologue. */
6741 gcc_assert (crtl->is_leaf
6742 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6743
6744 /* Now assign stack slots for the registers. Start with the predicate
6745 registers, since predicate LDR and STR have a relatively small
6746 offset range. These saves happen below the hard frame pointer. */
6747 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6748 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6749 {
6750 frame.reg_offset[regno] = offset;
6751 offset += BYTES_PER_SVE_PRED;
6752 }
6753
6754 if (maybe_ne (offset, 0))
6755 {
6756 /* If we have any vector registers to save above the predicate registers,
6757 	 the offset of the vector register save slots needs to be a multiple
6758 of the vector size. This lets us use the immediate forms of LDR/STR
6759 (or LD1/ST1 for big-endian).
6760
6761 A vector register is 8 times the size of a predicate register,
6762 and we need to save a maximum of 12 predicate registers, so the
6763 first vector register will be at either #1, MUL VL or #2, MUL VL.
6764
6765 If we don't have any vector registers to save, and we know how
6766 big the predicate save area is, we can just round it up to the
6767 next 16-byte boundary. */
6768 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6769 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6770 else
6771 {
6772 if (known_le (offset, vector_save_size))
6773 offset = vector_save_size;
6774 else if (known_le (offset, vector_save_size * 2))
6775 offset = vector_save_size * 2;
6776 else
6777 gcc_unreachable ();
6778 }
6779 }
6780
6781 /* If we need to save any SVE vector registers, add them next. */
6782 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6783 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6784 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6785 {
6786 frame.reg_offset[regno] = offset;
6787 offset += vector_save_size;
6788 }
6789
6790 /* OFFSET is now the offset of the hard frame pointer from the bottom
6791 of the callee save area. */
6792 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6793 frame.below_hard_fp_saved_regs_size = offset;
6794 if (frame.emit_frame_chain)
6795 {
6796 /* FP and LR are placed in the linkage record. */
6797 frame.reg_offset[R29_REGNUM] = offset;
6798 frame.wb_candidate1 = R29_REGNUM;
6799 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6800 frame.wb_candidate2 = R30_REGNUM;
6801 offset += 2 * UNITS_PER_WORD;
6802 }
6803
6804 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6805 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6806 {
6807 frame.reg_offset[regno] = offset;
6808 if (frame.wb_candidate1 == INVALID_REGNUM)
6809 frame.wb_candidate1 = regno;
6810 else if (frame.wb_candidate2 == INVALID_REGNUM)
6811 frame.wb_candidate2 = regno;
6812 offset += UNITS_PER_WORD;
6813 }
6814
6815 poly_int64 max_int_offset = offset;
6816 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6817 bool has_align_gap = maybe_ne (offset, max_int_offset);
6818
6819 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6820 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6821 {
6822 /* If there is an alignment gap between integer and fp callee-saves,
6823 allocate the last fp register to it if possible. */
6824 if (regno == last_fp_reg
6825 && has_align_gap
6826 && known_eq (vector_save_size, 8)
6827 && multiple_p (offset, 16))
6828 {
6829 frame.reg_offset[regno] = max_int_offset;
6830 break;
6831 }
6832
6833 frame.reg_offset[regno] = offset;
6834 if (frame.wb_candidate1 == INVALID_REGNUM)
6835 frame.wb_candidate1 = regno;
6836 else if (frame.wb_candidate2 == INVALID_REGNUM
6837 && frame.wb_candidate1 >= V0_REGNUM)
6838 frame.wb_candidate2 = regno;
6839 offset += vector_save_size;
6840 }
6841
6842 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6843
6844 frame.saved_regs_size = offset;
6845
6846 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6847
6848 poly_int64 above_outgoing_args
6849 = aligned_upper_bound (varargs_and_saved_regs_size
6850 + get_frame_size (),
6851 STACK_BOUNDARY / BITS_PER_UNIT);
6852
6853 frame.hard_fp_offset
6854 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6855
6856 /* Both these values are already aligned. */
6857 gcc_assert (multiple_p (crtl->outgoing_args_size,
6858 STACK_BOUNDARY / BITS_PER_UNIT));
6859 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6860
6861 frame.locals_offset = frame.saved_varargs_size;
6862
6863 frame.initial_adjust = 0;
6864 frame.final_adjust = 0;
6865 frame.callee_adjust = 0;
6866 frame.sve_callee_adjust = 0;
6867 frame.callee_offset = 0;
6868
6869 HOST_WIDE_INT max_push_offset = 0;
6870 if (frame.wb_candidate2 != INVALID_REGNUM)
6871 max_push_offset = 512;
6872 else if (frame.wb_candidate1 != INVALID_REGNUM)
6873 max_push_offset = 256;
6874
6875 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6876 HOST_WIDE_INT const_saved_regs_size;
6877 if (frame.frame_size.is_constant (&const_size)
6878 && const_size < max_push_offset
6879 && known_eq (frame.hard_fp_offset, const_size))
6880 {
6881 /* Simple, small frame with no outgoing arguments:
6882
6883 stp reg1, reg2, [sp, -frame_size]!
6884 stp reg3, reg4, [sp, 16] */
6885 frame.callee_adjust = const_size;
6886 }
6887 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6888 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6889 && const_outgoing_args_size + const_saved_regs_size < 512
6890 /* We could handle this case even with outgoing args, provided
6891 that the number of args left us with valid offsets for all
6892 predicate and vector save slots. It's such a rare case that
6893 it hardly seems worth the effort though. */
6894 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6895 && !(cfun->calls_alloca
6896 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6897 && const_fp_offset < max_push_offset))
6898 {
6899 /* Frame with small outgoing arguments:
6900
6901 sub sp, sp, frame_size
6902 stp reg1, reg2, [sp, outgoing_args_size]
6903 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6904 frame.initial_adjust = frame.frame_size;
6905 frame.callee_offset = const_outgoing_args_size;
6906 }
6907 else if (saves_below_hard_fp_p
6908 && known_eq (frame.saved_regs_size,
6909 frame.below_hard_fp_saved_regs_size))
6910 {
6911 /* Frame in which all saves are SVE saves:
6912
6913 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6914 save SVE registers relative to SP
6915 sub sp, sp, outgoing_args_size */
6916 frame.initial_adjust = (frame.hard_fp_offset
6917 + frame.below_hard_fp_saved_regs_size);
6918 frame.final_adjust = crtl->outgoing_args_size;
6919 }
6920 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6921 && const_fp_offset < max_push_offset)
6922 {
6923 /* Frame with large outgoing arguments or SVE saves, but with
6924 a small local area:
6925
6926 stp reg1, reg2, [sp, -hard_fp_offset]!
6927 stp reg3, reg4, [sp, 16]
6928 [sub sp, sp, below_hard_fp_saved_regs_size]
6929 [save SVE registers relative to SP]
6930 sub sp, sp, outgoing_args_size */
6931 frame.callee_adjust = const_fp_offset;
6932 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6933 frame.final_adjust = crtl->outgoing_args_size;
6934 }
6935 else
6936 {
6937 /* Frame with large local area and outgoing arguments or SVE saves,
6938 using frame pointer:
6939
6940 sub sp, sp, hard_fp_offset
6941 stp x29, x30, [sp, 0]
6942 add x29, sp, 0
6943 stp reg3, reg4, [sp, 16]
6944 [sub sp, sp, below_hard_fp_saved_regs_size]
6945 [save SVE registers relative to SP]
6946 sub sp, sp, outgoing_args_size */
6947 frame.initial_adjust = frame.hard_fp_offset;
6948 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6949 frame.final_adjust = crtl->outgoing_args_size;
6950 }
6951
6952 /* Make sure the individual adjustments add up to the full frame size. */
6953 gcc_assert (known_eq (frame.initial_adjust
6954 + frame.callee_adjust
6955 + frame.sve_callee_adjust
6956 + frame.final_adjust, frame.frame_size));
6957
6958 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
6959 {
6960 /* We've decided not to associate any register saves with the initial
6961 stack allocation. */
6962 frame.wb_candidate1 = INVALID_REGNUM;
6963 frame.wb_candidate2 = INVALID_REGNUM;
6964 }
6965
6966 frame.laid_out = true;
6967 }
6968
6969 /* Return true if the register REGNO is saved on entry to
6970 the current function. */
6971
6972 static bool
6973 aarch64_register_saved_on_entry (int regno)
6974 {
6975 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6976 }
6977
6978 /* Return the next register at or above REGNO, up to LIMIT, that the
6979 callee needs to save. */
6980
6981 static unsigned
6982 aarch64_next_callee_save (unsigned regno, unsigned limit)
6983 {
6984 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6985 regno ++;
6986 return regno;
6987 }
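
/* Callers typically walk the callee-saved registers with a loop of the
   form (see aarch64_save_callee_saves and aarch64_restore_callee_saves
   below):

     for (regno = aarch64_next_callee_save (start, limit);
          regno <= limit;
          regno = aarch64_next_callee_save (regno + 1, limit))
       ...handle register REGNO...  */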
6988
6989 /* Push the register number REGNO of mode MODE to the stack with write-back
6990 adjusting the stack by ADJUSTMENT. */
6991
6992 static void
6993 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6994 HOST_WIDE_INT adjustment)
6995 {
6996 rtx base_rtx = stack_pointer_rtx;
6997 rtx insn, reg, mem;
6998
6999 reg = gen_rtx_REG (mode, regno);
7000 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7001 plus_constant (Pmode, base_rtx, -adjustment));
7002 mem = gen_frame_mem (mode, mem);
7003
7004 insn = emit_move_insn (mem, reg);
7005 RTX_FRAME_RELATED_P (insn) = 1;
7006 }
7007
7008 /* Generate and return an instruction to store the pair of registers
7009 REG and REG2 of mode MODE to location BASE with write-back adjusting
7010 the stack location BASE by ADJUSTMENT. */
7011
7012 static rtx
7013 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7014 HOST_WIDE_INT adjustment)
7015 {
7016 switch (mode)
7017 {
7018 case E_DImode:
7019 return gen_storewb_pairdi_di (base, base, reg, reg2,
7020 GEN_INT (-adjustment),
7021 GEN_INT (UNITS_PER_WORD - adjustment));
7022 case E_DFmode:
7023 return gen_storewb_pairdf_di (base, base, reg, reg2,
7024 GEN_INT (-adjustment),
7025 GEN_INT (UNITS_PER_WORD - adjustment));
7026 case E_TFmode:
7027 return gen_storewb_pairtf_di (base, base, reg, reg2,
7028 GEN_INT (-adjustment),
7029 GEN_INT (UNITS_PER_VREG - adjustment));
7030 default:
7031 gcc_unreachable ();
7032 }
7033 }
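
/* For example, with BASE being the stack pointer, DImode registers x19
   and x20 and an ADJUSTMENT of 32, the pattern above corresponds to:

     stp x19, x20, [sp, -32]!

   i.e. x19 is stored at the new SP, x20 at SP + 8, and SP is
   pre-decremented by 32.  (Illustrative only; the actual operands
   depend on the caller.)  */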
7034
7035 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7036 stack pointer by ADJUSTMENT. */
7037
7038 static void
7039 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
7040 {
7041 rtx_insn *insn;
7042 machine_mode mode = aarch64_reg_save_mode (regno1);
7043
7044 if (regno2 == INVALID_REGNUM)
7045 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7046
7047 rtx reg1 = gen_rtx_REG (mode, regno1);
7048 rtx reg2 = gen_rtx_REG (mode, regno2);
7049
7050 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7051 reg2, adjustment));
7052 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
7053 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7054 RTX_FRAME_RELATED_P (insn) = 1;
7055 }
7056
7057 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
7058 adjusting it by ADJUSTMENT afterwards. */
7059
7060 static rtx
7061 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7062 HOST_WIDE_INT adjustment)
7063 {
7064 switch (mode)
7065 {
7066 case E_DImode:
7067 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
7068 GEN_INT (UNITS_PER_WORD));
7069 case E_DFmode:
7070 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
7071 GEN_INT (UNITS_PER_WORD));
7072 case E_TFmode:
7073 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7074 GEN_INT (UNITS_PER_VREG));
7075 default:
7076 gcc_unreachable ();
7077 }
7078 }
7079
7080 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7081 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7082 into CFI_OPS. */
7083
7084 static void
7085 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7086 rtx *cfi_ops)
7087 {
7088 machine_mode mode = aarch64_reg_save_mode (regno1);
7089 rtx reg1 = gen_rtx_REG (mode, regno1);
7090
7091 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7092
7093 if (regno2 == INVALID_REGNUM)
7094 {
7095 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7096 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
7097 emit_move_insn (reg1, gen_frame_mem (mode, mem));
7098 }
7099 else
7100 {
7101 rtx reg2 = gen_rtx_REG (mode, regno2);
7102 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7103 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7104 reg2, adjustment));
7105 }
7106 }
7107
7108 /* Generate and return a store pair instruction of mode MODE to store
7109 register REG1 to MEM1 and register REG2 to MEM2. */
7110
7111 static rtx
7112 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
7113 rtx reg2)
7114 {
7115 switch (mode)
7116 {
7117 case E_DImode:
7118 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
7119
7120 case E_DFmode:
7121 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
7122
7123 case E_TFmode:
7124 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7125
7126 case E_V4SImode:
7127 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
7128
7129 case E_V16QImode:
7130 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
7131
7132 default:
7133 gcc_unreachable ();
7134 }
7135 }
7136
7137 /* Generate and return a load pair instruction of mode MODE to load register
7138 REG1 from MEM1 and register REG2 from MEM2. */
7139
7140 static rtx
7141 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
7142 rtx mem2)
7143 {
7144 switch (mode)
7145 {
7146 case E_DImode:
7147 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
7148
7149 case E_DFmode:
7150 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
7151
7152 case E_TFmode:
7153 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7154
7155 case E_V4SImode:
7156 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
7157
7158 default:
7159 gcc_unreachable ();
7160 }
7161 }
7162
7163 /* Return TRUE if return address signing should be enabled for the current
7164 function, otherwise return FALSE. */
7165
7166 bool
7167 aarch64_return_address_signing_enabled (void)
7168 {
7169 /* This function should only be called after the frame has been laid out. */
7170 gcc_assert (cfun->machine->frame.laid_out);
7171
7172 /* Turn return address signing off in any function that uses
7173 __builtin_eh_return. The address passed to __builtin_eh_return
7174 is not signed, so either it would have to be signed (using the
7175 original SP) or the code path that uses it would have to avoid
7176 authenticating it. Currently eh_return introduces a return-to-anywhere
7177 gadget no matter what we do here, since it uses RET with a
7178 user-provided address. An ideal fix would be an indirect branch,
7179 which can be protected with BTI j (to some extent). */
7180 if (crtl->calls_eh_return)
7181 return false;
7182
7183 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
7184 function if its LR is pushed onto the stack. */
7185 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7186 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
7187 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
7188 }
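
/* For example, with -mbranch-protection=pac-ret (which selects the
   AARCH64_FUNCTION_NON_LEAF scope), a function that saves LR is
   typically signed with PACIASP in the prologue and authenticated with
   AUTIASP or a combined RETAA in the epilogue; see
   aarch64_expand_prologue and aarch64_expand_epilogue below.  */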
7189
7190 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
7191 bool
7192 aarch64_bti_enabled (void)
7193 {
7194 return (aarch64_enable_bti == 1);
7195 }
7196
7197 /* The caller is going to use ST1D or LD1D to save or restore an SVE
7198 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7199 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
7200
7201 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7202 or LD1D address
7203
7204 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
7205 if the variable isn't already nonnull
7206
7207 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7208 Handle this case using a temporary base register that is suitable for
7209 all offsets in that range. Use ANCHOR_REG as this base register if it
7210 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
7211
7212 static inline void
7213 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7214 rtx &anchor_reg, poly_int64 &offset,
7215 rtx &ptrue)
7216 {
7217 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7218 {
7219 /* This is the maximum valid offset of the anchor from the base.
7220 Lower values would be valid too. */
7221 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7222 if (!anchor_reg)
7223 {
7224 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7225 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7226 gen_int_mode (anchor_offset, Pmode)));
7227 }
7228 base_rtx = anchor_reg;
7229 offset -= anchor_offset;
7230 }
7231 if (!ptrue)
7232 {
7233 int pred_reg = cfun->machine->frame.spare_pred_reg;
7234 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7235 CONSTM1_RTX (VNx16BImode));
7236 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7237 }
7238 }
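
/* For example, if OFFSET is 12 * GET_MODE_SIZE (MODE), the code above
   creates (or reuses) an anchor at BASE_RTX + 16 * GET_MODE_SIZE (MODE)
   and rewrites the access as anchor - 4 * GET_MODE_SIZE (MODE), which is
   within the signed 4-bit scaled immediate range [-8, 7] accepted by
   ST1D and LD1D (see offset_4bit_signed_scaled_p below).  */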
7239
7240 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7241 is saved at BASE + OFFSET. */
7242
7243 static void
7244 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7245 rtx base, poly_int64 offset)
7246 {
7247 rtx mem = gen_frame_mem (GET_MODE (reg),
7248 plus_constant (Pmode, base, offset));
7249 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7250 }
7251
7252 /* Emit code to save the callee-saved registers from register number START
7253 to LIMIT to the stack at the location starting at offset START_OFFSET,
7254 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
7255 is true if the hard frame pointer has been set up. */
7256
7257 static void
7258 aarch64_save_callee_saves (poly_int64 start_offset,
7259 unsigned start, unsigned limit, bool skip_wb,
7260 bool hard_fp_valid_p)
7261 {
7262 rtx_insn *insn;
7263 unsigned regno;
7264 unsigned regno2;
7265 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7266
7267 for (regno = aarch64_next_callee_save (start, limit);
7268 regno <= limit;
7269 regno = aarch64_next_callee_save (regno + 1, limit))
7270 {
7271 rtx reg, mem;
7272 poly_int64 offset;
7273 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7274
7275 if (skip_wb
7276 && (regno == cfun->machine->frame.wb_candidate1
7277 || regno == cfun->machine->frame.wb_candidate2))
7278 continue;
7279
7280 if (cfun->machine->reg_is_wrapped_separately[regno])
7281 continue;
7282
7283 machine_mode mode = aarch64_reg_save_mode (regno);
7284 reg = gen_rtx_REG (mode, regno);
7285 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7286 rtx base_rtx = stack_pointer_rtx;
7287 poly_int64 sp_offset = offset;
7288
7289 HOST_WIDE_INT const_offset;
7290 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7291 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7292 offset, ptrue);
7293 else if (GP_REGNUM_P (regno)
7294 && (!offset.is_constant (&const_offset) || const_offset >= 512))
7295 {
7296 gcc_assert (known_eq (start_offset, 0));
7297 poly_int64 fp_offset
7298 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7299 if (hard_fp_valid_p)
7300 base_rtx = hard_frame_pointer_rtx;
7301 else
7302 {
7303 if (!anchor_reg)
7304 {
7305 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7306 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7307 gen_int_mode (fp_offset, Pmode)));
7308 }
7309 base_rtx = anchor_reg;
7310 }
7311 offset -= fp_offset;
7312 }
7313 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7314 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7315
7316 if (!aarch64_sve_mode_p (mode)
7317 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7318 && !cfun->machine->reg_is_wrapped_separately[regno2]
7319 && known_eq (GET_MODE_SIZE (mode),
7320 cfun->machine->frame.reg_offset[regno2]
7321 - cfun->machine->frame.reg_offset[regno]))
7322 {
7323 rtx reg2 = gen_rtx_REG (mode, regno2);
7324 rtx mem2;
7325
7326 offset += GET_MODE_SIZE (mode);
7327 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7328 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7329 reg2));
7330
7331 /* The first part of a frame-related parallel insn is
7332 always assumed to be relevant to the frame
7333 calculations; subsequent parts are only
7334 frame-related if explicitly marked. */
7335 if (aarch64_emit_cfi_for_reg_p (regno2))
7336 {
7337 if (need_cfa_note_p)
7338 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7339 sp_offset + GET_MODE_SIZE (mode));
7340 else
7341 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7342 }
7343
7344 regno = regno2;
7345 }
7346 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7347 {
7348 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7349 need_cfa_note_p = true;
7350 }
7351 else if (aarch64_sve_mode_p (mode))
7352 insn = emit_insn (gen_rtx_SET (mem, reg));
7353 else
7354 insn = emit_move_insn (mem, reg);
7355
7356 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7357 if (frame_related_p && need_cfa_note_p)
7358 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7359 }
7360 }
7361
7362 /* Emit code to restore the callee registers from register number START
7363 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7364 skipping any write-back candidates if SKIP_WB is true. Write the
7365 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7366
7367 static void
7368 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7369 unsigned limit, bool skip_wb, rtx *cfi_ops)
7370 {
7371 unsigned regno;
7372 unsigned regno2;
7373 poly_int64 offset;
7374 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7375
7376 for (regno = aarch64_next_callee_save (start, limit);
7377 regno <= limit;
7378 regno = aarch64_next_callee_save (regno + 1, limit))
7379 {
7380 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7381 if (cfun->machine->reg_is_wrapped_separately[regno])
7382 continue;
7383
7384 rtx reg, mem;
7385
7386 if (skip_wb
7387 && (regno == cfun->machine->frame.wb_candidate1
7388 || regno == cfun->machine->frame.wb_candidate2))
7389 continue;
7390
7391 machine_mode mode = aarch64_reg_save_mode (regno);
7392 reg = gen_rtx_REG (mode, regno);
7393 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7394 rtx base_rtx = stack_pointer_rtx;
7395 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7396 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7397 offset, ptrue);
7398 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7399
7400 if (!aarch64_sve_mode_p (mode)
7401 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7402 && !cfun->machine->reg_is_wrapped_separately[regno2]
7403 && known_eq (GET_MODE_SIZE (mode),
7404 cfun->machine->frame.reg_offset[regno2]
7405 - cfun->machine->frame.reg_offset[regno]))
7406 {
7407 rtx reg2 = gen_rtx_REG (mode, regno2);
7408 rtx mem2;
7409
7410 offset += GET_MODE_SIZE (mode);
7411 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7412 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7413
7414 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7415 regno = regno2;
7416 }
7417 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7418 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7419 else if (aarch64_sve_mode_p (mode))
7420 emit_insn (gen_rtx_SET (reg, mem));
7421 else
7422 emit_move_insn (reg, mem);
7423 if (frame_related_p)
7424 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7425 }
7426 }
7427
7428 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7429 of MODE. */
7430
7431 static inline bool
7432 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7433 {
7434 HOST_WIDE_INT multiple;
7435 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7436 && IN_RANGE (multiple, -8, 7));
7437 }
7438
7439 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7440 of MODE. */
7441
7442 static inline bool
7443 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7444 {
7445 HOST_WIDE_INT multiple;
7446 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7447 && IN_RANGE (multiple, 0, 63));
7448 }
7449
7450 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7451 of MODE. */
7452
7453 bool
7454 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7455 {
7456 HOST_WIDE_INT multiple;
7457 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7458 && IN_RANGE (multiple, -64, 63));
7459 }
7460
7461 /* Return true if OFFSET is a signed 9-bit value. */
7462
7463 bool
7464 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7465 poly_int64 offset)
7466 {
7467 HOST_WIDE_INT const_offset;
7468 return (offset.is_constant (&const_offset)
7469 && IN_RANGE (const_offset, -256, 255));
7470 }
7471
7472 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7473 of MODE. */
7474
7475 static inline bool
7476 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7477 {
7478 HOST_WIDE_INT multiple;
7479 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7480 && IN_RANGE (multiple, -256, 255));
7481 }
7482
7483 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7484 of MODE. */
7485
7486 static inline bool
7487 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7488 {
7489 HOST_WIDE_INT multiple;
7490 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7491 && IN_RANGE (multiple, 0, 4095));
7492 }
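
/* As a worked example, for DImode (8-byte) slots:
   offset_4bit_signed_scaled_p accepts byte offsets -64 .. 56,
   offset_9bit_signed_scaled_p accepts -2048 .. 2040 and
   offset_12bit_unsigned_scaled_p accepts 0 .. 32760 (4095 * 8), all in
   multiples of 8.  The last of these matches the unsigned scaled
   immediate form of LDR/STR and is what aarch64_get_separate_components
   below uses for non-SVE saves.  */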
7493
7494 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7495
7496 static sbitmap
7497 aarch64_get_separate_components (void)
7498 {
7499 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7500 bitmap_clear (components);
7501
7502 /* The registers we need saved to the frame. */
7503 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7504 if (aarch64_register_saved_on_entry (regno))
7505 {
7506 /* Punt on saves and restores that use ST1D and LD1D. We could
7507 try to be smarter, but it would involve making sure that the
7508 spare predicate register itself is safe to use at the save
7509 and restore points. Also, when a frame pointer is being used,
7510 the slots are often out of reach of ST1D and LD1D anyway. */
7511 machine_mode mode = aarch64_reg_save_mode (regno);
7512 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7513 continue;
7514
7515 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7516
7517 /* If the register is saved in the first SVE save slot, we use
7518 it as a stack probe for -fstack-clash-protection. */
7519 if (flag_stack_clash_protection
7520 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7521 && known_eq (offset, 0))
7522 continue;
7523
7524 /* Get the offset relative to the register we'll use. */
7525 if (frame_pointer_needed)
7526 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7527 else
7528 offset += crtl->outgoing_args_size;
7529
7530 /* Check that we can access the stack slot of the register with one
7531 direct load with no adjustments needed. */
7532 if (aarch64_sve_mode_p (mode)
7533 ? offset_9bit_signed_scaled_p (mode, offset)
7534 : offset_12bit_unsigned_scaled_p (mode, offset))
7535 bitmap_set_bit (components, regno);
7536 }
7537
7538 /* Don't mess with the hard frame pointer. */
7539 if (frame_pointer_needed)
7540 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7541
7542 /* If the spare predicate register used by big-endian SVE code
7543 is call-preserved, it must be saved in the main prologue
7544 before any saves that use it. */
7545 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7546 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7547
7548 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7549 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7550 /* If registers have been chosen to be stored/restored with
7551 writeback don't interfere with them to avoid having to output explicit
7552 stack adjustment instructions. */
7553 if (reg2 != INVALID_REGNUM)
7554 bitmap_clear_bit (components, reg2);
7555 if (reg1 != INVALID_REGNUM)
7556 bitmap_clear_bit (components, reg1);
7557
7558 bitmap_clear_bit (components, LR_REGNUM);
7559 bitmap_clear_bit (components, SP_REGNUM);
7560
7561 return components;
7562 }
7563
7564 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7565
7566 static sbitmap
7567 aarch64_components_for_bb (basic_block bb)
7568 {
7569 bitmap in = DF_LIVE_IN (bb);
7570 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7571 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7572
7573 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7574 bitmap_clear (components);
7575
7576 /* Clobbered registers don't generate values in any meaningful sense,
7577 since nothing after the clobber can rely on their value. And we can't
7578 say that partially-clobbered registers are unconditionally killed,
7579 because whether they're killed or not depends on the mode of the
7580 value they're holding. Thus partially call-clobbered registers
7581 appear in neither the kill set nor the gen set.
7582
7583 Check manually for any calls that clobber more of a register than the
7584 current function can. */
7585 function_abi_aggregator callee_abis;
7586 rtx_insn *insn;
7587 FOR_BB_INSNS (bb, insn)
7588 if (CALL_P (insn))
7589 callee_abis.note_callee_abi (insn_callee_abi (insn));
7590 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7591
7592 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7593 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7594 if (!fixed_regs[regno]
7595 && !crtl->abi->clobbers_full_reg_p (regno)
7596 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7597 || bitmap_bit_p (in, regno)
7598 || bitmap_bit_p (gen, regno)
7599 || bitmap_bit_p (kill, regno)))
7600 {
7601 bitmap_set_bit (components, regno);
7602
7603 /* If there is a callee-save at an adjacent offset, add it too
7604 to increase the use of LDP/STP. */
7605 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7606 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7607
7608 if (regno2 <= LAST_SAVED_REGNUM)
7609 {
7610 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7611 if (regno < regno2
7612 ? known_eq (offset + 8, offset2)
7613 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7614 bitmap_set_bit (components, regno2);
7615 }
7616 }
7617
7618 return components;
7619 }
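
/* For example, if only x19 is live in the block but x20 is saved in the
   adjacent slot 8 bytes above (and x19's slot is 16-byte aligned), x20
   is added to the set as well, so that the two components can be saved
   and restored with a single STP/LDP.  */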
7620
7621 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7622 Nothing to do for aarch64. */
7623
7624 static void
7625 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7626 {
7627 }
7628
7629 /* Return the next set bit in BMP from START onwards. Return the total number
7630 of bits in BMP if no set bit is found at or after START. */
7631
7632 static unsigned int
7633 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7634 {
7635 unsigned int nbits = SBITMAP_SIZE (bmp);
7636 if (start == nbits)
7637 return start;
7638
7639 gcc_assert (start < nbits);
7640 for (unsigned int i = start; i < nbits; i++)
7641 if (bitmap_bit_p (bmp, i))
7642 return i;
7643
7644 return nbits;
7645 }
7646
7647 /* Do the work for aarch64_emit_prologue_components and
7648 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7649 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7650 for these components or the epilogue sequence. That is, it determines
7651 whether we should emit stores or loads and what kind of CFA notes to attach
7652 to the insns. Otherwise the logic for the two sequences is very
7653 similar. */
7654
7655 static void
7656 aarch64_process_components (sbitmap components, bool prologue_p)
7657 {
7658 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7659 ? HARD_FRAME_POINTER_REGNUM
7660 : STACK_POINTER_REGNUM);
7661
7662 unsigned last_regno = SBITMAP_SIZE (components);
7663 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7664 rtx_insn *insn = NULL;
7665
7666 while (regno != last_regno)
7667 {
7668 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7669 machine_mode mode = aarch64_reg_save_mode (regno);
7670
7671 rtx reg = gen_rtx_REG (mode, regno);
7672 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7673 if (frame_pointer_needed)
7674 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7675 else
7676 offset += crtl->outgoing_args_size;
7677
7678 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7679 rtx mem = gen_frame_mem (mode, addr);
7680
7681 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7682 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7683 /* No more registers to handle after REGNO.
7684 Emit a single save/restore and exit. */
7685 if (regno2 == last_regno)
7686 {
7687 insn = emit_insn (set);
7688 if (frame_related_p)
7689 {
7690 RTX_FRAME_RELATED_P (insn) = 1;
7691 if (prologue_p)
7692 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7693 else
7694 add_reg_note (insn, REG_CFA_RESTORE, reg);
7695 }
7696 break;
7697 }
7698
7699 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7700 /* The next register is not of the same class or its offset is not
7701 mergeable with the current one into a pair. */
7702 if (aarch64_sve_mode_p (mode)
7703 || !satisfies_constraint_Ump (mem)
7704 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7705 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7706 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7707 GET_MODE_SIZE (mode)))
7708 {
7709 insn = emit_insn (set);
7710 if (frame_related_p)
7711 {
7712 RTX_FRAME_RELATED_P (insn) = 1;
7713 if (prologue_p)
7714 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7715 else
7716 add_reg_note (insn, REG_CFA_RESTORE, reg);
7717 }
7718
7719 regno = regno2;
7720 continue;
7721 }
7722
7723 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7724
7725 /* REGNO2 can be saved/restored in a pair with REGNO. */
7726 rtx reg2 = gen_rtx_REG (mode, regno2);
7727 if (frame_pointer_needed)
7728 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7729 else
7730 offset2 += crtl->outgoing_args_size;
7731 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7732 rtx mem2 = gen_frame_mem (mode, addr2);
7733 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7734 : gen_rtx_SET (reg2, mem2);
7735
7736 if (prologue_p)
7737 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7738 else
7739 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7740
7741 if (frame_related_p || frame_related2_p)
7742 {
7743 RTX_FRAME_RELATED_P (insn) = 1;
7744 if (prologue_p)
7745 {
7746 if (frame_related_p)
7747 add_reg_note (insn, REG_CFA_OFFSET, set);
7748 if (frame_related2_p)
7749 add_reg_note (insn, REG_CFA_OFFSET, set2);
7750 }
7751 else
7752 {
7753 if (frame_related_p)
7754 add_reg_note (insn, REG_CFA_RESTORE, reg);
7755 if (frame_related2_p)
7756 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7757 }
7758 }
7759
7760 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7761 }
7762 }
7763
7764 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7765
7766 static void
7767 aarch64_emit_prologue_components (sbitmap components)
7768 {
7769 aarch64_process_components (components, true);
7770 }
7771
7772 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7773
7774 static void
7775 aarch64_emit_epilogue_components (sbitmap components)
7776 {
7777 aarch64_process_components (components, false);
7778 }
7779
7780 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7781
7782 static void
7783 aarch64_set_handled_components (sbitmap components)
7784 {
7785 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7786 if (bitmap_bit_p (components, regno))
7787 cfun->machine->reg_is_wrapped_separately[regno] = true;
7788 }
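
/* Taken together, the hooks above let the shrink-wrapping pass treat
   individual callee-saved registers as separately wrapped components:
   aarch64_get_separate_components decides which saves qualify,
   aarch64_components_for_bb reports which of them each basic block
   needs, and aarch64_process_components emits the corresponding saves
   and restores on only those paths rather than in the main prologue
   and epilogue.  */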
7789
7790 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7791 determine the probe offset for alloca. */
7792
7793 static HOST_WIDE_INT
7794 aarch64_stack_clash_protection_alloca_probe_range (void)
7795 {
7796 return STACK_CLASH_CALLER_GUARD;
7797 }
7798
7799
7800 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7801 registers. If POLY_SIZE is not large enough to require a probe, this
7802 function will only adjust the stack. FRAME_RELATED_P indicates whether the
7803 allocation is frame related. FINAL_ADJUSTMENT_P indicates whether we are
7804 allocating the space for the outgoing arguments. If we are, then we ensure
7805 that any allocation larger than the ABI-defined buffer is probed, so that
7806 the invariant of having a 1KB buffer for the next callee is
7807 maintained.
7808
7809 We emit barriers after each stack adjustment to prevent optimizations from
7810 breaking the invariant that we never drop the stack more than a page. This
7811 invariant is needed to make it easier to correctly handle asynchronous
7812 events: if we were to allow the stack to be dropped by more than a page
7813 and then catch up with multiple probes, a signal taken somewhere in
7814 between would leave the signal handler not knowing the state of the
7815 stack and unable to assume anything about which pages have been probed. */
7816
7817 static void
7818 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7819 poly_int64 poly_size,
7820 bool frame_related_p,
7821 bool final_adjustment_p)
7822 {
7823 HOST_WIDE_INT guard_size
7824 = 1 << param_stack_clash_protection_guard_size;
7825 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7826 HOST_WIDE_INT min_probe_threshold
7827 = (final_adjustment_p
7828 ? guard_used_by_caller
7829 : guard_size - guard_used_by_caller);
7830 /* When doing the final adjustment for the outgoing arguments, take into
7831 account any unprobed space there is above the current SP. There are
7832 two cases:
7833
7834 - When saving SVE registers below the hard frame pointer, we force
7835 the lowest save to take place in the prologue before doing the final
7836 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7837 This acts as a probe at SP, so there is no unprobed space.
7838
7839 - When there are no SVE register saves, we use the store of the link
7840 register as a probe. We can't assume that LR was saved at position 0
7841 though, so treat any space below it as unprobed. */
7842 if (final_adjustment_p
7843 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7844 {
7845 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7846 if (known_ge (lr_offset, 0))
7847 min_probe_threshold -= lr_offset.to_constant ();
7848 else
7849 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7850 }
7851
7852 poly_int64 frame_size = cfun->machine->frame.frame_size;
7853
7854 /* We should always have a positive probe threshold. */
7855 gcc_assert (min_probe_threshold > 0);
7856
7857 if (flag_stack_clash_protection && !final_adjustment_p)
7858 {
7859 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7860 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7861 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7862
7863 if (known_eq (frame_size, 0))
7864 {
7865 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7866 }
7867 else if (known_lt (initial_adjust + sve_callee_adjust,
7868 guard_size - guard_used_by_caller)
7869 && known_lt (final_adjust, guard_used_by_caller))
7870 {
7871 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7872 }
7873 }
7874
7875 /* If SIZE is not large enough to require probing, just adjust the stack and
7876 exit. */
7877 if (known_lt (poly_size, min_probe_threshold)
7878 || !flag_stack_clash_protection)
7879 {
7880 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7881 return;
7882 }
7883
7884 HOST_WIDE_INT size;
7885 /* Handle the SVE non-constant case first. */
7886 if (!poly_size.is_constant (&size))
7887 {
7888 if (dump_file)
7889 {
7890 fprintf (dump_file, "Stack clash SVE prologue: ");
7891 print_dec (poly_size, dump_file);
7892 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7893 }
7894
7895 /* First calculate the number of bytes we're actually spilling. */
7896 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7897 poly_size, temp1, temp2, false, true);
7898
7899 rtx_insn *insn = get_last_insn ();
7900
7901 if (frame_related_p)
7902 {
7903 /* This is done to provide unwinding information for the stack
7904 adjustments we're about to do. However, to prevent the optimizers
7905 from removing the R11 move and leaving the CFA note (which would be
7906 very wrong), we tie the old and new stack pointers together.
7907 The tie expands to nothing, but the optimizers will not touch
7908 the instruction. */
7909 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7910 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7911 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7912
7913 /* We want the CFA independent of the stack pointer for the
7914 duration of the loop. */
7915 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7916 RTX_FRAME_RELATED_P (insn) = 1;
7917 }
7918
7919 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7920 rtx guard_const = gen_int_mode (guard_size, Pmode);
7921
7922 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7923 stack_pointer_rtx, temp1,
7924 probe_const, guard_const));
7925
7926 /* Now reset the CFA register if needed. */
7927 if (frame_related_p)
7928 {
7929 add_reg_note (insn, REG_CFA_DEF_CFA,
7930 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7931 gen_int_mode (poly_size, Pmode)));
7932 RTX_FRAME_RELATED_P (insn) = 1;
7933 }
7934
7935 return;
7936 }
7937
7938 if (dump_file)
7939 fprintf (dump_file,
7940 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7941 " bytes, probing will be required.\n", size);
7942
7943 /* Round size down to a multiple of guard_size, and calculate the
7944 residual as the difference between the original size and the rounded
7945 size. */
7946 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7947 HOST_WIDE_INT residual = size - rounded_size;
7948
7949 /* We can handle a small number of allocations/probes inline. Otherwise
7950 punt to a loop. */
7951 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7952 {
7953 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7954 {
7955 aarch64_sub_sp (NULL, temp2, guard_size, true);
7956 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7957 guard_used_by_caller));
7958 emit_insn (gen_blockage ());
7959 }
7960 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7961 }
7962 else
7963 {
7964 /* Compute the ending address. */
7965 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7966 temp1, NULL, false, true);
7967 rtx_insn *insn = get_last_insn ();
7968
7969 /* For the initial allocation, we don't have a frame pointer
7970 set up, so we always need CFI notes. If we're doing the
7971 final allocation, then we may have a frame pointer, in which
7972 case it is the CFA, otherwise we need CFI notes.
7973
7974 We can determine which allocation we are doing by looking at
7975 the value of FRAME_RELATED_P since the final allocations are not
7976 frame related. */
7977 if (frame_related_p)
7978 {
7979 /* We want the CFA independent of the stack pointer for the
7980 duration of the loop. */
7981 add_reg_note (insn, REG_CFA_DEF_CFA,
7982 plus_constant (Pmode, temp1, rounded_size));
7983 RTX_FRAME_RELATED_P (insn) = 1;
7984 }
7985
7986 /* This allocates and probes the stack. Note that this re-uses some of
7987 the existing Ada stack protection code. However we are guaranteed not
7988 to enter the non-loop or residual branches of that code.
7989
7990 The non-loop part won't be entered because if our allocation amount
7991 doesn't require a loop, the case above would handle it.
7992
7993 The residual branch won't be entered because TEMP1 is a multiple of
7994 the allocation size. The residual will always be 0. As such, the only
7995 part we are actually using from that code is the loop setup. The
7996 actual probing is done in aarch64_output_probe_stack_range. */
7997 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7998 stack_pointer_rtx, temp1));
7999
8000 /* Now reset the CFA register if needed. */
8001 if (frame_related_p)
8002 {
8003 add_reg_note (insn, REG_CFA_DEF_CFA,
8004 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8005 RTX_FRAME_RELATED_P (insn) = 1;
8006 }
8007
8008 emit_insn (gen_blockage ());
8009 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8010 }
8011
8012 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
8013 be probed. This maintains the requirement that each page is probed at
8014 least once. For initial probing we probe only if the allocation is
8015 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
8016 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
8017 GUARD_SIZE. This ensures that any allocation large enough to trigger
8018 a probe here gets at least one, and that any allocation too small for
8019 this code to emit anything will have had its page probed by the save
8020 of FP/LR, either in this function or in a callee. If we don't have any
8021 callees then we won't have more stack adjustments and so
8022 are still safe. */
8023 if (residual)
8024 {
8025 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8026 /* If we're doing final adjustments, and we've done any full page
8027 allocations then any residual needs to be probed. */
8028 if (final_adjustment_p && rounded_size != 0)
8029 min_probe_threshold = 0;
8030 /* If doing a small final adjustment, we always probe at offset 0.
8031 This is done to avoid issues when LR is not at position 0 or when
8032 the final adjustment is smaller than the probing offset. */
8033 else if (final_adjustment_p && rounded_size == 0)
8034 residual_probe_offset = 0;
8035
8036 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8037 if (residual >= min_probe_threshold)
8038 {
8039 if (dump_file)
8040 fprintf (dump_file,
8041 "Stack clash AArch64 prologue residuals: "
8042 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8043 "\n", residual);
8044
8045 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8046 residual_probe_offset));
8047 emit_insn (gen_blockage ());
8048 }
8049 }
8050 }
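
/* As an illustrative sketch (assuming the default 64 KiB guard and a
   constant allocation small enough to be unrolled), the code above
   emits a sequence along the lines of:

     sub sp, sp, 65536
     str xzr, [sp, 1024]       <-- probe at STACK_CLASH_CALLER_GUARD
     sub sp, sp, 65536
     str xzr, [sp, 1024]
     sub sp, sp, residual
     [str xzr, [sp, 1024]      if residual >= min_probe_threshold]

   with scheduling barriers after each adjustment.  */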
8051
8052 /* Return 1 if the register is used by the epilogue. We need to say the
8053 return register is used, but only after epilogue generation is complete.
8054 Note that in the case of sibcalls, the values "used by the epilogue" are
8055 considered live at the start of the called function.
8056
8057 For SIMD functions we need to return 1 for FP registers that are saved and
8058 restored by a function but are not zero in call_used_regs. If we do not do
8059 this, optimizations may remove the restore of the register. */
8060
8061 int
8062 aarch64_epilogue_uses (int regno)
8063 {
8064 if (epilogue_completed)
8065 {
8066 if (regno == LR_REGNUM)
8067 return 1;
8068 }
8069 return 0;
8070 }
8071
8072 /* AArch64 stack frames generated by this compiler look like:
8073
8074 +-------------------------------+
8075 | |
8076 | incoming stack arguments |
8077 | |
8078 +-------------------------------+
8079 | | <-- incoming stack pointer (aligned)
8080 | callee-allocated save area |
8081 | for register varargs |
8082 | |
8083 +-------------------------------+
8084 | local variables | <-- frame_pointer_rtx
8085 | |
8086 +-------------------------------+
8087 | padding | \
8088 +-------------------------------+ |
8089 | callee-saved registers | | frame.saved_regs_size
8090 +-------------------------------+ |
8091 | LR' | |
8092 +-------------------------------+ |
8093 | FP' | |
8094 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
8095 | SVE vector registers | | \
8096 +-------------------------------+ | | below_hard_fp_saved_regs_size
8097 | SVE predicate registers | / /
8098 +-------------------------------+
8099 | dynamic allocation |
8100 +-------------------------------+
8101 | padding |
8102 +-------------------------------+
8103 | outgoing stack arguments | <-- arg_pointer
8104 | |
8105 +-------------------------------+
8106 | | <-- stack_pointer_rtx (aligned)
8107
8108 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8109 but leave frame_pointer_rtx and hard_frame_pointer_rtx
8110 unchanged.
8111
8112 By default for stack-clash we assume the guard is at least 64KB, but this
8113 value is configurable to either 4KB or 64KB. We also force the guard size to
8114 be the same as the probing interval and both values are kept in sync.
8115
8116 With those assumptions the callee can allocate up to 63KB (or 3KB depending
8117 on the guard size) of stack space without probing.
8118
8119 When probing is needed, we emit a probe at the start of the prologue
8120 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8121
8122 We have to track how much space has been allocated and the only stores
8123 to the stack we track as implicit probes are the FP/LR stores.
8124
8125 For outgoing arguments we probe if the size is larger than 1KB, such that
8126 the ABI specified buffer is maintained for the next callee.
8127
8128 The following registers are reserved during frame layout and should not be
8129 used for any other purpose:
8130
8131 - r11: Used by stack clash protection when SVE is enabled, and also
8132 as an anchor register when saving and restoring registers
8133 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8134 - r14 and r15: Used for speculation tracking.
8135 - r16(IP0), r17(IP1): Used by indirect tailcalls.
8136 - r30(LR), r29(FP): Used by standard frame layout.
8137
8138 These registers must be avoided in frame layout related code unless the
8139 explicit intention is to interact with one of the features listed above. */
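
/* As an illustrative example (the exact code depends on the options and
   registers used), a small non-leaf function with no local variables and
   no outgoing stack arguments only needs to save the FP/LR pair, giving
   frame_size == 16.  This hits the "simple, small frame" case above and
   produces a prologue along the lines of:

     stp x29, x30, [sp, -16]!
     mov x29, sp

   with the matching epilogue popping the pair with writeback before
   returning.  */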
8140
8141 /* Generate the prologue instructions for entry into a function.
8142 Establish the stack frame by decreasing the stack pointer with a
8143 properly calculated size and, if necessary, create a frame record
8144 filled with the values of LR and previous frame pointer. The
8145 current FP is also set up if it is in use. */
8146
8147 void
8148 aarch64_expand_prologue (void)
8149 {
8150 poly_int64 frame_size = cfun->machine->frame.frame_size;
8151 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8152 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8153 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8154 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8155 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8156 poly_int64 below_hard_fp_saved_regs_size
8157 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8158 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8159 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8160 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
8161 rtx_insn *insn;
8162
8163 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8164 {
8165 /* Fold the SVE allocation into the initial allocation.
8166 We don't do this in aarch64_layout_arg to avoid pessimizing
8167 the epilogue code. */
8168 initial_adjust += sve_callee_adjust;
8169 sve_callee_adjust = 0;
8170 }
8171
8172 /* Sign return address for functions. */
8173 if (aarch64_return_address_signing_enabled ())
8174 {
8175 switch (aarch64_ra_sign_key)
8176 {
8177 case AARCH64_KEY_A:
8178 insn = emit_insn (gen_paciasp ());
8179 break;
8180 case AARCH64_KEY_B:
8181 insn = emit_insn (gen_pacibsp ());
8182 break;
8183 default:
8184 gcc_unreachable ();
8185 }
8186 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8187 RTX_FRAME_RELATED_P (insn) = 1;
8188 }
8189
8190 if (flag_stack_usage_info)
8191 current_function_static_stack_size = constant_lower_bound (frame_size);
8192
8193 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8194 {
8195 if (crtl->is_leaf && !cfun->calls_alloca)
8196 {
8197 if (maybe_gt (frame_size, PROBE_INTERVAL)
8198 && maybe_gt (frame_size, get_stack_check_protect ()))
8199 aarch64_emit_probe_stack_range (get_stack_check_protect (),
8200 (frame_size
8201 - get_stack_check_protect ()));
8202 }
8203 else if (maybe_gt (frame_size, 0))
8204 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
8205 }
8206
8207 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8208 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8209
8210 /* In theory we should never have both an initial adjustment
8211 and a callee save adjustment. Verify that is the case since the
8212 code below does not handle it for -fstack-clash-protection. */
8213 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8214
8215 /* Will only probe if the initial adjustment is larger than the guard
8216 less the amount of the guard reserved for use by the caller's
8217 outgoing args. */
8218 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8219 true, false);
8220
8221 if (callee_adjust != 0)
8222 aarch64_push_regs (reg1, reg2, callee_adjust);
8223
8224 /* The offset of the frame chain record (if any) from the current SP. */
8225 poly_int64 chain_offset = (initial_adjust + callee_adjust
8226 - cfun->machine->frame.hard_fp_offset);
8227 gcc_assert (known_ge (chain_offset, 0));
8228
8229 /* The offset of the bottom of the save area from the current SP. */
8230 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8231
8232 if (emit_frame_chain)
8233 {
8234 if (callee_adjust == 0)
8235 {
8236 reg1 = R29_REGNUM;
8237 reg2 = R30_REGNUM;
8238 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8239 false, false);
8240 }
8241 else
8242 gcc_assert (known_eq (chain_offset, 0));
8243 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8244 stack_pointer_rtx, chain_offset,
8245 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8246 if (frame_pointer_needed && !frame_size.is_constant ())
8247 {
8248 /* Variable-sized frames need to describe the save slot
8249 address using DW_CFA_expression rather than DW_CFA_offset.
8250 This means that, without taking further action, the
8251 locations of the registers that we've already saved would
8252 remain based on the stack pointer even after we redefine
8253 the CFA based on the frame pointer. We therefore need new
8254 DW_CFA_expressions to re-express the save slots with addresses
8255 based on the frame pointer. */
8256 rtx_insn *insn = get_last_insn ();
8257 gcc_assert (RTX_FRAME_RELATED_P (insn));
8258
8259 /* Add an explicit CFA definition if this was previously
8260 implicit. */
8261 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8262 {
8263 rtx src = plus_constant (Pmode, stack_pointer_rtx,
8264 callee_offset);
8265 add_reg_note (insn, REG_CFA_ADJUST_CFA,
8266 gen_rtx_SET (hard_frame_pointer_rtx, src));
8267 }
8268
8269 /* Change the save slot expressions for the registers that
8270 we've already saved. */
8271 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8272 hard_frame_pointer_rtx, UNITS_PER_WORD);
8273 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8274 hard_frame_pointer_rtx, 0);
8275 }
8276 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8277 }
8278
8279 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8280 callee_adjust != 0 || emit_frame_chain,
8281 emit_frame_chain);
8282 if (maybe_ne (sve_callee_adjust, 0))
8283 {
8284 gcc_assert (!flag_stack_clash_protection
8285 || known_eq (initial_adjust, 0));
8286 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8287 sve_callee_adjust,
8288 !frame_pointer_needed, false);
8289 saved_regs_offset += sve_callee_adjust;
8290 }
8291 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8292 false, emit_frame_chain);
8293 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8294 callee_adjust != 0 || emit_frame_chain,
8295 emit_frame_chain);
8296
8297 /* We may need to probe the final adjustment if it is larger than the guard
8298 that is assumed by the callee. */
8299 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8300 !frame_pointer_needed, true);
8301 }
8302
8303 /* Return TRUE if we can use a simple_return insn.
8304
8305 This function checks whether the callee-saved register area is empty,
8306 which means that no restore actions are needed. The pro_and_epilogue
8307 pass uses this to check whether shrink-wrapping is feasible. */
8308
8309 bool
8310 aarch64_use_return_insn_p (void)
8311 {
8312 if (!reload_completed)
8313 return false;
8314
8315 if (crtl->profile)
8316 return false;
8317
8318 return known_eq (cfun->machine->frame.frame_size, 0);
8319 }
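
/* For example, a leaf function that needs no register saves and no stack
   allocation has frame_size == 0, so no epilogue is needed and the
   function can return with a bare RET (a simple_return).  */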
8320
8321 /* Generate the epilogue instructions for returning from a function.
8322 This is almost exactly the reverse of the prologue sequence, except
8323 that we need to insert barriers to avoid scheduling loads that read
8324 from a deallocated stack, and we optimize the unwind records by
8325 emitting them all together if possible. */
8326 void
8327 aarch64_expand_epilogue (bool for_sibcall)
8328 {
8329 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8330 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8331 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8332 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8333 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8334 poly_int64 below_hard_fp_saved_regs_size
8335 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8336 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8337 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8338 rtx cfi_ops = NULL;
8339 rtx_insn *insn;
8340 /* A stack clash protection prologue may not have left EP0_REGNUM or
8341 EP1_REGNUM in a usable state. The same is true for allocations
8342 with an SVE component, since we then need both temporary registers
8343 for each allocation. For stack clash we are in a usable state if
8344 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8345 HOST_WIDE_INT guard_size
8346 = 1 << param_stack_clash_protection_guard_size;
8347 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8348
8349 /* We can re-use the registers when:
8350
8351 (a) the deallocation amount is the same as the corresponding
8352 allocation amount (which is false if we combine the initial
8353 and SVE callee save allocations in the prologue); and
8354
8355 (b) the allocation amount doesn't need a probe (which is false
8356 if the amount is guard_size - guard_used_by_caller or greater).
8357
8358 In such situations the register should remain live with the correct
8359 value. */
8360 bool can_inherit_p = (initial_adjust.is_constant ()
8361 && final_adjust.is_constant ()
8362 && (!flag_stack_clash_protection
8363 || (known_lt (initial_adjust,
8364 guard_size - guard_used_by_caller)
8365 && known_eq (sve_callee_adjust, 0))));
8366
8367 /* We need to add memory barrier to prevent read from deallocated stack. */
8368 bool need_barrier_p
8369 = maybe_ne (get_frame_size ()
8370 + cfun->machine->frame.saved_varargs_size, 0);
8371
8372 /* Emit a barrier to prevent loads from a deallocated stack. */
8373 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8374 || cfun->calls_alloca
8375 || crtl->calls_eh_return)
8376 {
8377 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8378 need_barrier_p = false;
8379 }
8380
8381 /* Restore the stack pointer from the frame pointer if it may not
8382 be the same as the stack pointer. */
8383 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8384 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8385 if (frame_pointer_needed
8386 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8387 /* If writeback is used when restoring callee-saves, the CFA
8388 is restored on the instruction doing the writeback. */
8389 aarch64_add_offset (Pmode, stack_pointer_rtx,
8390 hard_frame_pointer_rtx,
8391 -callee_offset - below_hard_fp_saved_regs_size,
8392 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8393 else
8394 /* The case where we need to re-use the register here is very rare, so
8395 avoid the complicated condition and just always emit a move if the
8396 immediate doesn't fit. */
8397 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8398
8399 /* Restore the vector registers before the predicate registers,
8400 so that we can use P4 as a temporary for big-endian SVE frames. */
8401 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8402 callee_adjust != 0, &cfi_ops);
8403 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8404 false, &cfi_ops);
8405 if (maybe_ne (sve_callee_adjust, 0))
8406 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8407 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8408 R0_REGNUM, R30_REGNUM,
8409 callee_adjust != 0, &cfi_ops);
8410
8411 if (need_barrier_p)
8412 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8413
8414 if (callee_adjust != 0)
8415 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8416
8417 /* If we have no register restore information, the CFA must have been
8418 defined in terms of the stack pointer since the end of the prologue. */
8419 gcc_assert (cfi_ops || !frame_pointer_needed);
8420
8421 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8422 {
8423 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
8424 insn = get_last_insn ();
8425 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8426 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8427 RTX_FRAME_RELATED_P (insn) = 1;
8428 cfi_ops = NULL;
8429 }
8430
8431   /* The liveness of EP0_REGNUM cannot be trusted across function calls either,
8432      so restrict the emit_move optimization to leaf functions.  */
8433 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8434 (!can_inherit_p || !crtl->is_leaf
8435 || df_regs_ever_live_p (EP0_REGNUM)));
8436
8437 if (cfi_ops)
8438 {
8439 /* Emit delayed restores and reset the CFA to be SP. */
8440 insn = get_last_insn ();
8441 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8442 REG_NOTES (insn) = cfi_ops;
8443 RTX_FRAME_RELATED_P (insn) = 1;
8444 }
8445
8446 /* We prefer to emit the combined return/authenticate instruction RETAA,
8447 however there are three cases in which we must instead emit an explicit
8448 authentication instruction.
8449
8450 1) Sibcalls don't return in a normal way, so if we're about to call one
8451 we must authenticate.
8452
8453 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8454 generating code for !TARGET_ARMV8_3 we can't use it and must
8455 explicitly authenticate.
8456
8457 3) On an eh_return path we make extra stack adjustments to update the
8458 canonical frame address to be the exception handler's CFA. We want
8459 to authenticate using the CFA of the function which calls eh_return.
8460 */
8461 if (aarch64_return_address_signing_enabled ()
8462 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8463 {
8464 switch (aarch64_ra_sign_key)
8465 {
8466 case AARCH64_KEY_A:
8467 insn = emit_insn (gen_autiasp ());
8468 break;
8469 case AARCH64_KEY_B:
8470 insn = emit_insn (gen_autibsp ());
8471 break;
8472 default:
8473 gcc_unreachable ();
8474 }
8475 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8476 RTX_FRAME_RELATED_P (insn) = 1;
8477 }
8478
8479 /* Stack adjustment for exception handler. */
8480 if (crtl->calls_eh_return && !for_sibcall)
8481 {
8482 /* We need to unwind the stack by the offset computed by
8483 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8484 to be SP; letting the CFA move during this adjustment
8485 is just as correct as retaining the CFA from the body
8486 of the function. Therefore, do nothing special. */
8487 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8488 }
8489
8490 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8491 if (!for_sibcall)
8492 emit_jump_insn (ret_rtx);
8493 }
8494
8495 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8496 normally or return to a previous frame after unwinding.
8497
8498 An EH return uses a single shared return sequence. The epilogue is
8499 exactly like a normal epilogue except that it has an extra input
8500 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8501 that must be applied after the frame has been destroyed. An extra label
8502 is inserted before the epilogue which initializes this register to zero,
8503 and this is the entry point for a normal return.
8504
8505 An actual EH return updates the return address, initializes the stack
8506 adjustment and jumps directly into the epilogue (bypassing the zeroing
8507 of the adjustment). Since the return address is typically saved on the
8508 stack when a function makes a call, the saved LR must be updated outside
8509 the epilogue.
8510
8511 This poses problems as the store is generated well before the epilogue,
8512 so the offset of LR is not known yet. Also optimizations will remove the
8513 store as it appears dead, even after the epilogue is generated (as the
8514 base or offset for loading LR is different in many cases).
8515
8516 To avoid these problems this implementation forces the frame pointer
8517 in eh_return functions so that the location of LR is fixed and known early.
8518 It also marks the store volatile, so no optimization is permitted to
8519 remove the store. */
8520 rtx
8521 aarch64_eh_return_handler_rtx (void)
8522 {
8523 rtx tmp = gen_frame_mem (Pmode,
8524 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8525
8526 /* Mark the store volatile, so no optimization is permitted to remove it. */
8527 MEM_VOLATILE_P (tmp) = true;
8528 return tmp;
8529 }
8530
8531 /* Output code to add DELTA to the first argument, and then jump
8532 to FUNCTION. Used for C++ multiple inheritance. */
8533 static void
8534 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8535 HOST_WIDE_INT delta,
8536 HOST_WIDE_INT vcall_offset,
8537 tree function)
8538 {
8539 /* The this pointer is always in x0. Note that this differs from
8540      Arm where the this pointer may be bumped to r1 if r0 is required
8541 to return a pointer to an aggregate. On AArch64 a result value
8542 pointer will be in x8. */
8543 int this_regno = R0_REGNUM;
8544 rtx this_rtx, temp0, temp1, addr, funexp;
8545 rtx_insn *insn;
8546 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8547
8548 if (aarch64_bti_enabled ())
8549 emit_insn (gen_bti_c());
8550
8551 reload_completed = 1;
8552 emit_note (NOTE_INSN_PROLOGUE_END);
8553
8554 this_rtx = gen_rtx_REG (Pmode, this_regno);
8555 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8556 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8557
8558 if (vcall_offset == 0)
8559 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8560 else
8561 {
8562 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8563
8564 addr = this_rtx;
8565 if (delta != 0)
8566 {
8567 if (delta >= -256 && delta < 256)
8568 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8569 plus_constant (Pmode, this_rtx, delta));
8570 else
8571 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8572 temp1, temp0, false);
8573 }
8574
8575 if (Pmode == ptr_mode)
8576 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8577 else
8578 aarch64_emit_move (temp0,
8579 gen_rtx_ZERO_EXTEND (Pmode,
8580 gen_rtx_MEM (ptr_mode, addr)));
8581
8582 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8583 addr = plus_constant (Pmode, temp0, vcall_offset);
8584 else
8585 {
8586 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8587 Pmode);
8588 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8589 }
8590
8591 if (Pmode == ptr_mode)
8592 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
8593 else
8594 aarch64_emit_move (temp1,
8595 gen_rtx_SIGN_EXTEND (Pmode,
8596 gen_rtx_MEM (ptr_mode, addr)));
8597
8598 emit_insn (gen_add2_insn (this_rtx, temp1));
8599 }
8600
8601 /* Generate a tail call to the target function. */
8602 if (!TREE_USED (function))
8603 {
8604 assemble_external (function);
8605 TREE_USED (function) = 1;
8606 }
8607 funexp = XEXP (DECL_RTL (function), 0);
8608 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8609 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8610 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8611 SIBLING_CALL_P (insn) = 1;
8612
8613 insn = get_insns ();
8614 shorten_branches (insn);
8615
8616 assemble_start_function (thunk, fnname);
8617 final_start_function (insn, file, 1);
8618 final (insn, file, 1);
8619 final_end_function ();
8620 assemble_end_function (thunk, fnname);
8621
8622 /* Stop pretending to be a post-reload pass. */
8623 reload_completed = 0;
8624 }
8625
8626 static bool
8627 aarch64_tls_referenced_p (rtx x)
8628 {
8629 if (!TARGET_HAVE_TLS)
8630 return false;
8631 subrtx_iterator::array_type array;
8632 FOR_EACH_SUBRTX (iter, array, x, ALL)
8633 {
8634 const_rtx x = *iter;
8635 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
8636 return true;
8637 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8638 TLS offsets, not real symbol references. */
8639 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8640 iter.skip_subrtxes ();
8641 }
8642 return false;
8643 }
8644
8645
8646 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8647 a left shift of 0 or 12 bits. */
8648 bool
8649 aarch64_uimm12_shift (HOST_WIDE_INT val)
8650 {
8651 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8652 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8653 );
8654 }
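
/* For example, 0xfff and 0x123000 both satisfy this check (shift 0 and
   shift 12 respectively), whereas 0x1001 does not, since its set bits do not
   fall entirely within either 12-bit field.  */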
8655
8656 /* Return the largest value no greater than VAL that fits as a 12-bit unsigned
8657    immediate with a left shift of 0 or 12.  */
8658 static HOST_WIDE_INT
8659 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8660 {
8661 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8662 handle correctly. */
8663 gcc_assert ((val & 0xffffff) == val);
8664
8665 if (((val & 0xfff) << 0) == val)
8666 return val;
8667
8668 return val & (0xfff << 12);
8669 }
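
/* As a worked example, 0x123456 does not fit in the low 12 bits alone, so the
   function returns 0x123456 & (0xfff << 12) == 0x123000, which is the largest
   representable value not exceeding the original.  */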
8670
8671 /* Return true if val is an immediate that can be loaded into a
8672 register by a MOVZ instruction. */
8673 static bool
8674 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8675 {
8676 if (GET_MODE_SIZE (mode) > 4)
8677 {
8678 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8679 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8680 	return true;
8681 }
8682 else
8683 {
8684 /* Ignore sign extension. */
8685 val &= (HOST_WIDE_INT) 0xffffffff;
8686 }
8687 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8688 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8689 }
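
/* For instance, 0x1234 and 0x12340000 are both accepted for SImode (they
   correspond to "movz wN, #0x1234" and "movz wN, #0x1234, lsl #16"), whereas
   0x12345678 is rejected because it has nonzero bits in two 16-bit chunks.  */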
8690
8691 /* Test whether:
8692
8693 X = (X & AND_VAL) | IOR_VAL;
8694
8695 can be implemented using:
8696
8697 MOVK X, #(IOR_VAL >> shift), LSL #shift
8698
8699 Return the shift if so, otherwise return -1. */
8700 int
8701 aarch64_movk_shift (const wide_int_ref &and_val,
8702 const wide_int_ref &ior_val)
8703 {
8704 unsigned int precision = and_val.get_precision ();
8705 unsigned HOST_WIDE_INT mask = 0xffff;
8706 for (unsigned int shift = 0; shift < precision; shift += 16)
8707 {
8708 if (and_val == ~mask && (ior_val & mask) == ior_val)
8709 return shift;
8710 mask <<= 16;
8711 }
8712 return -1;
8713 }
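
/* For example, with AND_VAL == ~0xffff0000 and IOR_VAL == 0x12340000 in DImode
   the function returns 16, matching "movk xN, #0x1234, lsl #16".  If the set
   bits of IOR_VAL straddle a 16-bit boundary, no single MOVK can insert them
   and -1 is returned.  */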
8714
8715 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
8716 64-bit (DImode) integer. */
8717
8718 static unsigned HOST_WIDE_INT
8719 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8720 {
8721 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8722 while (size < 64)
8723 {
8724 val &= (HOST_WIDE_INT_1U << size) - 1;
8725 val |= val << size;
8726 size *= 2;
8727 }
8728 return val;
8729 }
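
/* For example, the HImode value 0x8f replicates to 0x008f008f008f008f and
   the SImode value 0xf000000f replicates to 0xf000000ff000000f.  */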
8730
8731 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8732
8733 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8734 {
8735 0x0000000100000001ull,
8736 0x0001000100010001ull,
8737 0x0101010101010101ull,
8738 0x1111111111111111ull,
8739 0x5555555555555555ull,
8740 };
8741
8742
8743 /* Return true if val is a valid bitmask immediate. */
8744
8745 bool
8746 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8747 {
8748 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8749 int bits;
8750
8751 /* Check for a single sequence of one bits and return quickly if so.
8752      The special cases of all ones and all zeroes return false.  */
8753 val = aarch64_replicate_bitmask_imm (val_in, mode);
8754 tmp = val + (val & -val);
8755
8756 if (tmp == (tmp & -tmp))
8757 return (val + 1) > 1;
8758
8759 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8760 if (mode == SImode)
8761 val = (val << 32) | (val & 0xffffffff);
8762
8763 /* Invert if the immediate doesn't start with a zero bit - this means we
8764 only need to search for sequences of one bits. */
8765 if (val & 1)
8766 val = ~val;
8767
8768 /* Find the first set bit and set tmp to val with the first sequence of one
8769 bits removed. Return success if there is a single sequence of ones. */
8770 first_one = val & -val;
8771 tmp = val & (val + first_one);
8772
8773 if (tmp == 0)
8774 return true;
8775
8776 /* Find the next set bit and compute the difference in bit position. */
8777 next_one = tmp & -tmp;
8778 bits = clz_hwi (first_one) - clz_hwi (next_one);
8779 mask = val ^ tmp;
8780
8781 /* Check the bit position difference is a power of 2, and that the first
8782 sequence of one bits fits within 'bits' bits. */
8783 if ((mask >> bits) != 0 || bits != (bits & -bits))
8784 return false;
8785
8786 /* Check the sequence of one bits is repeated 64/bits times. */
8787 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8788 }
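
/* For instance, 0x0000ffff0000ffff is accepted in DImode (a run of 16 ones
   repeated with a 32-bit period), while 0x0000fffe0000ffff is rejected
   because the two runs are not identical.  */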
8789
8790 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8791    Assumed precondition: VAL_IN is not zero.  */
8792
8793 unsigned HOST_WIDE_INT
8794 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8795 {
8796 int lowest_bit_set = ctz_hwi (val_in);
8797 int highest_bit_set = floor_log2 (val_in);
8798 gcc_assert (val_in != 0);
8799
8800 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8801 (HOST_WIDE_INT_1U << lowest_bit_set));
8802 }
8803
8804 /* Create a constant in which all bits outside the range from the lowest set
8805    bit to the highest set bit of VAL_IN are set to 1.  */
8806
8807 unsigned HOST_WIDE_INT
8808 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8809 {
8810 return val_in | ~aarch64_and_split_imm1 (val_in);
8811 }
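
/* As an illustration, for VAL_IN == 0x0ff00ff0, aarch64_and_split_imm1
   returns 0x0ffffff0 (a mask covering the span of set bits) and
   aarch64_and_split_imm2 returns 0xfffffffffff00fff, so that
   imm1 & imm2 == VAL_IN and an AND with VAL_IN can be performed as two ANDs
   with encodable bitmask immediates.  */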
8812
8813 /* Return true if VAL_IN is not a valid immediate for a single instruction
8814    but an AND with VAL_IN can instead be performed as two ANDs with bitmask
8815    immediates.  */
8814
8815 bool
8816 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8817 {
8818 scalar_int_mode int_mode;
8819 if (!is_a <scalar_int_mode> (mode, &int_mode))
8820 return false;
8821
8822 if (aarch64_bitmask_imm (val_in, int_mode))
8823 return false;
8824
8825 if (aarch64_move_imm (val_in, int_mode))
8826 return false;
8827
8828 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8829
8830 return aarch64_bitmask_imm (imm2, int_mode);
8831 }
8832
8833 /* Return true if val is an immediate that can be loaded into a
8834 register in a single instruction. */
8835 bool
8836 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8837 {
8838 scalar_int_mode int_mode;
8839 if (!is_a <scalar_int_mode> (mode, &int_mode))
8840 return false;
8841
8842 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8843     return true;
8844 return aarch64_bitmask_imm (val, int_mode);
8845 }
8846
8847 static bool
8848 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8849 {
8850 if (GET_CODE (x) == HIGH)
8851 return true;
8852
8853 /* There's no way to calculate VL-based values using relocations. */
8854 subrtx_iterator::array_type array;
8855 FOR_EACH_SUBRTX (iter, array, x, ALL)
8856 if (GET_CODE (*iter) == CONST_POLY_INT)
8857 return true;
8858
8859 poly_int64 offset;
8860 rtx base = strip_offset_and_salt (x, &offset);
8861 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
8862 {
8863 /* We checked for POLY_INT_CST offsets above. */
8864 if (aarch64_classify_symbol (base, offset.to_constant ())
8865 != SYMBOL_FORCE_TO_MEM)
8866 return true;
8867 else
8868 /* Avoid generating a 64-bit relocation in ILP32; leave
8869 to aarch64_expand_mov_immediate to handle it properly. */
8870 return mode != ptr_mode;
8871 }
8872
8873 return aarch64_tls_referenced_p (x);
8874 }
8875
8876 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8877 The expansion for a table switch is quite expensive due to the number
8878    of instructions, the table lookup and the hard-to-predict indirect jump.
8879    When optimizing for speed at -O3 or above, use the per-core tuning if
8880 set, otherwise use tables for > 16 cases as a tradeoff between size and
8881 performance. When optimizing for size, use the default setting. */
8882
8883 static unsigned int
8884 aarch64_case_values_threshold (void)
8885 {
8886 /* Use the specified limit for the number of cases before using jump
8887 tables at higher optimization levels. */
8888 if (optimize > 2
8889 && selected_cpu->tune->max_case_values != 0)
8890 return selected_cpu->tune->max_case_values;
8891 else
8892 return optimize_size ? default_case_values_threshold () : 17;
8893 }
8894
8895 /* Return true if register REGNO is a valid index register.
8896 STRICT_P is true if REG_OK_STRICT is in effect. */
8897
8898 bool
8899 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8900 {
8901 if (!HARD_REGISTER_NUM_P (regno))
8902 {
8903 if (!strict_p)
8904 return true;
8905
8906 if (!reg_renumber)
8907 return false;
8908
8909 regno = reg_renumber[regno];
8910 }
8911 return GP_REGNUM_P (regno);
8912 }
8913
8914 /* Return true if register REGNO is a valid base register.
8915 STRICT_P is true if REG_OK_STRICT is in effect. */
8916
8917 bool
8918 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8919 {
8920 if (!HARD_REGISTER_NUM_P (regno))
8921 {
8922 if (!strict_p)
8923 return true;
8924
8925 if (!reg_renumber)
8926 return false;
8927
8928 regno = reg_renumber[regno];
8929 }
8930
8931 /* The fake registers will be eliminated to either the stack or
8932 hard frame pointer, both of which are usually valid base registers.
8933 Reload deals with the cases where the eliminated form isn't valid. */
8934 return (GP_REGNUM_P (regno)
8935 || regno == SP_REGNUM
8936 || regno == FRAME_POINTER_REGNUM
8937 || regno == ARG_POINTER_REGNUM);
8938 }
8939
8940 /* Return true if X is a valid base register.
8941 STRICT_P is true if REG_OK_STRICT is in effect. */
8942
8943 static bool
8944 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8945 {
8946 if (!strict_p
8947 && SUBREG_P (x)
8948 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8949 x = SUBREG_REG (x);
8950
8951 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8952 }
8953
8954 /* Return true if address offset is a valid index. If it is, fill in INFO
8955 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8956
8957 static bool
8958 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8959 machine_mode mode, bool strict_p)
8960 {
8961 enum aarch64_address_type type;
8962 rtx index;
8963 int shift;
8964
8965 /* (reg:P) */
8966 if ((REG_P (x) || SUBREG_P (x))
8967 && GET_MODE (x) == Pmode)
8968 {
8969 type = ADDRESS_REG_REG;
8970 index = x;
8971 shift = 0;
8972 }
8973 /* (sign_extend:DI (reg:SI)) */
8974 else if ((GET_CODE (x) == SIGN_EXTEND
8975 || GET_CODE (x) == ZERO_EXTEND)
8976 && GET_MODE (x) == DImode
8977 && GET_MODE (XEXP (x, 0)) == SImode)
8978 {
8979 type = (GET_CODE (x) == SIGN_EXTEND)
8980 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8981 index = XEXP (x, 0);
8982 shift = 0;
8983 }
8984 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8985 else if (GET_CODE (x) == MULT
8986 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8987 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8988 && GET_MODE (XEXP (x, 0)) == DImode
8989 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8990 && CONST_INT_P (XEXP (x, 1)))
8991 {
8992 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8993 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8994 index = XEXP (XEXP (x, 0), 0);
8995 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8996 }
8997 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8998 else if (GET_CODE (x) == ASHIFT
8999 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9000 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9001 && GET_MODE (XEXP (x, 0)) == DImode
9002 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9003 && CONST_INT_P (XEXP (x, 1)))
9004 {
9005 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9006 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9007 index = XEXP (XEXP (x, 0), 0);
9008 shift = INTVAL (XEXP (x, 1));
9009 }
9010 /* (and:DI (mult:DI (reg:DI) (const_int scale))
9011 (const_int 0xffffffff<<shift)) */
9012 else if (GET_CODE (x) == AND
9013 && GET_MODE (x) == DImode
9014 && GET_CODE (XEXP (x, 0)) == MULT
9015 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9016 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9017 && CONST_INT_P (XEXP (x, 1)))
9018 {
9019 type = ADDRESS_REG_UXTW;
9020 index = XEXP (XEXP (x, 0), 0);
9021 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9022 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9023 shift = -1;
9024 }
9025 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9026 (const_int 0xffffffff<<shift)) */
9027 else if (GET_CODE (x) == AND
9028 && GET_MODE (x) == DImode
9029 && GET_CODE (XEXP (x, 0)) == ASHIFT
9030 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9031 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9032 && CONST_INT_P (XEXP (x, 1)))
9033 {
9034 type = ADDRESS_REG_UXTW;
9035 index = XEXP (XEXP (x, 0), 0);
9036 shift = INTVAL (XEXP (XEXP (x, 0), 1));
9037 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9038 shift = -1;
9039 }
9040 /* (mult:P (reg:P) (const_int scale)) */
9041 else if (GET_CODE (x) == MULT
9042 && GET_MODE (x) == Pmode
9043 && GET_MODE (XEXP (x, 0)) == Pmode
9044 && CONST_INT_P (XEXP (x, 1)))
9045 {
9046 type = ADDRESS_REG_REG;
9047 index = XEXP (x, 0);
9048 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9049 }
9050 /* (ashift:P (reg:P) (const_int shift)) */
9051 else if (GET_CODE (x) == ASHIFT
9052 && GET_MODE (x) == Pmode
9053 && GET_MODE (XEXP (x, 0)) == Pmode
9054 && CONST_INT_P (XEXP (x, 1)))
9055 {
9056 type = ADDRESS_REG_REG;
9057 index = XEXP (x, 0);
9058 shift = INTVAL (XEXP (x, 1));
9059 }
9060 else
9061 return false;
9062
9063 if (!strict_p
9064 && SUBREG_P (index)
9065 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
9066 index = SUBREG_REG (index);
9067
9068 if (aarch64_sve_data_mode_p (mode))
9069 {
9070 if (type != ADDRESS_REG_REG
9071 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9072 return false;
9073 }
9074 else
9075 {
9076 if (shift != 0
9077 && !(IN_RANGE (shift, 1, 3)
9078 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9079 return false;
9080 }
9081
9082 if (REG_P (index)
9083 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9084 {
9085 info->type = type;
9086 info->offset = index;
9087 info->shift = shift;
9088 return true;
9089 }
9090
9091 return false;
9092 }
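
/* For example, when Pmode is DImode, the index expression
   (ashift:DI (reg:DI xN) (const_int 3)) used for a DImode access is
   classified as ADDRESS_REG_REG with shift 3, corresponding to an address
   operand such as [xM, xN, lsl #3].  */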
9093
9094 /* Return true if MODE is one of the modes for which we
9095 support LDP/STP operations. */
9096
9097 static bool
9098 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9099 {
9100 return mode == SImode || mode == DImode
9101 || mode == SFmode || mode == DFmode
9102 || (aarch64_vector_mode_supported_p (mode)
9103 && (known_eq (GET_MODE_SIZE (mode), 8)
9104 || (known_eq (GET_MODE_SIZE (mode), 16)
9105 && (aarch64_tune_params.extra_tuning_flags
9106 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
9107 }
9108
9109 /* Return true if REGNO is a virtual pointer register, or an eliminable
9110 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
9111 include stack_pointer or hard_frame_pointer. */
9112 static bool
9113 virt_or_elim_regno_p (unsigned regno)
9114 {
9115 return ((regno >= FIRST_VIRTUAL_REGISTER
9116 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9117 || regno == FRAME_POINTER_REGNUM
9118 || regno == ARG_POINTER_REGNUM);
9119 }
9120
9121 /* Return true if X is a valid address of type TYPE for machine mode MODE.
9122 If it is, fill in INFO appropriately. STRICT_P is true if
9123 REG_OK_STRICT is in effect. */
9124
9125 bool
9126 aarch64_classify_address (struct aarch64_address_info *info,
9127 rtx x, machine_mode mode, bool strict_p,
9128 aarch64_addr_query_type type)
9129 {
9130 enum rtx_code code = GET_CODE (x);
9131 rtx op0, op1;
9132 poly_int64 offset;
9133
9134 HOST_WIDE_INT const_size;
9135
9136 /* Whether a vector mode is partial doesn't affect address legitimacy.
9137 Partial vectors like VNx8QImode allow the same indexed addressing
9138 mode and MUL VL addressing mode as full vectors like VNx16QImode;
9139 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
9140 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9141 vec_flags &= ~VEC_PARTIAL;
9142
9143 /* On BE, we use load/store pair for all large int mode load/stores.
9144 TI/TFmode may also use a load/store pair. */
9145 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
9146 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
9147 || type == ADDR_QUERY_LDP_STP_N
9148 || mode == TImode
9149 || mode == TFmode
9150 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
9151
9152   /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
9153      to the actual size of the memory being loaded/stored and the mode used
9154      for the addressing is half of that size.  */
9155 if (type == ADDR_QUERY_LDP_STP_N
9156 && known_eq (GET_MODE_SIZE (mode), 16))
9157 mode = DFmode;
9158
9159 bool allow_reg_index_p = (!load_store_pair_p
9160 && (known_lt (GET_MODE_SIZE (mode), 16)
9161 || vec_flags == VEC_ADVSIMD
9162 || vec_flags & VEC_SVE_DATA));
9163
9164 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9165 [Rn, #offset, MUL VL]. */
9166 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9167 && (code != REG && code != PLUS))
9168 return false;
9169
9170 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9171 REG addressing. */
9172 if (advsimd_struct_p
9173 && !BYTES_BIG_ENDIAN
9174 && (code != POST_INC && code != REG))
9175 return false;
9176
9177 gcc_checking_assert (GET_MODE (x) == VOIDmode
9178 || SCALAR_INT_MODE_P (GET_MODE (x)));
9179
9180 switch (code)
9181 {
9182 case REG:
9183 case SUBREG:
9184 info->type = ADDRESS_REG_IMM;
9185 info->base = x;
9186 info->offset = const0_rtx;
9187 info->const_offset = 0;
9188 return aarch64_base_register_rtx_p (x, strict_p);
9189
9190 case PLUS:
9191 op0 = XEXP (x, 0);
9192 op1 = XEXP (x, 1);
9193
9194 if (! strict_p
9195 && REG_P (op0)
9196 && virt_or_elim_regno_p (REGNO (op0))
9197 && poly_int_rtx_p (op1, &offset))
9198 {
9199 info->type = ADDRESS_REG_IMM;
9200 info->base = op0;
9201 info->offset = op1;
9202 info->const_offset = offset;
9203
9204 return true;
9205 }
9206
9207 if (maybe_ne (GET_MODE_SIZE (mode), 0)
9208 && aarch64_base_register_rtx_p (op0, strict_p)
9209 && poly_int_rtx_p (op1, &offset))
9210 {
9211 info->type = ADDRESS_REG_IMM;
9212 info->base = op0;
9213 info->offset = op1;
9214 info->const_offset = offset;
9215
9216 /* TImode and TFmode values are allowed in both pairs of X
9217 registers and individual Q registers. The available
9218 address modes are:
9219 X,X: 7-bit signed scaled offset
9220 Q: 9-bit signed offset
9221 We conservatively require an offset representable in either mode.
9222 When performing the check for pairs of X registers i.e. LDP/STP
9223 pass down DImode since that is the natural size of the LDP/STP
9224 instruction memory accesses. */
9225 if (mode == TImode || mode == TFmode)
9226 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9227 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9228 || offset_12bit_unsigned_scaled_p (mode, offset)));
9229
9230 	  /* A 7-bit offset check because OImode will emit an ldp/stp
9231 instruction (only big endian will get here).
9232 For ldp/stp instructions, the offset is scaled for the size of a
9233 single element of the pair. */
9234 if (mode == OImode)
9235 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9236
9237 	  /* Three 9/12-bit offset checks because CImode will emit three
9238 ldr/str instructions (only big endian will get here). */
9239 if (mode == CImode)
9240 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9241 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9242 offset + 32)
9243 || offset_12bit_unsigned_scaled_p (V16QImode,
9244 offset + 32)));
9245
9246 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
9247 instructions (only big endian will get here). */
9248 if (mode == XImode)
9249 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9250 && aarch64_offset_7bit_signed_scaled_p (TImode,
9251 offset + 32));
9252
9253 /* Make "m" use the LD1 offset range for SVE data modes, so
9254 that pre-RTL optimizers like ivopts will work to that
9255 instead of the wider LDR/STR range. */
9256 if (vec_flags == VEC_SVE_DATA)
9257 return (type == ADDR_QUERY_M
9258 ? offset_4bit_signed_scaled_p (mode, offset)
9259 : offset_9bit_signed_scaled_p (mode, offset));
9260
9261 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9262 {
9263 poly_int64 end_offset = (offset
9264 + GET_MODE_SIZE (mode)
9265 - BYTES_PER_SVE_VECTOR);
9266 return (type == ADDR_QUERY_M
9267 ? offset_4bit_signed_scaled_p (mode, offset)
9268 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9269 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9270 end_offset)));
9271 }
9272
9273 if (vec_flags == VEC_SVE_PRED)
9274 return offset_9bit_signed_scaled_p (mode, offset);
9275
9276 if (load_store_pair_p)
9277 return ((known_eq (GET_MODE_SIZE (mode), 4)
9278 || known_eq (GET_MODE_SIZE (mode), 8)
9279 || known_eq (GET_MODE_SIZE (mode), 16))
9280 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9281 else
9282 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9283 || offset_12bit_unsigned_scaled_p (mode, offset));
9284 }
9285
9286 if (allow_reg_index_p)
9287 {
9288 /* Look for base + (scaled/extended) index register. */
9289 if (aarch64_base_register_rtx_p (op0, strict_p)
9290 && aarch64_classify_index (info, op1, mode, strict_p))
9291 {
9292 info->base = op0;
9293 return true;
9294 }
9295 if (aarch64_base_register_rtx_p (op1, strict_p)
9296 && aarch64_classify_index (info, op0, mode, strict_p))
9297 {
9298 info->base = op1;
9299 return true;
9300 }
9301 }
9302
9303 return false;
9304
9305 case POST_INC:
9306 case POST_DEC:
9307 case PRE_INC:
9308 case PRE_DEC:
9309 info->type = ADDRESS_REG_WB;
9310 info->base = XEXP (x, 0);
9311 info->offset = NULL_RTX;
9312 return aarch64_base_register_rtx_p (info->base, strict_p);
9313
9314 case POST_MODIFY:
9315 case PRE_MODIFY:
9316 info->type = ADDRESS_REG_WB;
9317 info->base = XEXP (x, 0);
9318 if (GET_CODE (XEXP (x, 1)) == PLUS
9319 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9320 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9321 && aarch64_base_register_rtx_p (info->base, strict_p))
9322 {
9323 info->offset = XEXP (XEXP (x, 1), 1);
9324 info->const_offset = offset;
9325
9326 /* TImode and TFmode values are allowed in both pairs of X
9327 registers and individual Q registers. The available
9328 address modes are:
9329 X,X: 7-bit signed scaled offset
9330 Q: 9-bit signed offset
9331 We conservatively require an offset representable in either mode.
9332 */
9333 if (mode == TImode || mode == TFmode)
9334 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9335 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9336
9337 if (load_store_pair_p)
9338 return ((known_eq (GET_MODE_SIZE (mode), 4)
9339 || known_eq (GET_MODE_SIZE (mode), 8)
9340 || known_eq (GET_MODE_SIZE (mode), 16))
9341 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9342 else
9343 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9344 }
9345 return false;
9346
9347 case CONST:
9348 case SYMBOL_REF:
9349 case LABEL_REF:
9350 /* load literal: pc-relative constant pool entry. Only supported
9351 for SI mode or larger. */
9352 info->type = ADDRESS_SYMBOLIC;
9353
9354 if (!load_store_pair_p
9355 && GET_MODE_SIZE (mode).is_constant (&const_size)
9356 && const_size >= 4)
9357 {
9358 poly_int64 offset;
9359 rtx sym = strip_offset_and_salt (x, &offset);
9360 return ((LABEL_REF_P (sym)
9361 || (SYMBOL_REF_P (sym)
9362 && CONSTANT_POOL_ADDRESS_P (sym)
9363 && aarch64_pcrelative_literal_loads)));
9364 }
9365 return false;
9366
9367 case LO_SUM:
9368 info->type = ADDRESS_LO_SUM;
9369 info->base = XEXP (x, 0);
9370 info->offset = XEXP (x, 1);
9371 if (allow_reg_index_p
9372 && aarch64_base_register_rtx_p (info->base, strict_p))
9373 {
9374 poly_int64 offset;
9375 HOST_WIDE_INT const_offset;
9376 rtx sym = strip_offset_and_salt (info->offset, &offset);
9377 if (SYMBOL_REF_P (sym)
9378 && offset.is_constant (&const_offset)
9379 && (aarch64_classify_symbol (sym, const_offset)
9380 == SYMBOL_SMALL_ABSOLUTE))
9381 {
9382 /* The symbol and offset must be aligned to the access size. */
9383 unsigned int align;
9384
9385 if (CONSTANT_POOL_ADDRESS_P (sym))
9386 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9387 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9388 {
9389 tree exp = SYMBOL_REF_DECL (sym);
9390 align = TYPE_ALIGN (TREE_TYPE (exp));
9391 align = aarch64_constant_alignment (exp, align);
9392 }
9393 else if (SYMBOL_REF_DECL (sym))
9394 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9395 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9396 && SYMBOL_REF_BLOCK (sym) != NULL)
9397 align = SYMBOL_REF_BLOCK (sym)->alignment;
9398 else
9399 align = BITS_PER_UNIT;
9400
9401 poly_int64 ref_size = GET_MODE_SIZE (mode);
9402 if (known_eq (ref_size, 0))
9403 ref_size = GET_MODE_SIZE (DImode);
9404
9405 return (multiple_p (const_offset, ref_size)
9406 && multiple_p (align / BITS_PER_UNIT, ref_size));
9407 }
9408 }
9409 return false;
9410
9411 default:
9412 return false;
9413 }
9414 }
9415
9416 /* Return true if the address X is valid for a PRFM instruction.
9417 STRICT_P is true if we should do strict checking with
9418 aarch64_classify_address. */
9419
9420 bool
9421 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9422 {
9423 struct aarch64_address_info addr;
9424
9425 /* PRFM accepts the same addresses as DImode... */
9426 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9427 if (!res)
9428 return false;
9429
9430 /* ... except writeback forms. */
9431 return addr.type != ADDRESS_REG_WB;
9432 }
9433
9434 bool
9435 aarch64_symbolic_address_p (rtx x)
9436 {
9437 poly_int64 offset;
9438 x = strip_offset_and_salt (x, &offset);
9439 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
9440 }
9441
9442 /* Classify the base of symbolic expression X. */
9443
9444 enum aarch64_symbol_type
9445 aarch64_classify_symbolic_expression (rtx x)
9446 {
9447 rtx offset;
9448
9449 split_const (x, &x, &offset);
9450 return aarch64_classify_symbol (x, INTVAL (offset));
9451 }
9452
9453
9454 /* Return TRUE if X is a legitimate address for accessing memory in
9455 mode MODE. */
9456 static bool
9457 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9458 {
9459 struct aarch64_address_info addr;
9460
9461 return aarch64_classify_address (&addr, x, mode, strict_p);
9462 }
9463
9464 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9465 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
9466 bool
9467 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9468 aarch64_addr_query_type type)
9469 {
9470 struct aarch64_address_info addr;
9471
9472 return aarch64_classify_address (&addr, x, mode, strict_p, type);
9473 }
9474
9475 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9476
9477 static bool
9478 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9479 poly_int64 orig_offset,
9480 machine_mode mode)
9481 {
9482 HOST_WIDE_INT size;
9483 if (GET_MODE_SIZE (mode).is_constant (&size))
9484 {
9485 HOST_WIDE_INT const_offset, second_offset;
9486
9487 /* A general SVE offset is A * VQ + B. Remove the A component from
9488 coefficient 0 in order to get the constant B. */
9489 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9490
9491 /* Split an out-of-range address displacement into a base and
9492 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9493 range otherwise to increase opportunities for sharing the base
9494 address of different sizes. Unaligned accesses use the signed
9495 9-bit range, TImode/TFmode use the intersection of signed
9496 scaled 7-bit and signed 9-bit offset. */
9497 if (mode == TImode || mode == TFmode)
9498 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9499 else if ((const_offset & (size - 1)) != 0)
9500 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9501 else
9502 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9503
9504 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9505 return false;
9506
9507 /* Split the offset into second_offset and the rest. */
9508 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9509 *offset2 = gen_int_mode (second_offset, Pmode);
9510 return true;
9511 }
9512 else
9513 {
9514 /* Get the mode we should use as the basis of the range. For structure
9515 modes this is the mode of one vector. */
9516 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9517 machine_mode step_mode
9518 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9519
9520 /* Get the "mul vl" multiplier we'd like to use. */
9521 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9522 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9523 if (vec_flags & VEC_SVE_DATA)
9524 /* LDR supports a 9-bit range, but the move patterns for
9525 structure modes require all vectors to be in range of the
9526 	   same base.  The simplest way of accommodating that while still
9527 promoting reuse of anchor points between different modes is
9528 to use an 8-bit range unconditionally. */
9529 vnum = ((vnum + 128) & 255) - 128;
9530 else
9531 /* Predicates are only handled singly, so we might as well use
9532 the full range. */
9533 vnum = ((vnum + 256) & 511) - 256;
9534 if (vnum == 0)
9535 return false;
9536
9537 /* Convert the "mul vl" multiplier into a byte offset. */
9538 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9539 if (known_eq (second_offset, orig_offset))
9540 return false;
9541
9542 /* Split the offset into second_offset and the rest. */
9543 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9544 *offset2 = gen_int_mode (second_offset, Pmode);
9545 return true;
9546 }
9547 }
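
/* As a concrete non-SVE example, a DImode access at constant offset 0x10010
   is out of range for a single LDR/STR, so the offset is split into an anchor
   part of 0x10000 (*OFFSET1) and an in-range scaled part of 0x10
   (*OFFSET2).  */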
9548
9549 /* Return the binary representation of floating point constant VALUE in INTVAL.
9550 If the value cannot be converted, return false without setting INTVAL.
9551    The conversion is done in the mode of VALUE.  */
9552 bool
9553 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9554 {
9555
9556 /* We make a general exception for 0. */
9557 if (aarch64_float_const_zero_rtx_p (value))
9558 {
9559 *intval = 0;
9560 return true;
9561 }
9562
9563 scalar_float_mode mode;
9564 if (!CONST_DOUBLE_P (value)
9565 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9566 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9567 /* Only support up to DF mode. */
9568 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9569 return false;
9570
9571 unsigned HOST_WIDE_INT ival = 0;
9572
9573 long res[2];
9574 real_to_target (res,
9575 CONST_DOUBLE_REAL_VALUE (value),
9576 REAL_MODE_FORMAT (mode));
9577
9578 if (mode == DFmode)
9579 {
9580 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9581 ival = zext_hwi (res[order], 32);
9582 ival |= (zext_hwi (res[1 - order], 32) << 32);
9583 }
9584 else
9585 ival = zext_hwi (res[0], 32);
9586
9587 *intval = ival;
9588 return true;
9589 }
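
/* For example, the DFmode constant 1.0 yields 0x3ff0000000000000 and the
   SFmode constant 2.0 yields 0x40000000.  */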
9590
9591 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9592 single MOV(+MOVK) followed by an FMOV. */
9593 bool
9594 aarch64_float_const_rtx_p (rtx x)
9595 {
9596 machine_mode mode = GET_MODE (x);
9597 if (mode == VOIDmode)
9598 return false;
9599
9600 /* Determine whether it's cheaper to write float constants as
9601 mov/movk pairs over ldr/adrp pairs. */
9602 unsigned HOST_WIDE_INT ival;
9603
9604 if (CONST_DOUBLE_P (x)
9605 && SCALAR_FLOAT_MODE_P (mode)
9606 && aarch64_reinterpret_float_as_int (x, &ival))
9607 {
9608 scalar_int_mode imode = (mode == HFmode
9609 ? SImode
9610 : int_mode_for_mode (mode).require ());
9611 int num_instr = aarch64_internal_mov_immediate
9612 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9613 return num_instr < 3;
9614 }
9615
9616 return false;
9617 }
9618
9619 /* Return TRUE if rtx X is the immediate constant 0.0.  */
9620 bool
9621 aarch64_float_const_zero_rtx_p (rtx x)
9622 {
9623 if (GET_MODE (x) == VOIDmode)
9624 return false;
9625
9626 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9627 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9628 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9629 }
9630
9631 /* Return TRUE if rtx X is an immediate constant that fits in a single
9632 MOVI immediate operation. */
9633 bool
9634 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9635 {
9636 if (!TARGET_SIMD)
9637 return false;
9638
9639 machine_mode vmode;
9640 scalar_int_mode imode;
9641 unsigned HOST_WIDE_INT ival;
9642
9643 if (CONST_DOUBLE_P (x)
9644 && SCALAR_FLOAT_MODE_P (mode))
9645 {
9646 if (!aarch64_reinterpret_float_as_int (x, &ival))
9647 return false;
9648
9649 /* We make a general exception for 0. */
9650 if (aarch64_float_const_zero_rtx_p (x))
9651 return true;
9652
9653 imode = int_mode_for_mode (mode).require ();
9654 }
9655 else if (CONST_INT_P (x)
9656 && is_a <scalar_int_mode> (mode, &imode))
9657 ival = INTVAL (x);
9658 else
9659 return false;
9660
9661   /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
9662      use a 128-bit vector mode.  */
9663 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9664
9665 vmode = aarch64_simd_container_mode (imode, width);
9666 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9667
9668 return aarch64_simd_valid_immediate (v_op, NULL);
9669 }
9670
9671
9672 /* Return the fixed registers used for condition codes. */
9673
9674 static bool
9675 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9676 {
9677 *p1 = CC_REGNUM;
9678 *p2 = INVALID_REGNUM;
9679 return true;
9680 }
9681
9682 /* This function is used by the call expanders of the machine description.
9683 RESULT is the register in which the result is returned. It's NULL for
9684 "call" and "sibcall".
9685 MEM is the location of the function call.
9686 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9687    SIBCALL indicates whether this function call is a normal call or a sibling
9688    call; a different call pattern is generated accordingly.  */
9689
9690 void
9691 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9692 {
9693 rtx call, callee, tmp;
9694 rtvec vec;
9695 machine_mode mode;
9696
9697 gcc_assert (MEM_P (mem));
9698 callee = XEXP (mem, 0);
9699 mode = GET_MODE (callee);
9700 gcc_assert (mode == Pmode);
9701
9702 /* Decide if we should generate indirect calls by loading the
9703 address of the callee into a register before performing
9704 the branch-and-link. */
9705 if (SYMBOL_REF_P (callee)
9706 ? (aarch64_is_long_call_p (callee)
9707 || aarch64_is_noplt_call_p (callee))
9708 : !REG_P (callee))
9709 XEXP (mem, 0) = force_reg (mode, callee);
9710
9711 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9712
9713 if (result != NULL_RTX)
9714 call = gen_rtx_SET (result, call);
9715
9716 if (sibcall)
9717 tmp = ret_rtx;
9718 else
9719 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9720
9721 gcc_assert (CONST_INT_P (callee_abi));
9722 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9723 UNSPEC_CALLEE_ABI);
9724
9725 vec = gen_rtvec (3, call, callee_abi, tmp);
9726 call = gen_rtx_PARALLEL (VOIDmode, vec);
9727
9728 aarch64_emit_call_insn (call);
9729 }
9730
9731 /* Emit call insn with PAT and do aarch64-specific handling. */
9732
9733 void
9734 aarch64_emit_call_insn (rtx pat)
9735 {
9736 rtx insn = emit_call_insn (pat);
9737
9738 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9739 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9740 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9741 }
9742
9743 machine_mode
9744 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9745 {
9746 machine_mode mode_x = GET_MODE (x);
9747 rtx_code code_x = GET_CODE (x);
9748
9749 /* All floating point compares return CCFP if it is an equality
9750 comparison, and CCFPE otherwise. */
9751 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9752 {
9753 switch (code)
9754 {
9755 case EQ:
9756 case NE:
9757 case UNORDERED:
9758 case ORDERED:
9759 case UNLT:
9760 case UNLE:
9761 case UNGT:
9762 case UNGE:
9763 case UNEQ:
9764 return CCFPmode;
9765
9766 case LT:
9767 case LE:
9768 case GT:
9769 case GE:
9770 case LTGT:
9771 return CCFPEmode;
9772
9773 default:
9774 gcc_unreachable ();
9775 }
9776 }
9777
9778 /* Equality comparisons of short modes against zero can be performed
9779 using the TST instruction with the appropriate bitmask. */
9780 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9781 && (code == EQ || code == NE)
9782 && (mode_x == HImode || mode_x == QImode))
9783 return CC_NZmode;
9784
9785 /* Similarly, comparisons of zero_extends from shorter modes can
9786 be performed using an ANDS with an immediate mask. */
9787 if (y == const0_rtx && code_x == ZERO_EXTEND
9788 && (mode_x == SImode || mode_x == DImode)
9789 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9790 && (code == EQ || code == NE))
9791 return CC_NZmode;
9792
9793 if ((mode_x == SImode || mode_x == DImode)
9794 && y == const0_rtx
9795 && (code == EQ || code == NE || code == LT || code == GE)
9796 && (code_x == PLUS || code_x == MINUS || code_x == AND
9797 || code_x == NEG
9798 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9799 && CONST_INT_P (XEXP (x, 2)))))
9800 return CC_NZmode;
9801
9802 /* A compare with a shifted operand. Because of canonicalization,
9803 the comparison will have to be swapped when we emit the assembly
9804 code. */
9805 if ((mode_x == SImode || mode_x == DImode)
9806 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
9807 && (code_x == ASHIFT || code_x == ASHIFTRT
9808 || code_x == LSHIFTRT
9809 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9810 return CC_SWPmode;
9811
9812 /* Similarly for a negated operand, but we can only do this for
9813 equalities. */
9814 if ((mode_x == SImode || mode_x == DImode)
9815 && (REG_P (y) || SUBREG_P (y))
9816 && (code == EQ || code == NE)
9817 && code_x == NEG)
9818 return CC_Zmode;
9819
9820 /* A test for unsigned overflow from an addition. */
9821 if ((mode_x == DImode || mode_x == TImode)
9822 && (code == LTU || code == GEU)
9823 && code_x == PLUS
9824 && rtx_equal_p (XEXP (x, 0), y))
9825 return CC_Cmode;
9826
9827 /* A test for unsigned overflow from an add with carry. */
9828 if ((mode_x == DImode || mode_x == TImode)
9829 && (code == LTU || code == GEU)
9830 && code_x == PLUS
9831 && CONST_SCALAR_INT_P (y)
9832 && (rtx_mode_t (y, mode_x)
9833 == (wi::shwi (1, mode_x)
9834 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9835 return CC_ADCmode;
9836
9837 /* A test for signed overflow. */
9838 if ((mode_x == DImode || mode_x == TImode)
9839 && code == NE
9840 && code_x == PLUS
9841 && GET_CODE (y) == SIGN_EXTEND)
9842 return CC_Vmode;
9843
9844 /* For everything else, return CCmode. */
9845 return CCmode;
9846 }
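
/* For example, comparing (plus:DI a b) against const0_rtx with code NE
   selects CC_NZmode, so the addition itself can set the flags with an ADDS
   instruction; floating-point equality comparisons select CCFPmode while
   ordering comparisons such as LT select CCFPEmode.  */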
9847
9848 static int
9849 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9850
9851 int
9852 aarch64_get_condition_code (rtx x)
9853 {
9854 machine_mode mode = GET_MODE (XEXP (x, 0));
9855 enum rtx_code comp_code = GET_CODE (x);
9856
9857 if (GET_MODE_CLASS (mode) != MODE_CC)
9858 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9859 return aarch64_get_condition_code_1 (mode, comp_code);
9860 }
9861
9862 static int
9863 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9864 {
9865 switch (mode)
9866 {
9867 case E_CCFPmode:
9868 case E_CCFPEmode:
9869 switch (comp_code)
9870 {
9871 case GE: return AARCH64_GE;
9872 case GT: return AARCH64_GT;
9873 case LE: return AARCH64_LS;
9874 case LT: return AARCH64_MI;
9875 case NE: return AARCH64_NE;
9876 case EQ: return AARCH64_EQ;
9877 case ORDERED: return AARCH64_VC;
9878 case UNORDERED: return AARCH64_VS;
9879 case UNLT: return AARCH64_LT;
9880 case UNLE: return AARCH64_LE;
9881 case UNGT: return AARCH64_HI;
9882 case UNGE: return AARCH64_PL;
9883 default: return -1;
9884 }
9885 break;
9886
9887 case E_CCmode:
9888 switch (comp_code)
9889 {
9890 case NE: return AARCH64_NE;
9891 case EQ: return AARCH64_EQ;
9892 case GE: return AARCH64_GE;
9893 case GT: return AARCH64_GT;
9894 case LE: return AARCH64_LE;
9895 case LT: return AARCH64_LT;
9896 case GEU: return AARCH64_CS;
9897 case GTU: return AARCH64_HI;
9898 case LEU: return AARCH64_LS;
9899 case LTU: return AARCH64_CC;
9900 default: return -1;
9901 }
9902 break;
9903
9904 case E_CC_SWPmode:
9905 switch (comp_code)
9906 {
9907 case NE: return AARCH64_NE;
9908 case EQ: return AARCH64_EQ;
9909 case GE: return AARCH64_LE;
9910 case GT: return AARCH64_LT;
9911 case LE: return AARCH64_GE;
9912 case LT: return AARCH64_GT;
9913 case GEU: return AARCH64_LS;
9914 case GTU: return AARCH64_CC;
9915 case LEU: return AARCH64_CS;
9916 case LTU: return AARCH64_HI;
9917 default: return -1;
9918 }
9919 break;
9920
9921 case E_CC_NZCmode:
9922 switch (comp_code)
9923 {
9924 case NE: return AARCH64_NE; /* = any */
9925 case EQ: return AARCH64_EQ; /* = none */
9926 case GE: return AARCH64_PL; /* = nfrst */
9927 case LT: return AARCH64_MI; /* = first */
9928 case GEU: return AARCH64_CS; /* = nlast */
9929 case GTU: return AARCH64_HI; /* = pmore */
9930 case LEU: return AARCH64_LS; /* = plast */
9931 case LTU: return AARCH64_CC; /* = last */
9932 default: return -1;
9933 }
9934 break;
9935
9936 case E_CC_NZmode:
9937 switch (comp_code)
9938 {
9939 case NE: return AARCH64_NE;
9940 case EQ: return AARCH64_EQ;
9941 case GE: return AARCH64_PL;
9942 case LT: return AARCH64_MI;
9943 default: return -1;
9944 }
9945 break;
9946
9947 case E_CC_Zmode:
9948 switch (comp_code)
9949 {
9950 case NE: return AARCH64_NE;
9951 case EQ: return AARCH64_EQ;
9952 default: return -1;
9953 }
9954 break;
9955
9956 case E_CC_Cmode:
9957 switch (comp_code)
9958 {
9959 case LTU: return AARCH64_CS;
9960 case GEU: return AARCH64_CC;
9961 default: return -1;
9962 }
9963 break;
9964
9965 case E_CC_ADCmode:
9966 switch (comp_code)
9967 {
9968 case GEU: return AARCH64_CS;
9969 case LTU: return AARCH64_CC;
9970 default: return -1;
9971 }
9972 break;
9973
9974 case E_CC_Vmode:
9975 switch (comp_code)
9976 {
9977 case NE: return AARCH64_VS;
9978 case EQ: return AARCH64_VC;
9979 default: return -1;
9980 }
9981 break;
9982
9983 default:
9984 return -1;
9985 }
9986
9987 return -1;
9988 }
9989
9990 bool
9991 aarch64_const_vec_all_same_in_range_p (rtx x,
9992 HOST_WIDE_INT minval,
9993 HOST_WIDE_INT maxval)
9994 {
9995 rtx elt;
9996 return (const_vec_duplicate_p (x, &elt)
9997 && CONST_INT_P (elt)
9998 && IN_RANGE (INTVAL (elt), minval, maxval));
9999 }
10000
10001 bool
10002 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10003 {
10004 return aarch64_const_vec_all_same_in_range_p (x, val, val);
10005 }
10006
10007 /* Return true if VEC is a constant in which every element is in the range
10008 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
10009
10010 static bool
10011 aarch64_const_vec_all_in_range_p (rtx vec,
10012 HOST_WIDE_INT minval,
10013 HOST_WIDE_INT maxval)
10014 {
10015 if (GET_CODE (vec) != CONST_VECTOR
10016 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10017 return false;
10018
10019 int nunits;
10020 if (!CONST_VECTOR_STEPPED_P (vec))
10021 nunits = const_vector_encoded_nelts (vec);
10022 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10023 return false;
10024
10025 for (int i = 0; i < nunits; i++)
10026 {
10027 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10028 if (!CONST_INT_P (vec_elem)
10029 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10030 return false;
10031 }
10032 return true;
10033 }
10034
10035 /* N Z C V. */
10036 #define AARCH64_CC_V 1
10037 #define AARCH64_CC_C (1 << 1)
10038 #define AARCH64_CC_Z (1 << 2)
10039 #define AARCH64_CC_N (1 << 3)
10040
10041 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
10042 static const int aarch64_nzcv_codes[] =
10043 {
10044 0, /* EQ, Z == 1. */
10045 AARCH64_CC_Z, /* NE, Z == 0. */
10046 0, /* CS, C == 1. */
10047 AARCH64_CC_C, /* CC, C == 0. */
10048 0, /* MI, N == 1. */
10049 AARCH64_CC_N, /* PL, N == 0. */
10050 0, /* VS, V == 1. */
10051 AARCH64_CC_V, /* VC, V == 0. */
10052   0,		/* HI, C == 1 && Z == 0.  */
10053 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
10054 AARCH64_CC_V, /* GE, N == V. */
10055 0, /* LT, N != V. */
10056 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
10057 0, /* LE, !(Z == 0 && N == V). */
10058 0, /* AL, Any. */
10059 0 /* NV, Any. */
10060 };
10061
10062 /* Print floating-point vector immediate operand X to F, negating it
10063 first if NEGATE is true. Return true on success, false if it isn't
10064 a constant we can handle. */
10065
10066 static bool
10067 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10068 {
10069 rtx elt;
10070
10071 if (!const_vec_duplicate_p (x, &elt))
10072 return false;
10073
10074 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10075 if (negate)
10076 r = real_value_negate (&r);
10077
10078 /* Handle the SVE single-bit immediates specially, since they have a
10079 fixed form in the assembly syntax. */
10080 if (real_equal (&r, &dconst0))
10081 asm_fprintf (f, "0.0");
10082 else if (real_equal (&r, &dconst2))
10083 asm_fprintf (f, "2.0");
10084 else if (real_equal (&r, &dconst1))
10085 asm_fprintf (f, "1.0");
10086 else if (real_equal (&r, &dconsthalf))
10087 asm_fprintf (f, "0.5");
10088 else
10089 {
10090 const int buf_size = 20;
10091 char float_buf[buf_size] = {'\0'};
10092 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10093 1, GET_MODE (elt));
10094 asm_fprintf (f, "%s", float_buf);
10095 }
10096
10097 return true;
10098 }
10099
10100 /* Return the equivalent letter for size. */
10101 static char
10102 sizetochar (int size)
10103 {
10104 switch (size)
10105 {
10106 case 64: return 'd';
10107 case 32: return 's';
10108 case 16: return 'h';
10109 case 8 : return 'b';
10110 default: gcc_unreachable ();
10111 }
10112 }
10113
10114 /* Print operand X to file F in a target specific manner according to CODE.
10115 The acceptable formatting commands given by CODE are:
10116 'c': An integer or symbol address without a preceding #
10117 sign.
10118 'C': Take the duplicated element in a vector constant
10119 and print it in hex.
10120 'D': Take the duplicated element in a vector constant
10121 and print it as an unsigned integer, in decimal.
10122 'e': Print the sign/zero-extend size as a character 8->b,
10123 16->h, 32->w. Can also be used for masks:
10124 0xff->b, 0xffff->h, 0xffffffff->w.
10125 'I': If the operand is a duplicated vector constant,
10126 replace it with the duplicated scalar. If the
10127 operand is then a floating-point constant, replace
10128 it with the integer bit representation. Print the
10129 transformed constant as a signed decimal number.
10130 'p': Prints N such that 2^N == X (X must be power of 2 and
10131 const int).
10132 'P': Print the number of non-zero bits in X (a const_int).
10133 'H': Print the higher numbered register of a pair (TImode)
10134 of regs.
10135 'm': Print a condition (eq, ne, etc).
10136 'M': Same as 'm', but invert condition.
10137 'N': Take the duplicated element in a vector constant
10138 and print the negative of it in decimal.
10139 'b/h/s/d/q': Print a scalar FP/SIMD register name.
10140 'S/T/U/V': Print a FP/SIMD register name for a register list.
10141 The register printed is the FP/SIMD register name
10142 of X + 0/1/2/3 for S/T/U/V.
10143 'R': Print a scalar Integer/FP/SIMD register name + 1.
10144 'X': Print bottom 16 bits of integer constant in hex.
10145 'w/x': Print a general register name or the zero register
10146 (32-bit or 64-bit).
10147      '0':		Print a normal operand; if it's a general register,
10148 then we assume DImode.
10149 'k': Print NZCV for conditional compare instructions.
10150 'A': Output address constant representing the first
10151 argument of X, specifying a relocation offset
10152 if appropriate.
10153 'L': Output constant address specified by X
10154 with a relocation offset if appropriate.
10155 'G': Prints address of X, specifying a PC relative
10156 relocation mode if appropriate.
10157 'y': Output address of LDP or STP - this is used for
10158 some LDP/STPs which don't use a PARALLEL in their
10159 pattern (so the mode needs to be adjusted).
10160 'z': Output address of a typical LDP or STP. */
10161
10162 static void
10163 aarch64_print_operand (FILE *f, rtx x, int code)
10164 {
10165 rtx elt;
10166 switch (code)
10167 {
10168 case 'c':
10169 if (CONST_INT_P (x))
10170 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10171 else
10172 {
10173 poly_int64 offset;
10174 rtx base = strip_offset_and_salt (x, &offset);
10175 if (SYMBOL_REF_P (base))
10176 output_addr_const (f, x);
10177 else
10178 output_operand_lossage ("unsupported operand for code '%c'", code);
10179 }
10180 break;
10181
10182 case 'e':
10183 {
10184 x = unwrap_const_vec_duplicate (x);
10185 if (!CONST_INT_P (x))
10186 {
10187 output_operand_lossage ("invalid operand for '%%%c'", code);
10188 return;
10189 }
10190
10191 HOST_WIDE_INT val = INTVAL (x);
10192 if ((val & ~7) == 8 || val == 0xff)
10193 fputc ('b', f);
10194 else if ((val & ~7) == 16 || val == 0xffff)
10195 fputc ('h', f);
10196 else if ((val & ~7) == 32 || val == 0xffffffff)
10197 fputc ('w', f);
10198 else
10199 {
10200 output_operand_lossage ("invalid operand for '%%%c'", code);
10201 return;
10202 }
10203 }
10204 break;
10205
10206 case 'p':
10207 {
10208 int n;
10209
10210 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10211 {
10212 output_operand_lossage ("invalid operand for '%%%c'", code);
10213 return;
10214 }
10215
10216 asm_fprintf (f, "%d", n);
10217 }
10218 break;
10219
10220 case 'P':
10221 if (!CONST_INT_P (x))
10222 {
10223 output_operand_lossage ("invalid operand for '%%%c'", code);
10224 return;
10225 }
10226
10227 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10228 break;
10229
10230 case 'H':
10231 if (x == const0_rtx)
10232 {
10233 asm_fprintf (f, "xzr");
10234 break;
10235 }
10236
10237 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10238 {
10239 output_operand_lossage ("invalid operand for '%%%c'", code);
10240 return;
10241 }
10242
10243 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10244 break;
10245
10246 case 'I':
10247 {
10248 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10249 if (CONST_INT_P (x))
10250 asm_fprintf (f, "%wd", INTVAL (x));
10251 else
10252 {
10253 output_operand_lossage ("invalid operand for '%%%c'", code);
10254 return;
10255 }
10256 break;
10257 }
10258
10259 case 'M':
10260 case 'm':
10261 {
10262 int cond_code;
10263 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10264 if (x == const_true_rtx)
10265 {
10266 if (code == 'M')
10267 fputs ("nv", f);
10268 return;
10269 }
10270
10271 if (!COMPARISON_P (x))
10272 {
10273 output_operand_lossage ("invalid operand for '%%%c'", code);
10274 return;
10275 }
10276
10277 cond_code = aarch64_get_condition_code (x);
10278 gcc_assert (cond_code >= 0);
10279 if (code == 'M')
10280 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10281 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10282 fputs (aarch64_sve_condition_codes[cond_code], f);
10283 else
10284 fputs (aarch64_condition_codes[cond_code], f);
10285 }
10286 break;
10287
10288 case 'N':
10289 if (!const_vec_duplicate_p (x, &elt))
10290 {
10291 output_operand_lossage ("invalid vector constant");
10292 return;
10293 }
10294
10295 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10296 asm_fprintf (f, "%wd", -INTVAL (elt));
10297 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10298 && aarch64_print_vector_float_operand (f, x, true))
10299 ;
10300 else
10301 {
10302 output_operand_lossage ("invalid vector constant");
10303 return;
10304 }
10305 break;
10306
10307 case 'b':
10308 case 'h':
10309 case 's':
10310 case 'd':
10311 case 'q':
10312 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10313 {
10314 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10315 return;
10316 }
10317 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10318 break;
10319
10320 case 'S':
10321 case 'T':
10322 case 'U':
10323 case 'V':
10324 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10325 {
10326 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10327 return;
10328 }
10329 asm_fprintf (f, "%c%d",
10330 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10331 REGNO (x) - V0_REGNUM + (code - 'S'));
10332 break;
10333
10334 case 'R':
10335 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10336 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10337 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10338 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10339 else
10340 output_operand_lossage ("incompatible register operand for '%%%c'",
10341 code);
10342 break;
10343
10344 case 'X':
10345 if (!CONST_INT_P (x))
10346 {
10347 output_operand_lossage ("invalid operand for '%%%c'", code);
10348 return;
10349 }
10350 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10351 break;
10352
10353 case 'C':
10354 {
10355 /* Print a replicated constant in hex. */
10356 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10357 {
10358 output_operand_lossage ("invalid operand for '%%%c'", code);
10359 return;
10360 }
10361 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10362 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10363 }
10364 break;
10365
10366 case 'D':
10367 {
10368 /* Print a replicated constant in decimal, treating it as
10369 unsigned. */
10370 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10371 {
10372 output_operand_lossage ("invalid operand for '%%%c'", code);
10373 return;
10374 }
10375 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10376 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10377 }
10378 break;
10379
10380 case 'w':
10381 case 'x':
10382 if (x == const0_rtx
10383 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10384 {
10385 asm_fprintf (f, "%czr", code);
10386 break;
10387 }
10388
10389 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10390 {
10391 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10392 break;
10393 }
10394
10395 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10396 {
10397 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10398 break;
10399 }
10400
10401 /* Fall through */
10402
10403 case 0:
10404 if (x == NULL)
10405 {
10406 output_operand_lossage ("missing operand");
10407 return;
10408 }
10409
10410 switch (GET_CODE (x))
10411 {
10412 case REG:
10413 if (aarch64_sve_data_mode_p (GET_MODE (x)))
10414 {
10415 if (REG_NREGS (x) == 1)
10416 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10417 else
10418 {
10419 char suffix
10420 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10421 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10422 REGNO (x) - V0_REGNUM, suffix,
10423 END_REGNO (x) - V0_REGNUM - 1, suffix);
10424 }
10425 }
10426 else
10427 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10428 break;
10429
10430 case MEM:
10431 output_address (GET_MODE (x), XEXP (x, 0));
10432 break;
10433
10434 case LABEL_REF:
10435 case SYMBOL_REF:
10436 output_addr_const (asm_out_file, x);
10437 break;
10438
10439 case CONST_INT:
10440 asm_fprintf (f, "%wd", INTVAL (x));
10441 break;
10442
10443 case CONST:
10444 if (!VECTOR_MODE_P (GET_MODE (x)))
10445 {
10446 output_addr_const (asm_out_file, x);
10447 break;
10448 }
10449 /* fall through */
10450
10451 case CONST_VECTOR:
10452 if (!const_vec_duplicate_p (x, &elt))
10453 {
10454 output_operand_lossage ("invalid vector constant");
10455 return;
10456 }
10457
10458 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10459 asm_fprintf (f, "%wd", INTVAL (elt));
10460 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10461 && aarch64_print_vector_float_operand (f, x, false))
10462 ;
10463 else
10464 {
10465 output_operand_lossage ("invalid vector constant");
10466 return;
10467 }
10468 break;
10469
10470 case CONST_DOUBLE:
10471 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10472 be getting CONST_DOUBLEs holding integers. */
10473 gcc_assert (GET_MODE (x) != VOIDmode);
10474 if (aarch64_float_const_zero_rtx_p (x))
10475 {
10476 fputc ('0', f);
10477 break;
10478 }
10479 else if (aarch64_float_const_representable_p (x))
10480 {
10481 #define buf_size 20
10482 char float_buf[buf_size] = {'\0'};
10483 real_to_decimal_for_mode (float_buf,
10484 CONST_DOUBLE_REAL_VALUE (x),
10485 buf_size, buf_size,
10486 1, GET_MODE (x));
10487 asm_fprintf (asm_out_file, "%s", float_buf);
10488 break;
10489 #undef buf_size
10490 }
10491 output_operand_lossage ("invalid constant");
10492 return;
10493 default:
10494 output_operand_lossage ("invalid operand");
10495 return;
10496 }
10497 break;
10498
10499 case 'A':
10500 if (GET_CODE (x) == HIGH)
10501 x = XEXP (x, 0);
10502
10503 switch (aarch64_classify_symbolic_expression (x))
10504 {
10505 case SYMBOL_SMALL_GOT_4G:
10506 asm_fprintf (asm_out_file, ":got:");
10507 break;
10508
10509 case SYMBOL_SMALL_TLSGD:
10510 asm_fprintf (asm_out_file, ":tlsgd:");
10511 break;
10512
10513 case SYMBOL_SMALL_TLSDESC:
10514 asm_fprintf (asm_out_file, ":tlsdesc:");
10515 break;
10516
10517 case SYMBOL_SMALL_TLSIE:
10518 asm_fprintf (asm_out_file, ":gottprel:");
10519 break;
10520
10521 case SYMBOL_TLSLE24:
10522 asm_fprintf (asm_out_file, ":tprel:");
10523 break;
10524
10525 case SYMBOL_TINY_GOT:
10526 gcc_unreachable ();
10527 break;
10528
10529 default:
10530 break;
10531 }
10532 output_addr_const (asm_out_file, x);
10533 break;
10534
10535 case 'L':
10536 switch (aarch64_classify_symbolic_expression (x))
10537 {
10538 case SYMBOL_SMALL_GOT_4G:
10539 asm_fprintf (asm_out_file, ":lo12:");
10540 break;
10541
10542 case SYMBOL_SMALL_TLSGD:
10543 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10544 break;
10545
10546 case SYMBOL_SMALL_TLSDESC:
10547 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10548 break;
10549
10550 case SYMBOL_SMALL_TLSIE:
10551 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10552 break;
10553
10554 case SYMBOL_TLSLE12:
10555 asm_fprintf (asm_out_file, ":tprel_lo12:");
10556 break;
10557
10558 case SYMBOL_TLSLE24:
10559 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10560 break;
10561
10562 case SYMBOL_TINY_GOT:
10563 asm_fprintf (asm_out_file, ":got:");
10564 break;
10565
10566 case SYMBOL_TINY_TLSIE:
10567 asm_fprintf (asm_out_file, ":gottprel:");
10568 break;
10569
10570 default:
10571 break;
10572 }
10573 output_addr_const (asm_out_file, x);
10574 break;
10575
10576 case 'G':
10577 switch (aarch64_classify_symbolic_expression (x))
10578 {
10579 case SYMBOL_TLSLE24:
10580 asm_fprintf (asm_out_file, ":tprel_hi12:");
10581 break;
10582 default:
10583 break;
10584 }
10585 output_addr_const (asm_out_file, x);
10586 break;
10587
10588 case 'k':
10589 {
10590 HOST_WIDE_INT cond_code;
10591
10592 if (!CONST_INT_P (x))
10593 {
10594 output_operand_lossage ("invalid operand for '%%%c'", code);
10595 return;
10596 }
10597
10598 cond_code = INTVAL (x);
10599 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10600 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10601 }
10602 break;
10603
10604 case 'y':
10605 case 'z':
10606 {
10607 machine_mode mode = GET_MODE (x);
10608
10609 if (!MEM_P (x)
10610 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10611 {
10612 output_operand_lossage ("invalid operand for '%%%c'", code);
10613 return;
10614 }
10615
10616 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10617 code == 'y'
10618 ? ADDR_QUERY_LDP_STP_N
10619 : ADDR_QUERY_LDP_STP))
10620 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10621 }
10622 break;
10623
10624 default:
10625 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10626 return;
10627 }
10628 }
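
/* Illustrative example (editorial sketch, not taken from aarch64.md):
   the modifiers documented above are referenced from output templates,
   so a hypothetical pattern emitting a 32-bit add could use the
   template

	"add\t%w0, %w1, %w2"

   where each %w operand goes through the 'w' case above and prints a
   W register, wzr or wsp as appropriate.  */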
10629
10630 /* Print address 'x' of a memory access with mode 'mode'.
10631 'type' is the aarch64_addr_query_type context required by
10632 aarch64_classify_address; it distinguishes normal accesses from LDP/STP. */
10633 static bool
10634 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10635 aarch64_addr_query_type type)
10636 {
10637 struct aarch64_address_info addr;
10638 unsigned int size, vec_flags;
10639
10640 /* Check all addresses are Pmode - including ILP32. */
10641 if (GET_MODE (x) != Pmode
10642 && (!CONST_INT_P (x)
10643 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10644 {
10645 output_operand_lossage ("invalid address mode");
10646 return false;
10647 }
10648
10649 if (aarch64_classify_address (&addr, x, mode, true, type))
10650 switch (addr.type)
10651 {
10652 case ADDRESS_REG_IMM:
10653 if (known_eq (addr.const_offset, 0))
10654 {
10655 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10656 return true;
10657 }
10658
10659 vec_flags = aarch64_classify_vector_mode (mode);
10660 if (vec_flags & VEC_ANY_SVE)
10661 {
10662 HOST_WIDE_INT vnum
10663 = exact_div (addr.const_offset,
10664 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10665 asm_fprintf (f, "[%s, #%wd, mul vl]",
10666 reg_names[REGNO (addr.base)], vnum);
10667 return true;
10668 }
10669
10670 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10671 INTVAL (addr.offset));
10672 return true;
10673
10674 case ADDRESS_REG_REG:
10675 if (addr.shift == 0)
10676 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10677 reg_names [REGNO (addr.offset)]);
10678 else
10679 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10680 reg_names [REGNO (addr.offset)], addr.shift);
10681 return true;
10682
10683 case ADDRESS_REG_UXTW:
10684 if (addr.shift == 0)
10685 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10686 REGNO (addr.offset) - R0_REGNUM);
10687 else
10688 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10689 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10690 return true;
10691
10692 case ADDRESS_REG_SXTW:
10693 if (addr.shift == 0)
10694 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10695 REGNO (addr.offset) - R0_REGNUM);
10696 else
10697 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10698 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10699 return true;
10700
10701 case ADDRESS_REG_WB:
10702 /* Writeback is only supported for fixed-width modes. */
10703 size = GET_MODE_SIZE (mode).to_constant ();
10704 switch (GET_CODE (x))
10705 {
10706 case PRE_INC:
10707 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10708 return true;
10709 case POST_INC:
10710 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10711 return true;
10712 case PRE_DEC:
10713 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10714 return true;
10715 case POST_DEC:
10716 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10717 return true;
10718 case PRE_MODIFY:
10719 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10720 INTVAL (addr.offset));
10721 return true;
10722 case POST_MODIFY:
10723 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10724 INTVAL (addr.offset));
10725 return true;
10726 default:
10727 break;
10728 }
10729 break;
10730
10731 case ADDRESS_LO_SUM:
10732 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10733 output_addr_const (f, addr.offset);
10734 asm_fprintf (f, "]");
10735 return true;
10736
10737 case ADDRESS_SYMBOLIC:
10738 output_addr_const (f, x);
10739 return true;
10740 }
10741
10742 return false;
10743 }
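
/* Illustrative examples (editorial sketch) of the syntax produced by
   the cases above, with hypothetical registers and offsets:

	ADDRESS_REG_IMM			[x0]  or  [x0, 16]
	ADDRESS_REG_IMM (SVE)		[x0, #2, mul vl]
	ADDRESS_REG_REG			[x0, x1, lsl 3]
	ADDRESS_REG_UXTW		[x0, w1, uxtw 2]
	ADDRESS_REG_WB (POST_INC)	[x0], 16
	ADDRESS_LO_SUM			[x0, #:lo12:symbol]  */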
10744
10745 /* Print address 'x' of a memory access with mode 'mode'. */
10746 static void
10747 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10748 {
10749 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10750 output_addr_const (f, x);
10751 }
10752
10753 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
10754
10755 static bool
10756 aarch64_output_addr_const_extra (FILE *file, rtx x)
10757 {
10758 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
10759 {
10760 output_addr_const (file, XVECEXP (x, 0, 0));
10761 return true;
10762 }
10763 return false;
10764 }
10765
10766 bool
10767 aarch64_label_mentioned_p (rtx x)
10768 {
10769 const char *fmt;
10770 int i;
10771
10772 if (LABEL_REF_P (x))
10773 return true;
10774
10775 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10776 referencing instruction, but they are constant offsets, not
10777 symbols. */
10778 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10779 return false;
10780
10781 fmt = GET_RTX_FORMAT (GET_CODE (x));
10782 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10783 {
10784 if (fmt[i] == 'E')
10785 {
10786 int j;
10787
10788 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10789 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10790 return 1;
10791 }
10792 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10793 return 1;
10794 }
10795
10796 return 0;
10797 }
10798
10799 /* Implement REGNO_REG_CLASS. */
10800
10801 enum reg_class
10802 aarch64_regno_regclass (unsigned regno)
10803 {
10804 if (STUB_REGNUM_P (regno))
10805 return STUB_REGS;
10806
10807 if (GP_REGNUM_P (regno))
10808 return GENERAL_REGS;
10809
10810 if (regno == SP_REGNUM)
10811 return STACK_REG;
10812
10813 if (regno == FRAME_POINTER_REGNUM
10814 || regno == ARG_POINTER_REGNUM)
10815 return POINTER_REGS;
10816
10817 if (FP_REGNUM_P (regno))
10818 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10819 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10820
10821 if (PR_REGNUM_P (regno))
10822 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10823
10824 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10825 return FFR_REGS;
10826
10827 return NO_REGS;
10828 }
10829
10830 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10831 If OFFSET is out of range, return an offset of an anchor point
10832 that is in range. Return 0 otherwise. */
10833
10834 static HOST_WIDE_INT
10835 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10836 machine_mode mode)
10837 {
10838 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10839 if (size > 16)
10840 return (offset + 0x400) & ~0x7f0;
10841
10842 /* For offsets that aren't a multiple of the access size, the limit is
10843 -256...255. */
10844 if (offset & (size - 1))
10845 {
10846 /* BLKmode typically uses LDP of X-registers. */
10847 if (mode == BLKmode)
10848 return (offset + 512) & ~0x3ff;
10849 return (offset + 0x100) & ~0x1ff;
10850 }
10851
10852 /* Small negative offsets are supported. */
10853 if (IN_RANGE (offset, -256, 0))
10854 return 0;
10855
10856 if (mode == TImode || mode == TFmode)
10857 return (offset + 0x100) & ~0x1ff;
10858
10859 /* Use a 12-bit offset scaled by the access size. */
10860 return offset & (~0xfff * size);
10861 }
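
/* Worked example (editorial sketch): for a 4-byte access at offset
   0x13456, the offset is a multiple of the access size, is not small
   and negative, and the mode is not TImode/TFmode, so the function
   above returns 0x13456 & (~0xfff * 4) = 0x10000; the residual offset
   0x3456 then fits the scaled unsigned 12-bit LDR/STR range.  For an
   unaligned offset such as 0x205 with size 4 it instead returns
   (0x205 + 0x100) & ~0x1ff = 0x200, leaving a residual of 5 inside
   the -256..255 unscaled range.  */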
10862
10863 static rtx
10864 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10865 {
10866 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10867 where mask is selected by alignment and size of the offset.
10868 We try to pick as large a range for the offset as possible to
10869 maximize the chance of a CSE. However, for aligned addresses
10870 we limit the range to 4k so that structures with different sized
10871 elements are likely to use the same base. We need to be careful
10872 not to split a CONST for some forms of address expression, otherwise
10873 it will generate sub-optimal code. */
10874
10875 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10876 {
10877 rtx base = XEXP (x, 0);
10878 rtx offset_rtx = XEXP (x, 1);
10879 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10880
10881 if (GET_CODE (base) == PLUS)
10882 {
10883 rtx op0 = XEXP (base, 0);
10884 rtx op1 = XEXP (base, 1);
10885
10886 /* Force any scaling into a temp for CSE. */
10887 op0 = force_reg (Pmode, op0);
10888 op1 = force_reg (Pmode, op1);
10889
10890 /* Let the pointer register be in op0. */
10891 if (REG_POINTER (op1))
10892 std::swap (op0, op1);
10893
10894 /* If the pointer is virtual or frame related, then we know that
10895 virtual register instantiation or register elimination is going
10896 to apply a second constant. We want the two constants folded
10897 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10898 if (virt_or_elim_regno_p (REGNO (op0)))
10899 {
10900 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10901 NULL_RTX, true, OPTAB_DIRECT);
10902 return gen_rtx_PLUS (Pmode, base, op1);
10903 }
10904
10905 /* Otherwise, in order to encourage CSE (and thence loop strength
10906 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10907 base = expand_binop (Pmode, add_optab, op0, op1,
10908 NULL_RTX, true, OPTAB_DIRECT);
10909 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10910 }
10911
10912 HOST_WIDE_INT size;
10913 if (GET_MODE_SIZE (mode).is_constant (&size))
10914 {
10915 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10916 mode);
10917 if (base_offset != 0)
10918 {
10919 base = plus_constant (Pmode, base, base_offset);
10920 base = force_operand (base, NULL_RTX);
10921 return plus_constant (Pmode, base, offset - base_offset);
10922 }
10923 }
10924 }
10925
10926 return x;
10927 }
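
/* Illustrative example (editorial sketch): legitimizing
   (plus (reg X) (const_int 0x13456)) for an SImode access splits the
   constant at the anchor chosen by aarch64_anchor_offset, emitting

	tmp = X + 0x10000

   and returning (plus tmp 0x3456), so that nearby accesses from the
   same base register can CSE the anchor computation.  */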
10928
10929 static reg_class_t
10930 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10931 reg_class_t rclass,
10932 machine_mode mode,
10933 secondary_reload_info *sri)
10934 {
10935 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10936 LDR and STR. See the comment at the head of aarch64-sve.md for
10937 more details about the big-endian handling. */
10938 if (reg_class_subset_p (rclass, FP_REGS)
10939 && !((REG_P (x) && HARD_REGISTER_P (x))
10940 || aarch64_simd_valid_immediate (x, NULL))
10941 && mode != VNx16QImode)
10942 {
10943 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10944 if ((vec_flags & VEC_SVE_DATA)
10945 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10946 {
10947 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10948 return NO_REGS;
10949 }
10950 }
10951
10952 /* If we have to disable direct literal pool loads and stores because the
10953 function is too big, then we need a scratch register. */
10954 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
10955 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10956 || targetm.vector_mode_supported_p (GET_MODE (x)))
10957 && !aarch64_pcrelative_literal_loads)
10958 {
10959 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10960 return NO_REGS;
10961 }
10962
10963 /* Without the TARGET_SIMD instructions we cannot move a Q register
10964 to a Q register directly. We need a scratch. */
10965 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10966 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10967 && reg_class_subset_p (rclass, FP_REGS))
10968 {
10969 sri->icode = code_for_aarch64_reload_mov (mode);
10970 return NO_REGS;
10971 }
10972
10973 /* A TFmode or TImode memory access should be handled via FP_REGS,
10974 because AArch64 has richer addressing modes for LDR/STR instructions
10975 than for LDP/STP instructions. */
10976 if (TARGET_FLOAT && rclass == GENERAL_REGS
10977 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10978 return FP_REGS;
10979
10980 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10981 return GENERAL_REGS;
10982
10983 return NO_REGS;
10984 }
10985
10986 static bool
10987 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10988 {
10989 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10990
10991 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10992 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10993 if (frame_pointer_needed)
10994 return to == HARD_FRAME_POINTER_REGNUM;
10995 return true;
10996 }
10997
10998 poly_int64
10999 aarch64_initial_elimination_offset (unsigned from, unsigned to)
11000 {
11001 if (to == HARD_FRAME_POINTER_REGNUM)
11002 {
11003 if (from == ARG_POINTER_REGNUM)
11004 return cfun->machine->frame.hard_fp_offset;
11005
11006 if (from == FRAME_POINTER_REGNUM)
11007 return cfun->machine->frame.hard_fp_offset
11008 - cfun->machine->frame.locals_offset;
11009 }
11010
11011 if (to == STACK_POINTER_REGNUM)
11012 {
11013 if (from == FRAME_POINTER_REGNUM)
11014 return cfun->machine->frame.frame_size
11015 - cfun->machine->frame.locals_offset;
11016 }
11017
11018 return cfun->machine->frame.frame_size;
11019 }
11020
11021
11022 /* Get return address without mangling. */
11023
11024 rtx
11025 aarch64_return_addr_rtx (void)
11026 {
11027 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11028 /* Note: aarch64_return_address_signing_enabled only
11029 works after cfun->machine->frame.laid_out is set,
11030 so here we don't know if the return address will
11031 be signed or not. */
11032 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11033 emit_move_insn (lr, val);
11034 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11035 return lr;
11036 }
11037
11038
11039 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
11040 previous frame. */
11041
11042 rtx
11043 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11044 {
11045 if (count != 0)
11046 return const0_rtx;
11047 return aarch64_return_addr_rtx ();
11048 }
11049
11050 static void
11051 aarch64_asm_trampoline_template (FILE *f)
11052 {
11053 /* Even if the current function doesn't have branch protection, some
11054 later function might, so since this template is only generated once
11055 we have to add a BTI just in case. */
11056 asm_fprintf (f, "\thint\t34 // bti c\n");
11057
11058 if (TARGET_ILP32)
11059 {
11060 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
11061 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
11062 }
11063 else
11064 {
11065 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
11066 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
11067 }
11068 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
11069
11070 /* We always emit a speculation barrier.
11071 This is because the same trampoline template is used for every nested
11072 function. Since nested functions are not particularly common or
11073 performance-critical, we don't worry too much about the extra
11074 instructions needed to copy it around.
11075 This is not yet a problem, since we have not yet implemented
11076 function-specific attributes to choose between hardening against
11077 straight-line speculation or not, but such attributes are likely to
11078 appear in the future. */
11079 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11080
11081 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11082 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11083 }
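
/* Editorial sketch of what the template above expands to in the
   default (non-ILP32) case, where IP1 is x17 and the static chain
   register is x18:

	hint	34		// bti c
	ldr	x17, .+20	// target function address
	ldr	x18, .+24	// static chain value
	br	x17
	dsb	sy
	isb
	.xword	0		// filled in by aarch64_trampoline_init
	.xword	0

   i.e. 24 bytes of code followed by two pointer-sized data slots.  */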
11084
11085 static void
11086 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11087 {
11088 rtx fnaddr, mem, a_tramp;
11089 const int tramp_code_sz = 24;
11090
11091 /* Don't need to copy the trailing D-words, we fill those in below. */
11092 /* We create our own memory address in Pmode so that `emit_block_move` can
11093 use parts of the backend which expect Pmode addresses. */
11094 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11095 emit_block_move (gen_rtx_MEM (BLKmode, temp),
11096 assemble_trampoline_template (),
11097 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11098 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
11099 fnaddr = XEXP (DECL_RTL (fndecl), 0);
11100 if (GET_MODE (fnaddr) != ptr_mode)
11101 fnaddr = convert_memory_address (ptr_mode, fnaddr);
11102 emit_move_insn (mem, fnaddr);
11103
11104 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
11105 emit_move_insn (mem, chain_value);
11106
11107 /* XXX We should really define a "clear_cache" pattern and use
11108 gen_clear_cache(). */
11109 a_tramp = XEXP (m_tramp, 0);
11110 maybe_emit_call_builtin___clear_cache (a_tramp,
11111 plus_constant (ptr_mode,
11112 a_tramp,
11113 TRAMPOLINE_SIZE));
11114 }
11115
11116 static unsigned char
11117 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
11118 {
11119 /* ??? Logically we should only need to provide a value when
11120 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11121 can hold MODE, but at the moment we need to handle all modes.
11122 Just ignore any runtime parts for registers that can't store them. */
11123 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
11124 unsigned int nregs, vec_flags;
11125 switch (regclass)
11126 {
11127 case STUB_REGS:
11128 case TAILCALL_ADDR_REGS:
11129 case POINTER_REGS:
11130 case GENERAL_REGS:
11131 case ALL_REGS:
11132 case POINTER_AND_FP_REGS:
11133 case FP_REGS:
11134 case FP_LO_REGS:
11135 case FP_LO8_REGS:
11136 vec_flags = aarch64_classify_vector_mode (mode);
11137 if ((vec_flags & VEC_SVE_DATA)
11138 && constant_multiple_p (GET_MODE_SIZE (mode),
11139 aarch64_vl_bytes (mode, vec_flags), &nregs))
11140 return nregs;
11141 return (vec_flags & VEC_ADVSIMD
11142 ? CEIL (lowest_size, UNITS_PER_VREG)
11143 : CEIL (lowest_size, UNITS_PER_WORD));
11144 case STACK_REG:
11145 case PR_REGS:
11146 case PR_LO_REGS:
11147 case PR_HI_REGS:
11148 case FFR_REGS:
11149 case PR_AND_FFR_REGS:
11150 return 1;
11151
11152 case NO_REGS:
11153 return 0;
11154
11155 default:
11156 break;
11157 }
11158 gcc_unreachable ();
11159 }
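
/* Worked examples (editorial sketch): TImode in GENERAL_REGS needs
   CEIL (16, UNITS_PER_WORD) = 2 registers; V4SImode in FP_REGS needs
   CEIL (16, UNITS_PER_VREG) = 1; an SVE data mode such as VNx4SImode
   is an exact multiple (1) of the vector length in bytes, so the
   constant_multiple_p path above returns 1.  */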
11160
11161 static reg_class_t
11162 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
11163 {
11164 if (regclass == POINTER_REGS)
11165 return GENERAL_REGS;
11166
11167 if (regclass == STACK_REG)
11168 {
11169 if (REG_P(x)
11170 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11171 return regclass;
11172
11173 return NO_REGS;
11174 }
11175
11176 /* Register elimination can result in a request for
11177 SP+constant->FP_REGS. We cannot support such operations, which
11178 use SP as the source and an FP_REG as the destination, so reject
11179 them right away. */
11180 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11181 {
11182 rtx lhs = XEXP (x, 0);
11183
11184 /* Look through a possible SUBREG introduced by ILP32. */
11185 if (SUBREG_P (lhs))
11186 lhs = SUBREG_REG (lhs);
11187
11188 gcc_assert (REG_P (lhs));
11189 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11190 POINTER_REGS));
11191 return NO_REGS;
11192 }
11193
11194 return regclass;
11195 }
11196
11197 void
11198 aarch64_asm_output_labelref (FILE* f, const char *name)
11199 {
11200 asm_fprintf (f, "%U%s", name);
11201 }
11202
11203 static void
11204 aarch64_elf_asm_constructor (rtx symbol, int priority)
11205 {
11206 if (priority == DEFAULT_INIT_PRIORITY)
11207 default_ctor_section_asm_out_constructor (symbol, priority);
11208 else
11209 {
11210 section *s;
11211 /* While priority is known to be in the range [0, 65535], so that 18
11212 bytes would be enough, the compiler might not know that. To avoid
11213 a -Wformat-truncation false positive, use a larger size. */
11214 char buf[23];
11215 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11216 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11217 switch_to_section (s);
11218 assemble_align (POINTER_SIZE);
11219 assemble_aligned_integer (POINTER_BYTES, symbol);
11220 }
11221 }
11222
11223 static void
11224 aarch64_elf_asm_destructor (rtx symbol, int priority)
11225 {
11226 if (priority == DEFAULT_INIT_PRIORITY)
11227 default_dtor_section_asm_out_destructor (symbol, priority);
11228 else
11229 {
11230 section *s;
11231 /* While priority is known to be in the range [0, 65535], so that 18
11232 bytes would be enough, the compiler might not know that. To avoid
11233 a -Wformat-truncation false positive, use a larger size. */
11234 char buf[23];
11235 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11236 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11237 switch_to_section (s);
11238 assemble_align (POINTER_SIZE);
11239 assemble_aligned_integer (POINTER_BYTES, symbol);
11240 }
11241 }
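
/* Illustrative example (editorial sketch): a constructor with
   priority 42 is emitted into a section named ".init_array.00042"
   (and a destructor into ".fini_array.00042"), so that the entries
   can be ordered by the zero-padded priority suffix at link time.  */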
11242
11243 const char*
11244 aarch64_output_casesi (rtx *operands)
11245 {
11246 char buf[100];
11247 char label[100];
11248 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11249 int index;
11250 static const char *const patterns[4][2] =
11251 {
11252 {
11253 "ldrb\t%w3, [%0,%w1,uxtw]",
11254 "add\t%3, %4, %w3, sxtb #2"
11255 },
11256 {
11257 "ldrh\t%w3, [%0,%w1,uxtw #1]",
11258 "add\t%3, %4, %w3, sxth #2"
11259 },
11260 {
11261 "ldr\t%w3, [%0,%w1,uxtw #2]",
11262 "add\t%3, %4, %w3, sxtw #2"
11263 },
11264 /* We assume that DImode is only generated when not optimizing and
11265 that we don't really need 64-bit address offsets. That would
11266 imply an object file with 8GB of code in a single function! */
11267 {
11268 "ldr\t%w3, [%0,%w1,uxtw #2]",
11269 "add\t%3, %4, %w3, sxtw #2"
11270 }
11271 };
11272
11273 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11274
11275 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11276 index = exact_log2 (GET_MODE_SIZE (mode));
11277
11278 gcc_assert (index >= 0 && index <= 3);
11279
11280 /* Need to implement table size reduction, by changing the code below. */
11281 output_asm_insn (patterns[index][0], operands);
11282 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11283 snprintf (buf, sizeof (buf),
11284 "adr\t%%4, %s", targetm.strip_name_encoding (label));
11285 output_asm_insn (buf, operands);
11286 output_asm_insn (patterns[index][1], operands);
11287 output_asm_insn ("br\t%3", operands);
11288 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11289 operands);
11290 assemble_label (asm_out_file, label);
11291 return "";
11292 }
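
/* Illustrative example (editorial sketch, hypothetical registers and
   label): for a dispatch table of HImode entries (index 1 above) the
   emitted sequence looks like

	ldrh	w3, [x0,w1,uxtw #1]
	adr	x4, .Lrtx4
	add	x3, x4, w3, sxth #2
	br	x3

   optionally followed by an SLS speculation barrier, with .Lrtx4
   placed immediately afterwards; the branch target is thus the
   address of .Lrtx4 plus the sign-extended table entry shifted left
   by two.  */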
11293
11294
11295 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11296 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11297 operator. */
11298
11299 int
11300 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11301 {
11302 if (shift >= 0 && shift <= 3)
11303 {
11304 int size;
11305 for (size = 8; size <= 32; size *= 2)
11306 {
11307 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11308 if (mask == bits << shift)
11309 return size;
11310 }
11311 }
11312 return 0;
11313 }
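
/* Worked example (editorial sketch): aarch64_uxt_size (2, 0x3fc)
   returns 8, because 0xff << 2 == 0x3fc, so (x << 2) & 0x3fc matches
   an extended-register UXTB with a left shift of 2.  A mask that is
   not a shifted 8/16/32-bit block, e.g. 0x7fc, makes it return 0.  */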
11314
11315 /* Constant pools are per-function only when PC-relative
11316 literal loads are enabled or we are using the large memory
11317 model. */
11318
11319 static inline bool
11320 aarch64_can_use_per_function_literal_pools_p (void)
11321 {
11322 return (aarch64_pcrelative_literal_loads
11323 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11324 }
11325
11326 static bool
11327 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11328 {
11329 /* We can't use blocks for constants when we're using a per-function
11330 constant pool. */
11331 return !aarch64_can_use_per_function_literal_pools_p ();
11332 }
11333
11334 /* Select appropriate section for constants depending
11335 on where we place literal pools. */
11336
11337 static section *
11338 aarch64_select_rtx_section (machine_mode mode,
11339 rtx x,
11340 unsigned HOST_WIDE_INT align)
11341 {
11342 if (aarch64_can_use_per_function_literal_pools_p ())
11343 return function_section (current_function_decl);
11344
11345 return default_elf_select_rtx_section (mode, x, align);
11346 }
11347
11348 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11349 void
11350 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11351 HOST_WIDE_INT offset)
11352 {
11353 /* When using per-function literal pools, we must ensure that any code
11354 section is aligned to the minimal instruction length, lest we get
11355 errors from the assembler re "unaligned instructions". */
11356 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11357 ASM_OUTPUT_ALIGN (f, 2);
11358 }
11359
11360 /* Costs. */
11361
11362 /* Helper function for rtx cost calculation. Strip a shift expression
11363 from X. Returns the inner operand if successful, or the original
11364 expression on failure. */
11365 static rtx
11366 aarch64_strip_shift (rtx x)
11367 {
11368 rtx op = x;
11369
11370 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11371 we can convert both to ROR during final output. */
11372 if ((GET_CODE (op) == ASHIFT
11373 || GET_CODE (op) == ASHIFTRT
11374 || GET_CODE (op) == LSHIFTRT
11375 || GET_CODE (op) == ROTATERT
11376 || GET_CODE (op) == ROTATE)
11377 && CONST_INT_P (XEXP (op, 1)))
11378 return XEXP (op, 0);
11379
11380 if (GET_CODE (op) == MULT
11381 && CONST_INT_P (XEXP (op, 1))
11382 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11383 return XEXP (op, 0);
11384
11385 return x;
11386 }
11387
11388 /* Helper function for rtx cost calculation. Strip an extend
11389 expression from X. Returns the inner operand if successful, or the
11390 original expression on failure. We deal with a number of possible
11391 canonicalization variations here. If STRIP_SHIFT is true, then
11392 we can strip off a shift also. */
11393 static rtx
11394 aarch64_strip_extend (rtx x, bool strip_shift)
11395 {
11396 scalar_int_mode mode;
11397 rtx op = x;
11398
11399 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11400 return op;
11401
11402 if (GET_CODE (op) == AND
11403 && GET_CODE (XEXP (op, 0)) == MULT
11404 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11405 && CONST_INT_P (XEXP (op, 1))
11406 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11407 INTVAL (XEXP (op, 1))) != 0)
11408 return XEXP (XEXP (op, 0), 0);
11409
11410 /* Now handle extended register, as this may also have an optional
11411 left shift by 1..4. */
11412 if (strip_shift
11413 && GET_CODE (op) == ASHIFT
11414 && CONST_INT_P (XEXP (op, 1))
11415 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11416 op = XEXP (op, 0);
11417
11418 if (GET_CODE (op) == ZERO_EXTEND
11419 || GET_CODE (op) == SIGN_EXTEND)
11420 op = XEXP (op, 0);
11421
11422 if (op != x)
11423 return op;
11424
11425 return x;
11426 }
11427
11428 /* Return true iff CODE is a shift supported in combination
11429 with arithmetic instructions. */
11430
11431 static bool
11432 aarch64_shift_p (enum rtx_code code)
11433 {
11434 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11435 }
11436
11437
11438 /* Return true iff X is a cheap shift without a sign extend. */
11439
11440 static bool
11441 aarch64_cheap_mult_shift_p (rtx x)
11442 {
11443 rtx op0, op1;
11444
11445 op0 = XEXP (x, 0);
11446 op1 = XEXP (x, 1);
11447
11448 if (!(aarch64_tune_params.extra_tuning_flags
11449 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11450 return false;
11451
11452 if (GET_CODE (op0) == SIGN_EXTEND)
11453 return false;
11454
11455 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11456 && UINTVAL (op1) <= 4)
11457 return true;
11458
11459 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11460 return false;
11461
11462 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11463
11464 if (l2 > 0 && l2 <= 4)
11465 return true;
11466
11467 return false;
11468 }
11469
11470 /* Helper function for rtx cost calculation. Calculate the cost of
11471 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11472 Return the calculated cost of the expression, recursing manually in to
11473 operands where needed. */
11474
11475 static int
11476 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11477 {
11478 rtx op0, op1;
11479 const struct cpu_cost_table *extra_cost
11480 = aarch64_tune_params.insn_extra_cost;
11481 int cost = 0;
11482 bool compound_p = (outer == PLUS || outer == MINUS);
11483 machine_mode mode = GET_MODE (x);
11484
11485 gcc_checking_assert (code == MULT);
11486
11487 op0 = XEXP (x, 0);
11488 op1 = XEXP (x, 1);
11489
11490 if (VECTOR_MODE_P (mode))
11491 {
11492 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11493 mode = GET_MODE_INNER (mode);
11494 if (vec_flags & VEC_ADVSIMD)
11495 {
11496 /* The by-element versions of the instruction have the same costs as
11497 the normal 3-vector version. So don't add the costs of the
11498 duplicate into the costs of the multiply. We make an assumption
11499 that the input to the VEC_DUPLICATE is already on the FP & SIMD
11500 side. This means costing of a MUL by element pre RA is a bit
11501 optimistic. */
11502 if (GET_CODE (op0) == VEC_DUPLICATE)
11503 op0 = XEXP (op0, 0);
11504 else if (GET_CODE (op1) == VEC_DUPLICATE)
11505 op1 = XEXP (op1, 0);
11506 }
11507 }
11508
11509 /* Integer multiply/fma. */
11510 if (GET_MODE_CLASS (mode) == MODE_INT)
11511 {
11512 /* The multiply will be canonicalized as a shift, cost it as such. */
11513 if (aarch64_shift_p (GET_CODE (x))
11514 || (CONST_INT_P (op1)
11515 && exact_log2 (INTVAL (op1)) > 0))
11516 {
11517 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11518 || GET_CODE (op0) == SIGN_EXTEND;
11519 if (speed)
11520 {
11521 if (compound_p)
11522 {
11523 /* If the shift is considered cheap,
11524 then don't add any cost. */
11525 if (aarch64_cheap_mult_shift_p (x))
11526 ;
11527 else if (REG_P (op1))
11528 /* ARITH + shift-by-register. */
11529 cost += extra_cost->alu.arith_shift_reg;
11530 else if (is_extend)
11531 /* ARITH + extended register. We don't have a cost field
11532 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11533 cost += extra_cost->alu.extend_arith;
11534 else
11535 /* ARITH + shift-by-immediate. */
11536 cost += extra_cost->alu.arith_shift;
11537 }
11538 else
11539 /* LSL (immediate). */
11540 cost += extra_cost->alu.shift;
11541
11542 }
11543 /* Strip extends as we will have costed them in the case above. */
11544 if (is_extend)
11545 op0 = aarch64_strip_extend (op0, true);
11546
11547 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11548
11549 return cost;
11550 }
11551
11552 /* MNEG or [US]MNEGL. Extract the NEG operand, mark this as a
11553 compound operation, and let the cases below handle it. After all,
11554 MNEG is a special-case alias of MSUB. */
11555 if (GET_CODE (op0) == NEG)
11556 {
11557 op0 = XEXP (op0, 0);
11558 compound_p = true;
11559 }
11560
11561 /* Integer multiplies or FMAs have zero/sign extending variants. */
11562 if ((GET_CODE (op0) == ZERO_EXTEND
11563 && GET_CODE (op1) == ZERO_EXTEND)
11564 || (GET_CODE (op0) == SIGN_EXTEND
11565 && GET_CODE (op1) == SIGN_EXTEND))
11566 {
11567 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11568 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11569
11570 if (speed)
11571 {
11572 if (compound_p)
11573 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
11574 cost += extra_cost->mult[0].extend_add;
11575 else
11576 /* MUL/SMULL/UMULL. */
11577 cost += extra_cost->mult[0].extend;
11578 }
11579
11580 return cost;
11581 }
11582
11583 /* This is either an integer multiply or a MADD. In both cases
11584 we want to recurse and cost the operands. */
11585 cost += rtx_cost (op0, mode, MULT, 0, speed);
11586 cost += rtx_cost (op1, mode, MULT, 1, speed);
11587
11588 if (speed)
11589 {
11590 if (compound_p)
11591 /* MADD/MSUB. */
11592 cost += extra_cost->mult[mode == DImode].add;
11593 else
11594 /* MUL. */
11595 cost += extra_cost->mult[mode == DImode].simple;
11596 }
11597
11598 return cost;
11599 }
11600 else
11601 {
11602 if (speed)
11603 {
11604 /* Floating-point FMA/FMUL can also support negations of the
11605 operands, unless the rounding mode is upward or downward in
11606 which case FNMUL is different than FMUL with operand negation. */
11607 bool neg0 = GET_CODE (op0) == NEG;
11608 bool neg1 = GET_CODE (op1) == NEG;
11609 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11610 {
11611 if (neg0)
11612 op0 = XEXP (op0, 0);
11613 if (neg1)
11614 op1 = XEXP (op1, 0);
11615 }
11616
11617 if (compound_p)
11618 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11619 cost += extra_cost->fp[mode == DFmode].fma;
11620 else
11621 /* FMUL/FNMUL. */
11622 cost += extra_cost->fp[mode == DFmode].mult;
11623 }
11624
11625 cost += rtx_cost (op0, mode, MULT, 0, speed);
11626 cost += rtx_cost (op1, mode, MULT, 1, speed);
11627 return cost;
11628 }
11629 }
11630
11631 static int
11632 aarch64_address_cost (rtx x,
11633 machine_mode mode,
11634 addr_space_t as ATTRIBUTE_UNUSED,
11635 bool speed)
11636 {
11637 enum rtx_code c = GET_CODE (x);
11638 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11639 struct aarch64_address_info info;
11640 int cost = 0;
11641 info.shift = 0;
11642
11643 if (!aarch64_classify_address (&info, x, mode, false))
11644 {
11645 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
11646 {
11647 /* This is a CONST or SYMBOL ref which will be split
11648 in a different way depending on the code model in use.
11649 Cost it through the generic infrastructure. */
11650 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11651 /* Divide through by the cost of one instruction to
11652 bring it to the same units as the address costs. */
11653 cost_symbol_ref /= COSTS_N_INSNS (1);
11654 /* The cost is then the cost of preparing the address,
11655 followed by an immediate (possibly 0) offset. */
11656 return cost_symbol_ref + addr_cost->imm_offset;
11657 }
11658 else
11659 {
11660 /* This is most likely a jump table from a case
11661 statement. */
11662 return addr_cost->register_offset;
11663 }
11664 }
11665
11666 switch (info.type)
11667 {
11668 case ADDRESS_LO_SUM:
11669 case ADDRESS_SYMBOLIC:
11670 case ADDRESS_REG_IMM:
11671 cost += addr_cost->imm_offset;
11672 break;
11673
11674 case ADDRESS_REG_WB:
11675 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11676 cost += addr_cost->pre_modify;
11677 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11678 cost += addr_cost->post_modify;
11679 else
11680 gcc_unreachable ();
11681
11682 break;
11683
11684 case ADDRESS_REG_REG:
11685 cost += addr_cost->register_offset;
11686 break;
11687
11688 case ADDRESS_REG_SXTW:
11689 cost += addr_cost->register_sextend;
11690 break;
11691
11692 case ADDRESS_REG_UXTW:
11693 cost += addr_cost->register_zextend;
11694 break;
11695
11696 default:
11697 gcc_unreachable ();
11698 }
11699
11700
11701 if (info.shift > 0)
11702 {
11703 /* For the sake of calculating the cost of the shifted register
11704 component, we can treat same sized modes in the same way. */
11705 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11706 cost += addr_cost->addr_scale_costs.hi;
11707 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11708 cost += addr_cost->addr_scale_costs.si;
11709 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11710 cost += addr_cost->addr_scale_costs.di;
11711 else
11712 /* We can't tell, or this is a 128-bit vector. */
11713 cost += addr_cost->addr_scale_costs.ti;
11714 }
11715
11716 return cost;
11717 }
11718
11719 /* Return the cost of a branch. If SPEED_P is true then the compiler is
11720 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11721 to be taken. */
11722
11723 int
11724 aarch64_branch_cost (bool speed_p, bool predictable_p)
11725 {
11726 /* When optimizing for speed, use the cost of unpredictable branches. */
11727 const struct cpu_branch_cost *branch_costs =
11728 aarch64_tune_params.branch_costs;
11729
11730 if (!speed_p || predictable_p)
11731 return branch_costs->predictable;
11732 else
11733 return branch_costs->unpredictable;
11734 }
11735
11736 /* Return true if X is a zero or sign extract
11737 usable in an ADD or SUB (extended register) instruction. */
11738 static bool
11739 aarch64_rtx_arith_op_extract_p (rtx x)
11740 {
11741 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11742 No shift. */
11743 if (GET_CODE (x) == SIGN_EXTEND
11744 || GET_CODE (x) == ZERO_EXTEND)
11745 return REG_P (XEXP (x, 0));
11746
11747 return false;
11748 }
11749
11750 static bool
11751 aarch64_frint_unspec_p (unsigned int u)
11752 {
11753 switch (u)
11754 {
11755 case UNSPEC_FRINTZ:
11756 case UNSPEC_FRINTP:
11757 case UNSPEC_FRINTM:
11758 case UNSPEC_FRINTA:
11759 case UNSPEC_FRINTN:
11760 case UNSPEC_FRINTX:
11761 case UNSPEC_FRINTI:
11762 return true;
11763
11764 default:
11765 return false;
11766 }
11767 }
11768
11769 /* Return true iff X is an rtx that will match an extr instruction
11770 i.e. as described in the *extr<mode>5_insn family of patterns.
11771 OP0 and OP1 will be set to the operands of the shifts involved
11772 on success and will be NULL_RTX otherwise. */
11773
11774 static bool
11775 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11776 {
11777 rtx op0, op1;
11778 scalar_int_mode mode;
11779 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11780 return false;
11781
11782 *res_op0 = NULL_RTX;
11783 *res_op1 = NULL_RTX;
11784
11785 if (GET_CODE (x) != IOR)
11786 return false;
11787
11788 op0 = XEXP (x, 0);
11789 op1 = XEXP (x, 1);
11790
11791 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11792 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11793 {
11794 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11795 if (GET_CODE (op1) == ASHIFT)
11796 std::swap (op0, op1);
11797
11798 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11799 return false;
11800
11801 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11802 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11803
11804 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11805 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11806 {
11807 *res_op0 = XEXP (op0, 0);
11808 *res_op1 = XEXP (op1, 0);
11809 return true;
11810 }
11811 }
11812
11813 return false;
11814 }
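
/* Illustrative example (editorial sketch): in DImode the expression
   (ior (ashift x (const_int 40)) (lshiftrt y (const_int 24)))
   passes the checks above because 40 + 24 == 64, so *res_op0 is set
   to x, *res_op1 to y, and the expression can be matched as an EXTR
   whose immediate is the LSHIFTRT count, 24.  */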
11815
11816 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11817 storing it in *COST. Result is true if the total cost of the operation
11818 has now been calculated. */
11819 static bool
11820 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11821 {
11822 rtx inner;
11823 rtx comparator;
11824 enum rtx_code cmpcode;
11825 const struct cpu_cost_table *extra_cost
11826 = aarch64_tune_params.insn_extra_cost;
11827
11828 if (COMPARISON_P (op0))
11829 {
11830 inner = XEXP (op0, 0);
11831 comparator = XEXP (op0, 1);
11832 cmpcode = GET_CODE (op0);
11833 }
11834 else
11835 {
11836 inner = op0;
11837 comparator = const0_rtx;
11838 cmpcode = NE;
11839 }
11840
11841 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11842 {
11843 /* Conditional branch. */
11844 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11845 return true;
11846 else
11847 {
11848 if (cmpcode == NE || cmpcode == EQ)
11849 {
11850 if (comparator == const0_rtx)
11851 {
11852 /* TBZ/TBNZ/CBZ/CBNZ. */
11853 if (GET_CODE (inner) == ZERO_EXTRACT)
11854 /* TBZ/TBNZ. */
11855 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11856 ZERO_EXTRACT, 0, speed);
11857 else
11858 /* CBZ/CBNZ. */
11859 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11860
11861 return true;
11862 }
11863 if (register_operand (inner, VOIDmode)
11864 && aarch64_imm24 (comparator, VOIDmode))
11865 {
11866 /* SUB and SUBS. */
11867 *cost += COSTS_N_INSNS (2);
11868 if (speed)
11869 *cost += extra_cost->alu.arith * 2;
11870 return true;
11871 }
11872 }
11873 else if (cmpcode == LT || cmpcode == GE)
11874 {
11875 /* TBZ/TBNZ. */
11876 if (comparator == const0_rtx)
11877 return true;
11878 }
11879 }
11880 }
11881 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11882 {
11883 /* CCMP. */
11884 if (GET_CODE (op1) == COMPARE)
11885 {
11886 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11887 if (XEXP (op1, 1) == const0_rtx)
11888 *cost += 1;
11889 if (speed)
11890 {
11891 machine_mode mode = GET_MODE (XEXP (op1, 0));
11892
11893 if (GET_MODE_CLASS (mode) == MODE_INT)
11894 *cost += extra_cost->alu.arith;
11895 else
11896 *cost += extra_cost->fp[mode == DFmode].compare;
11897 }
11898 return true;
11899 }
11900
11901 /* It's a conditional operation based on the status flags,
11902 so it must be some flavor of CSEL. */
11903
11904 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11905 if (GET_CODE (op1) == NEG
11906 || GET_CODE (op1) == NOT
11907 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11908 op1 = XEXP (op1, 0);
11909 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11910 {
11911 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11912 op1 = XEXP (op1, 0);
11913 op2 = XEXP (op2, 0);
11914 }
11915 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
11916 {
11917 inner = XEXP (op1, 0);
11918 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
11919 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
11920 op1 = XEXP (inner, 0);
11921 }
11922
11923 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11924 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11925 return true;
11926 }
11927
11928 /* We don't know what this is, so cost all operands. */
11929 return false;
11930 }
11931
11932 /* Check whether X is a bitfield operation of the form shift + extend that
11933 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11934 operand to which the bitfield operation is applied. Otherwise return
11935 NULL_RTX. */
11936
11937 static rtx
11938 aarch64_extend_bitfield_pattern_p (rtx x)
11939 {
11940 rtx_code outer_code = GET_CODE (x);
11941 machine_mode outer_mode = GET_MODE (x);
11942
11943 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11944 && outer_mode != SImode && outer_mode != DImode)
11945 return NULL_RTX;
11946
11947 rtx inner = XEXP (x, 0);
11948 rtx_code inner_code = GET_CODE (inner);
11949 machine_mode inner_mode = GET_MODE (inner);
11950 rtx op = NULL_RTX;
11951
11952 switch (inner_code)
11953 {
11954 case ASHIFT:
11955 if (CONST_INT_P (XEXP (inner, 1))
11956 && (inner_mode == QImode || inner_mode == HImode))
11957 op = XEXP (inner, 0);
11958 break;
11959 case LSHIFTRT:
11960 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11961 && (inner_mode == QImode || inner_mode == HImode))
11962 op = XEXP (inner, 0);
11963 break;
11964 case ASHIFTRT:
11965 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11966 && (inner_mode == QImode || inner_mode == HImode))
11967 op = XEXP (inner, 0);
11968 break;
11969 default:
11970 break;
11971 }
11972
11973 return op;
11974 }
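
/* Illustrative example (editorial sketch):
   (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))) matches the
   LSHIFTRT case above and returns the inner register, so the
   combination can be costed (and matched) as a single UBFX; a
   SIGN_EXTEND of an ASHIFTRT similarly corresponds to SBFX.  */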
11975
11976 /* Return true if the mask and a shift amount from an RTX of the form
11977 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11978 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11979
11980 bool
11981 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11982 rtx shft_amnt)
11983 {
11984 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11985 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11986 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11987 && (INTVAL (mask)
11988 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11989 }
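
/* Worked example (editorial sketch): for SImode, mask 0xff0 with
   shift 4 is accepted: 4 < 32, (0xff0 >> 4) + 1 == 0x100 is a power
   of two, and no mask bit lies below the shift, so (x << 4) & 0xff0
   can become a UBFIZ inserting an 8-bit field at bit 4.  A mask such
   as 0xff1 is rejected because bit 0 lies below the shift amount.  */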
11990
11991 /* Return true if the masks and a shift amount from an RTX of the form
11992 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11993 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
11994
11995 bool
11996 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11997 unsigned HOST_WIDE_INT mask1,
11998 unsigned HOST_WIDE_INT shft_amnt,
11999 unsigned HOST_WIDE_INT mask2)
12000 {
12001 unsigned HOST_WIDE_INT t;
12002
12003 /* Verify that there is no overlap in what bits are set in the two masks. */
12004 if (mask1 != ~mask2)
12005 return false;
12006
12007 /* Verify that mask2 is not all zeros or ones. */
12008 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12009 return false;
12010
12011 /* The shift amount should always be less than the mode size. */
12012 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12013
12014 /* Verify that the mask being shifted is contiguous and would be in the
12015 least significant bits after shifting by shft_amnt. */
12016 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12017 return (t == (t & -t));
12018 }
12019
12020 /* Calculate the cost of calculating X, storing it in *COST. Result
12021 is true if the total cost of the operation has now been calculated. */
12022 static bool
12023 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
12024 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12025 {
12026 rtx op0, op1, op2;
12027 const struct cpu_cost_table *extra_cost
12028 = aarch64_tune_params.insn_extra_cost;
12029 int code = GET_CODE (x);
12030 scalar_int_mode int_mode;
12031
12032 /* By default, assume that everything has equivalent cost to the
12033 cheapest instruction. Any additional costs are applied as a delta
12034 above this default. */
12035 *cost = COSTS_N_INSNS (1);
12036
12037 switch (code)
12038 {
12039 case SET:
12040 /* The cost depends entirely on the operands to SET. */
12041 *cost = 0;
12042 op0 = SET_DEST (x);
12043 op1 = SET_SRC (x);
12044
12045 switch (GET_CODE (op0))
12046 {
12047 case MEM:
12048 if (speed)
12049 {
12050 rtx address = XEXP (op0, 0);
12051 if (VECTOR_MODE_P (mode))
12052 *cost += extra_cost->ldst.storev;
12053 else if (GET_MODE_CLASS (mode) == MODE_INT)
12054 *cost += extra_cost->ldst.store;
12055 else if (mode == SFmode)
12056 *cost += extra_cost->ldst.storef;
12057 else if (mode == DFmode)
12058 *cost += extra_cost->ldst.stored;
12059
12060 *cost +=
12061 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12062 0, speed));
12063 }
12064
12065 *cost += rtx_cost (op1, mode, SET, 1, speed);
12066 return true;
12067
12068 case SUBREG:
12069 if (! REG_P (SUBREG_REG (op0)))
12070 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
12071
12072 /* Fall through. */
12073 case REG:
12074 /* The cost is one per vector-register copied. */
12075 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12076 {
12077 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12078 *cost = COSTS_N_INSNS (nregs);
12079 }
12080 /* const0_rtx is in general free, but we will use an
12081 instruction to set a register to 0. */
12082 else if (REG_P (op1) || op1 == const0_rtx)
12083 {
12084 /* The cost is 1 per register copied. */
12085 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12086 *cost = COSTS_N_INSNS (nregs);
12087 }
12088 else
12089 /* Cost is just the cost of the RHS of the set. */
12090 *cost += rtx_cost (op1, mode, SET, 1, speed);
12091 return true;
12092
12093 case ZERO_EXTRACT:
12094 case SIGN_EXTRACT:
12095 /* Bit-field insertion. Strip any redundant widening of
12096 the RHS to meet the width of the target. */
12097 if (GET_CODE (op1) == SUBREG)
12098 op1 = SUBREG_REG (op1);
12099 if ((GET_CODE (op1) == ZERO_EXTEND
12100 || GET_CODE (op1) == SIGN_EXTEND)
12101 && CONST_INT_P (XEXP (op0, 1))
12102 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12103 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
12104 op1 = XEXP (op1, 0);
12105
12106 if (CONST_INT_P (op1))
12107 {
12108 /* MOV immediate is assumed to always be cheap. */
12109 *cost = COSTS_N_INSNS (1);
12110 }
12111 else
12112 {
12113 /* BFM. */
12114 if (speed)
12115 *cost += extra_cost->alu.bfi;
12116 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
12117 }
12118
12119 return true;
12120
12121 default:
12122 /* We can't make sense of this; assume the default cost. */
12123 *cost = COSTS_N_INSNS (1);
12124 return false;
12125 }
12126 return false;
12127
12128 case CONST_INT:
12129 /* If an instruction can incorporate a constant within the
12130 instruction, the instruction's expression avoids calling
12131 rtx_cost() on the constant. If rtx_cost() is called on a
12132 constant, then it is usually because the constant must be
12133 moved into a register by one or more instructions.
12134
12135 The exception is constant 0, which can be expressed
12136 as XZR/WZR and is therefore free. The one case where this does
12137 not hold is (set (reg) (const0_rtx)), where we must cost
12138 the move. However, we can catch that when we cost the SET, so
12139 we don't need to consider that here. */
12140 if (x == const0_rtx)
12141 *cost = 0;
12142 else
12143 {
12144 /* To an approximation, building any other constant is
12145 proportionally expensive to the number of instructions
12146 required to build that constant. This is true whether we
12147 are compiling for SPEED or otherwise. */
12148 if (!is_a <scalar_int_mode> (mode, &int_mode))
12149 int_mode = word_mode;
12150 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12151 (NULL_RTX, x, false, int_mode));
12152 }
12153 return true;
12154
12155 case CONST_DOUBLE:
12156
12157 /* First determine number of instructions to do the move
12158 as an integer constant. */
12159 if (!aarch64_float_const_representable_p (x)
12160 && !aarch64_can_const_movi_rtx_p (x, mode)
12161 && aarch64_float_const_rtx_p (x))
12162 {
12163 unsigned HOST_WIDE_INT ival;
12164 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12165 gcc_assert (succeed);
12166
12167 scalar_int_mode imode = (mode == HFmode
12168 ? SImode
12169 : int_mode_for_mode (mode).require ());
12170 int ncost = aarch64_internal_mov_immediate
12171 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12172 *cost += COSTS_N_INSNS (ncost);
12173 return true;
12174 }
12175
12176 if (speed)
12177 {
12178 /* mov[df,sf]_aarch64. */
12179 if (aarch64_float_const_representable_p (x))
12180 /* FMOV (scalar immediate). */
12181 *cost += extra_cost->fp[mode == DFmode].fpconst;
12182 else if (!aarch64_float_const_zero_rtx_p (x))
12183 {
12184 /* This will be a load from memory. */
12185 if (mode == DFmode)
12186 *cost += extra_cost->ldst.loadd;
12187 else
12188 *cost += extra_cost->ldst.loadf;
12189 }
12190 else
12191 /* Otherwise this is +0.0. We get this using MOVI d0, #0
12192 or MOV v0.s[0], wzr - neither of which is modeled by the
12193 cost tables. Just use the default cost. */
12194 {
12195 }
12196 }
12197
12198 return true;
12199
12200 case MEM:
12201 if (speed)
12202 {
12203 /* For loads we want the base cost of a load, plus an
12204 approximation for the additional cost of the addressing
12205 mode. */
12206 rtx address = XEXP (x, 0);
12207 if (VECTOR_MODE_P (mode))
12208 *cost += extra_cost->ldst.loadv;
12209 else if (GET_MODE_CLASS (mode) == MODE_INT)
12210 *cost += extra_cost->ldst.load;
12211 else if (mode == SFmode)
12212 *cost += extra_cost->ldst.loadf;
12213 else if (mode == DFmode)
12214 *cost += extra_cost->ldst.loadd;
12215
12216 *cost +=
12217 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12218 0, speed));
12219 }
12220
12221 return true;
12222
12223 case NEG:
12224 op0 = XEXP (x, 0);
12225
12226 if (VECTOR_MODE_P (mode))
12227 {
12228 if (speed)
12229 {
12230 /* FNEG. */
12231 *cost += extra_cost->vect.alu;
12232 }
12233 return false;
12234 }
12235
12236 if (GET_MODE_CLASS (mode) == MODE_INT)
12237 {
12238 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12239 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12240 {
12241 /* CSETM. */
12242 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12243 return true;
12244 }
12245
12246 /* Cost this as SUB wzr, X. */
12247 op0 = CONST0_RTX (mode);
12248 op1 = XEXP (x, 0);
12249 goto cost_minus;
12250 }
12251
12252 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12253 {
12254 /* Support (neg(fma...)) as a single instruction only if
12255 sign of zeros is unimportant. This matches the decision
12256 making in aarch64.md. */
12257 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12258 {
12259 /* FNMADD. */
12260 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12261 return true;
12262 }
12263 if (GET_CODE (op0) == MULT)
12264 {
12265 /* FNMUL. */
12266 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12267 return true;
12268 }
12269 if (speed)
12270 /* FNEG. */
12271 *cost += extra_cost->fp[mode == DFmode].neg;
12272 return false;
12273 }
12274
12275 return false;
12276
12277 case CLRSB:
12278 case CLZ:
12279 if (speed)
12280 {
12281 if (VECTOR_MODE_P (mode))
12282 *cost += extra_cost->vect.alu;
12283 else
12284 *cost += extra_cost->alu.clz;
12285 }
12286
12287 return false;
12288
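/* AArch64 has no count-trailing-zeros instruction; CTZ is synthesized
   as a bit reverse (RBIT) followed by CLZ, which is why the case below
   starts from a two-instruction baseline and adds the clz and rev
   costs.  */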
12289 case CTZ:
12290 *cost = COSTS_N_INSNS (2);
12291
12292 if (speed)
12293 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12294 return false;
12295
12296 case COMPARE:
12297 op0 = XEXP (x, 0);
12298 op1 = XEXP (x, 1);
12299
12300 if (op1 == const0_rtx
12301 && GET_CODE (op0) == AND)
12302 {
12303 x = op0;
12304 mode = GET_MODE (op0);
12305 goto cost_logic;
12306 }
12307
12308 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12309 {
12310 /* TODO: A write to the CC flags possibly costs extra, this
12311 needs encoding in the cost tables. */
12312
12313 mode = GET_MODE (op0);
12314 /* ANDS. */
12315 if (GET_CODE (op0) == AND)
12316 {
12317 x = op0;
12318 goto cost_logic;
12319 }
12320
12321 if (GET_CODE (op0) == PLUS)
12322 {
12323 /* ADDS (and CMN alias). */
12324 x = op0;
12325 goto cost_plus;
12326 }
12327
12328 if (GET_CODE (op0) == MINUS)
12329 {
12330 /* SUBS. */
12331 x = op0;
12332 goto cost_minus;
12333 }
12334
12335 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12336 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12337 && CONST_INT_P (XEXP (op0, 2)))
12338 {
12339 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12340 Handle it here directly rather than going to cost_logic:
12341 we know the immediate generated for the TST is valid,
12342 so we can avoid creating an intermediate rtx for it purely
12343 for costing purposes. */
12344 if (speed)
12345 *cost += extra_cost->alu.logical;
12346
12347 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12348 ZERO_EXTRACT, 0, speed);
12349 return true;
12350 }
12351
12352 if (GET_CODE (op1) == NEG)
12353 {
12354 /* CMN. */
12355 if (speed)
12356 *cost += extra_cost->alu.arith;
12357
12358 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12359 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12360 return true;
12361 }
12362
12363 /* CMP.
12364
12365 Compare can freely swap the order of operands, and
12366 canonicalization puts the more complex operation first.
12367 But the integer MINUS logic expects the shift/extend
12368 operation in op1. */
12369 if (! (REG_P (op0)
12370 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12371 {
12372 op0 = XEXP (x, 1);
12373 op1 = XEXP (x, 0);
12374 }
12375 goto cost_minus;
12376 }
12377
12378 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12379 {
12380 /* FCMP. */
12381 if (speed)
12382 *cost += extra_cost->fp[mode == DFmode].compare;
12383
12384 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12385 {
12386 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12387 /* FCMP supports constant 0.0 for no extra cost. */
12388 return true;
12389 }
12390 return false;
12391 }
12392
12393 if (VECTOR_MODE_P (mode))
12394 {
12395 /* Vector compare. */
12396 if (speed)
12397 *cost += extra_cost->vect.alu;
12398
12399 if (aarch64_float_const_zero_rtx_p (op1))
12400 {
12401 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12402 cost. */
12403 return true;
12404 }
12405 return false;
12406 }
12407 return false;
12408
12409 case MINUS:
12410 {
12411 op0 = XEXP (x, 0);
12412 op1 = XEXP (x, 1);
12413
12414 cost_minus:
12415 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12416
12417 /* Detect valid immediates. */
12418 if ((GET_MODE_CLASS (mode) == MODE_INT
12419 || (GET_MODE_CLASS (mode) == MODE_CC
12420 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12421 && CONST_INT_P (op1)
12422 && aarch64_uimm12_shift (INTVAL (op1)))
12423 {
12424 if (speed)
12425 /* SUB(S) (immediate). */
12426 *cost += extra_cost->alu.arith;
12427 return true;
12428 }
12429
12430 /* Look for SUB (extended register). */
12431 if (is_a <scalar_int_mode> (mode)
12432 && aarch64_rtx_arith_op_extract_p (op1))
12433 {
12434 if (speed)
12435 *cost += extra_cost->alu.extend_arith;
12436
12437 op1 = aarch64_strip_extend (op1, true);
12438 *cost += rtx_cost (op1, VOIDmode,
12439 (enum rtx_code) GET_CODE (op1), 0, speed);
12440 return true;
12441 }
12442
12443 rtx new_op1 = aarch64_strip_extend (op1, false);
12444
12445 /* Cost this as an FMA-alike operation. */
12446 if ((GET_CODE (new_op1) == MULT
12447 || aarch64_shift_p (GET_CODE (new_op1)))
12448 && code != COMPARE)
12449 {
12450 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12451 (enum rtx_code) code,
12452 speed);
12453 return true;
12454 }
12455
12456 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12457
12458 if (speed)
12459 {
12460 if (VECTOR_MODE_P (mode))
12461 {
12462 /* Vector SUB. */
12463 *cost += extra_cost->vect.alu;
12464 }
12465 else if (GET_MODE_CLASS (mode) == MODE_INT)
12466 {
12467 /* SUB(S). */
12468 *cost += extra_cost->alu.arith;
12469 }
12470 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12471 {
12472 /* FSUB. */
12473 *cost += extra_cost->fp[mode == DFmode].addsub;
12474 }
12475 }
12476 return true;
12477 }
12478
12479 case PLUS:
12480 {
12481 rtx new_op0;
12482
12483 op0 = XEXP (x, 0);
12484 op1 = XEXP (x, 1);
12485
12486 cost_plus:
12487 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12488 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12489 {
12490 /* CSINC. */
12491 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12492 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12493 return true;
12494 }
12495
12496 if (GET_MODE_CLASS (mode) == MODE_INT
12497 && (aarch64_plus_immediate (op1, mode)
12498 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
12499 {
12500 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12501
12502 if (speed)
12503 /* ADD (immediate). */
12504 *cost += extra_cost->alu.arith;
12505 return true;
12506 }
12507
12508 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12509
12510 /* Look for ADD (extended register). */
12511 if (is_a <scalar_int_mode> (mode)
12512 && aarch64_rtx_arith_op_extract_p (op0))
12513 {
12514 if (speed)
12515 *cost += extra_cost->alu.extend_arith;
12516
12517 op0 = aarch64_strip_extend (op0, true);
12518 *cost += rtx_cost (op0, VOIDmode,
12519 (enum rtx_code) GET_CODE (op0), 0, speed);
12520 return true;
12521 }
12522
12523 /* Strip any extend, leave shifts behind as we will
12524 cost them through mult_cost. */
12525 new_op0 = aarch64_strip_extend (op0, false);
12526
12527 if (GET_CODE (new_op0) == MULT
12528 || aarch64_shift_p (GET_CODE (new_op0)))
12529 {
12530 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12531 speed);
12532 return true;
12533 }
12534
12535 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12536
12537 if (speed)
12538 {
12539 if (VECTOR_MODE_P (mode))
12540 {
12541 /* Vector ADD. */
12542 *cost += extra_cost->vect.alu;
12543 }
12544 else if (GET_MODE_CLASS (mode) == MODE_INT)
12545 {
12546 /* ADD. */
12547 *cost += extra_cost->alu.arith;
12548 }
12549 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12550 {
12551 /* FADD. */
12552 *cost += extra_cost->fp[mode == DFmode].addsub;
12553 }
12554 }
12555 return true;
12556 }
12557
12558 case BSWAP:
12559 *cost = COSTS_N_INSNS (1);
12560
12561 if (speed)
12562 {
12563 if (VECTOR_MODE_P (mode))
12564 *cost += extra_cost->vect.alu;
12565 else
12566 *cost += extra_cost->alu.rev;
12567 }
12568 return false;
12569
12570 case IOR:
12571 if (aarch_rev16_p (x))
12572 {
12573 *cost = COSTS_N_INSNS (1);
12574
12575 if (speed)
12576 {
12577 if (VECTOR_MODE_P (mode))
12578 *cost += extra_cost->vect.alu;
12579 else
12580 *cost += extra_cost->alu.rev;
12581 }
12582 return true;
12583 }
12584
12585 if (aarch64_extr_rtx_p (x, &op0, &op1))
12586 {
12587 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12588 *cost += rtx_cost (op1, mode, IOR, 1, speed);
12589 if (speed)
12590 *cost += extra_cost->alu.shift;
12591
12592 return true;
12593 }
12594 /* Fall through. */
12595 case XOR:
12596 case AND:
12597 cost_logic:
12598 op0 = XEXP (x, 0);
12599 op1 = XEXP (x, 1);
12600
12601 if (VECTOR_MODE_P (mode))
12602 {
12603 if (speed)
12604 *cost += extra_cost->vect.alu;
12605 return true;
12606 }
12607
12608 if (code == AND
12609 && GET_CODE (op0) == MULT
12610 && CONST_INT_P (XEXP (op0, 1))
12611 && CONST_INT_P (op1)
12612 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12613 INTVAL (op1)) != 0)
12614 {
12615 /* This is a UBFM/SBFM. */
12616 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12617 if (speed)
12618 *cost += extra_cost->alu.bfx;
12619 return true;
12620 }
12621
12622 if (is_int_mode (mode, &int_mode))
12623 {
12624 if (CONST_INT_P (op1))
12625 {
12626 /* We have a mask + shift version of a UBFIZ
12627 i.e. the *andim_ashift<mode>_bfiz pattern. */
12628 if (GET_CODE (op0) == ASHIFT
12629 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12630 XEXP (op0, 1)))
12631 {
12632 *cost += rtx_cost (XEXP (op0, 0), int_mode,
12633 (enum rtx_code) code, 0, speed);
12634 if (speed)
12635 *cost += extra_cost->alu.bfx;
12636
12637 return true;
12638 }
12639 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12640 {
12641 /* We possibly get the immediate for free, this is not
12642 modelled. */
12643 *cost += rtx_cost (op0, int_mode,
12644 (enum rtx_code) code, 0, speed);
12645 if (speed)
12646 *cost += extra_cost->alu.logical;
12647
12648 return true;
12649 }
12650 }
12651 else
12652 {
12653 rtx new_op0 = op0;
12654
12655 /* Handle ORN, EON, or BIC. */
12656 if (GET_CODE (op0) == NOT)
12657 op0 = XEXP (op0, 0);
12658
12659 new_op0 = aarch64_strip_shift (op0);
12660
12661 /* If we had a shift on op0 then this is a logical-shift-
12662 by-register/immediate operation. Otherwise, this is just
12663 a logical operation. */
12664 if (speed)
12665 {
12666 if (new_op0 != op0)
12667 {
12668 /* Shift by immediate. */
12669 if (CONST_INT_P (XEXP (op0, 1)))
12670 *cost += extra_cost->alu.log_shift;
12671 else
12672 *cost += extra_cost->alu.log_shift_reg;
12673 }
12674 else
12675 *cost += extra_cost->alu.logical;
12676 }
12677
12678 /* In both cases we want to cost both operands. */
12679 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12680 0, speed);
12681 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12682 1, speed);
12683
12684 return true;
12685 }
12686 }
12687 return false;
12688
12689 case NOT:
12690 x = XEXP (x, 0);
12691 op0 = aarch64_strip_shift (x);
12692
12693 if (VECTOR_MODE_P (mode))
12694 {
12695 /* Vector NOT. */
12696 *cost += extra_cost->vect.alu;
12697 return false;
12698 }
12699
12700 /* MVN-shifted-reg. */
12701 if (op0 != x)
12702 {
12703 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12704
12705 if (speed)
12706 *cost += extra_cost->alu.log_shift;
12707
12708 return true;
12709 }
12710 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12711 Handle the second form here taking care that 'a' in the above can
12712 be a shift. */
12713 else if (GET_CODE (op0) == XOR)
12714 {
12715 rtx newop0 = XEXP (op0, 0);
12716 rtx newop1 = XEXP (op0, 1);
12717 rtx op0_stripped = aarch64_strip_shift (newop0);
12718
12719 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12720 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12721
12722 if (speed)
12723 {
12724 if (op0_stripped != newop0)
12725 *cost += extra_cost->alu.log_shift;
12726 else
12727 *cost += extra_cost->alu.logical;
12728 }
12729
12730 return true;
12731 }
12732 /* MVN. */
12733 if (speed)
12734 *cost += extra_cost->alu.logical;
12735
12736 return false;
12737
12738 case ZERO_EXTEND:
12739
12740 op0 = XEXP (x, 0);
12741 /* If a value is written in SI mode, then zero extended to DI
12742 mode, the operation will in general be free as a write to
12743 a 'w' register implicitly zeroes the upper bits of an 'x'
12744 register. However, if this is
12745
12746 (set (reg) (zero_extend (reg)))
12747
12748 we must cost the explicit register move. */
12749 if (mode == DImode
12750 && GET_MODE (op0) == SImode
12751 && outer == SET)
12752 {
12753 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12754
12755 /* If OP_COST is non-zero, then the cost of the zero extend
12756 is effectively the cost of the inner operation. Otherwise
12757 we have a MOV instruction and we take the cost from the MOV
12758 itself. This is true independently of whether we are
12759 optimizing for space or time. */
12760 if (op_cost)
12761 *cost = op_cost;
12762
12763 return true;
12764 }
12765 else if (MEM_P (op0))
12766 {
12767 /* All loads can zero extend to any size for free. */
12768 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12769 return true;
12770 }
12771
12772 op0 = aarch64_extend_bitfield_pattern_p (x);
12773 if (op0)
12774 {
12775 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12776 if (speed)
12777 *cost += extra_cost->alu.bfx;
12778 return true;
12779 }
12780
12781 if (speed)
12782 {
12783 if (VECTOR_MODE_P (mode))
12784 {
12785 /* UMOV. */
12786 *cost += extra_cost->vect.alu;
12787 }
12788 else
12789 {
12790 /* We generate an AND instead of UXTB/UXTH. */
12791 *cost += extra_cost->alu.logical;
12792 }
12793 }
12794 return false;
12795
12796 case SIGN_EXTEND:
12797 if (MEM_P (XEXP (x, 0)))
12798 {
12799 /* LDRSH. */
12800 if (speed)
12801 {
12802 rtx address = XEXP (XEXP (x, 0), 0);
12803 *cost += extra_cost->ldst.load_sign_extend;
12804
12805 *cost +=
12806 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12807 0, speed));
12808 }
12809 return true;
12810 }
12811
12812 op0 = aarch64_extend_bitfield_pattern_p (x);
12813 if (op0)
12814 {
12815 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12816 if (speed)
12817 *cost += extra_cost->alu.bfx;
12818 return true;
12819 }
12820
12821 if (speed)
12822 {
12823 if (VECTOR_MODE_P (mode))
12824 *cost += extra_cost->vect.alu;
12825 else
12826 *cost += extra_cost->alu.extend;
12827 }
12828 return false;
12829
12830 case ASHIFT:
12831 op0 = XEXP (x, 0);
12832 op1 = XEXP (x, 1);
12833
12834 if (CONST_INT_P (op1))
12835 {
12836 if (speed)
12837 {
12838 if (VECTOR_MODE_P (mode))
12839 {
12840 /* Vector shift (immediate). */
12841 *cost += extra_cost->vect.alu;
12842 }
12843 else
12844 {
12845 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12846 aliases. */
12847 *cost += extra_cost->alu.shift;
12848 }
12849 }
12850
12851 /* We can incorporate zero/sign extend for free. */
12852 if (GET_CODE (op0) == ZERO_EXTEND
12853 || GET_CODE (op0) == SIGN_EXTEND)
12854 op0 = XEXP (op0, 0);
12855
12856 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12857 return true;
12858 }
12859 else
12860 {
12861 if (VECTOR_MODE_P (mode))
12862 {
12863 if (speed)
12864 /* Vector shift (register). */
12865 *cost += extra_cost->vect.alu;
12866 }
12867 else
12868 {
12869 if (speed)
12870 /* LSLV. */
12871 *cost += extra_cost->alu.shift_reg;
12872
12873 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12874 && CONST_INT_P (XEXP (op1, 1))
12875 && known_eq (INTVAL (XEXP (op1, 1)),
12876 GET_MODE_BITSIZE (mode) - 1))
12877 {
12878 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12879 /* We already demanded XEXP (op1, 0) to be REG_P, so
12880 don't recurse into it. */
12881 return true;
12882 }
12883 }
12884 return false; /* All arguments need to be in registers. */
12885 }
12886
12887 case ROTATE:
12888 case ROTATERT:
12889 case LSHIFTRT:
12890 case ASHIFTRT:
12891 op0 = XEXP (x, 0);
12892 op1 = XEXP (x, 1);
12893
12894 if (CONST_INT_P (op1))
12895 {
12896 /* ASR (immediate) and friends. */
12897 if (speed)
12898 {
12899 if (VECTOR_MODE_P (mode))
12900 *cost += extra_cost->vect.alu;
12901 else
12902 *cost += extra_cost->alu.shift;
12903 }
12904
12905 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12906 return true;
12907 }
12908 else
12909 {
12910 if (VECTOR_MODE_P (mode))
12911 {
12912 if (speed)
12913 /* Vector shift (register). */
12914 *cost += extra_cost->vect.alu;
12915 }
12916 else
12917 {
12918 if (speed)
12919 /* ASR (register) and friends. */
12920 *cost += extra_cost->alu.shift_reg;
12921
12922 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12923 && CONST_INT_P (XEXP (op1, 1))
12924 && known_eq (INTVAL (XEXP (op1, 1)),
12925 GET_MODE_BITSIZE (mode) - 1))
12926 {
12927 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12928 /* We already demanded XEXP (op1, 0) to be REG_P, so
12929 don't recurse into it. */
12930 return true;
12931 }
12932 }
12933 return false; /* All arguments need to be in registers. */
12934 }
12935
12936 case SYMBOL_REF:
12937
12938 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12939 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12940 {
12941 /* LDR. */
12942 if (speed)
12943 *cost += extra_cost->ldst.load;
12944 }
12945 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12946 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12947 {
12948 /* ADRP, followed by ADD. */
12949 *cost += COSTS_N_INSNS (1);
12950 if (speed)
12951 *cost += 2 * extra_cost->alu.arith;
12952 }
12953 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12954 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12955 {
12956 /* ADR. */
12957 if (speed)
12958 *cost += extra_cost->alu.arith;
12959 }
12960
12961 if (flag_pic)
12962 {
12963 /* One extra load instruction, after accessing the GOT. */
12964 *cost += COSTS_N_INSNS (1);
12965 if (speed)
12966 *cost += extra_cost->ldst.load;
12967 }
12968 return true;
12969
12970 case HIGH:
12971 case LO_SUM:
12972 /* ADRP/ADD (immediate). */
12973 if (speed)
12974 *cost += extra_cost->alu.arith;
12975 return true;
12976
12977 case ZERO_EXTRACT:
12978 case SIGN_EXTRACT:
12979 /* UBFX/SBFX. */
12980 if (speed)
12981 {
12982 if (VECTOR_MODE_P (mode))
12983 *cost += extra_cost->vect.alu;
12984 else
12985 *cost += extra_cost->alu.bfx;
12986 }
12987
12988 /* We can trust that the immediates used will be correct (there
12989 are no by-register forms), so we need only cost op0. */
12990 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12991 return true;
12992
12993 case MULT:
12994 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12995 /* aarch64_rtx_mult_cost always handles recursion to its
12996 operands. */
12997 return true;
12998
12999 case MOD:
13000 /* We can expand signed mod by power of 2 using a NEGS, two parallel
13001 ANDs and a CSNEG. Assume here that a CSNEG costs the same as an
13002 unconditional negate. This case should only ever be reached through
13003 the set_smod_pow2_cheap check in expmed.c. */
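/* As a sketch (register choices are illustrative only), x % 4 in SImode
   expands along the lines of:

     negs   w1, w0
     and    w0, w0, #3
     and    w1, w1, #3
     csneg  w0, w0, w1, mi

   hence the four-instruction baseline used below.  */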
13004 if (CONST_INT_P (XEXP (x, 1))
13005 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13006 && (mode == SImode || mode == DImode))
13007 {
13008 /* We expand to 4 instructions. Reset the baseline. */
13009 *cost = COSTS_N_INSNS (4);
13010
13011 if (speed)
13012 *cost += 2 * extra_cost->alu.logical
13013 + 2 * extra_cost->alu.arith;
13014
13015 return true;
13016 }
13017
13018 /* Fall-through. */
13019 case UMOD:
13020 if (speed)
13021 {
13022 /* Slightly prefer UMOD over SMOD. */
13023 if (VECTOR_MODE_P (mode))
13024 *cost += extra_cost->vect.alu;
13025 else if (GET_MODE_CLASS (mode) == MODE_INT)
13026 *cost += (extra_cost->mult[mode == DImode].add
13027 + extra_cost->mult[mode == DImode].idiv
13028 + (code == MOD ? 1 : 0));
13029 }
13030 return false; /* All arguments need to be in registers. */
13031
13032 case DIV:
13033 case UDIV:
13034 case SQRT:
13035 if (speed)
13036 {
13037 if (VECTOR_MODE_P (mode))
13038 *cost += extra_cost->vect.alu;
13039 else if (GET_MODE_CLASS (mode) == MODE_INT)
13040 /* There is no integer SQRT, so only DIV and UDIV can get
13041 here. */
13042 *cost += (extra_cost->mult[mode == DImode].idiv
13043 /* Slightly prefer UDIV over SDIV. */
13044 + (code == DIV ? 1 : 0));
13045 else
13046 *cost += extra_cost->fp[mode == DFmode].div;
13047 }
13048 return false; /* All arguments need to be in registers. */
13049
13050 case IF_THEN_ELSE:
13051 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13052 XEXP (x, 2), cost, speed);
13053
13054 case EQ:
13055 case NE:
13056 case GT:
13057 case GTU:
13058 case LT:
13059 case LTU:
13060 case GE:
13061 case GEU:
13062 case LE:
13063 case LEU:
13064
13065 return false; /* All arguments must be in registers. */
13066
13067 case FMA:
13068 op0 = XEXP (x, 0);
13069 op1 = XEXP (x, 1);
13070 op2 = XEXP (x, 2);
13071
13072 if (speed)
13073 {
13074 if (VECTOR_MODE_P (mode))
13075 *cost += extra_cost->vect.alu;
13076 else
13077 *cost += extra_cost->fp[mode == DFmode].fma;
13078 }
13079
13080 /* FMSUB, FNMADD, and FNMSUB are free. */
13081 if (GET_CODE (op0) == NEG)
13082 op0 = XEXP (op0, 0);
13083
13084 if (GET_CODE (op2) == NEG)
13085 op2 = XEXP (op2, 0);
13086
13087 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13088 and the by-element operand as operand 0. */
13089 if (GET_CODE (op1) == NEG)
13090 op1 = XEXP (op1, 0);
13091
13092 /* Catch vector-by-element operations. The by-element operand can
13093 either be (vec_duplicate (vec_select (x))) or just
13094 (vec_select (x)), depending on whether we are multiplying by
13095 a vector or a scalar.
13096
13097 Canonicalization is not very good in these cases: FMA4 will put the
13098 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
13099 if (GET_CODE (op0) == VEC_DUPLICATE)
13100 op0 = XEXP (op0, 0);
13101 else if (GET_CODE (op1) == VEC_DUPLICATE)
13102 op1 = XEXP (op1, 0);
13103
13104 if (GET_CODE (op0) == VEC_SELECT)
13105 op0 = XEXP (op0, 0);
13106 else if (GET_CODE (op1) == VEC_SELECT)
13107 op1 = XEXP (op1, 0);
13108
13109 /* If the remaining parameters are not registers,
13110 get the cost to put them into registers. */
13111 *cost += rtx_cost (op0, mode, FMA, 0, speed);
13112 *cost += rtx_cost (op1, mode, FMA, 1, speed);
13113 *cost += rtx_cost (op2, mode, FMA, 2, speed);
13114 return true;
13115
13116 case FLOAT:
13117 case UNSIGNED_FLOAT:
13118 if (speed)
13119 *cost += extra_cost->fp[mode == DFmode].fromint;
13120 return false;
13121
13122 case FLOAT_EXTEND:
13123 if (speed)
13124 {
13125 if (VECTOR_MODE_P (mode))
13126 {
13127 /* Vector widening conversion. */
13128 *cost += extra_cost->vect.alu;
13129 }
13130 else
13131 *cost += extra_cost->fp[mode == DFmode].widen;
13132 }
13133 return false;
13134
13135 case FLOAT_TRUNCATE:
13136 if (speed)
13137 {
13138 if (VECTOR_MODE_P (mode))
13139 {
13140 /* Vector conversion. */
13141 *cost += extra_cost->vect.alu;
13142 }
13143 else
13144 *cost += extra_cost->fp[mode == DFmode].narrow;
13145 }
13146 return false;
13147
13148 case FIX:
13149 case UNSIGNED_FIX:
13150 x = XEXP (x, 0);
13151 /* Strip the rounding part. They will all be implemented
13152 by the fcvt* family of instructions anyway. */
13153 if (GET_CODE (x) == UNSPEC)
13154 {
13155 unsigned int uns_code = XINT (x, 1);
13156
13157 if (uns_code == UNSPEC_FRINTA
13158 || uns_code == UNSPEC_FRINTM
13159 || uns_code == UNSPEC_FRINTN
13160 || uns_code == UNSPEC_FRINTP
13161 || uns_code == UNSPEC_FRINTZ)
13162 x = XVECEXP (x, 0, 0);
13163 }
13164
13165 if (speed)
13166 {
13167 if (VECTOR_MODE_P (mode))
13168 *cost += extra_cost->vect.alu;
13169 else
13170 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13171 }
13172
13173 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13174 fixed-point fcvt. */
13175 if (GET_CODE (x) == MULT
13176 && ((VECTOR_MODE_P (mode)
13177 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13178 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13179 {
13180 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13181 0, speed);
13182 return true;
13183 }
13184
13185 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13186 return true;
13187
13188 case ABS:
13189 if (VECTOR_MODE_P (mode))
13190 {
13191 /* ABS (vector). */
13192 if (speed)
13193 *cost += extra_cost->vect.alu;
13194 }
13195 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13196 {
13197 op0 = XEXP (x, 0);
13198
13199 /* FABD, which is analogous to FADD. */
13200 if (GET_CODE (op0) == MINUS)
13201 {
13202 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13203 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13204 if (speed)
13205 *cost += extra_cost->fp[mode == DFmode].addsub;
13206
13207 return true;
13208 }
13209 /* Simple FABS is analogous to FNEG. */
13210 if (speed)
13211 *cost += extra_cost->fp[mode == DFmode].neg;
13212 }
13213 else
13214 {
13215 /* Integer ABS will either be split to
13216 two arithmetic instructions, or will be an ABS
13217 (scalar), which we don't model. */
13218 *cost = COSTS_N_INSNS (2);
13219 if (speed)
13220 *cost += 2 * extra_cost->alu.arith;
13221 }
13222 return false;
13223
13224 case SMAX:
13225 case SMIN:
13226 if (speed)
13227 {
13228 if (VECTOR_MODE_P (mode))
13229 *cost += extra_cost->vect.alu;
13230 else
13231 {
13232 /* FMAXNM/FMINNM/FMAX/FMIN.
13233 TODO: This may not be accurate for all implementations, but
13234 we do not model this in the cost tables. */
13235 *cost += extra_cost->fp[mode == DFmode].addsub;
13236 }
13237 }
13238 return false;
13239
13240 case UNSPEC:
13241 /* The floating point round to integer frint* instructions. */
13242 if (aarch64_frint_unspec_p (XINT (x, 1)))
13243 {
13244 if (speed)
13245 *cost += extra_cost->fp[mode == DFmode].roundint;
13246
13247 return false;
13248 }
13249
13250 if (XINT (x, 1) == UNSPEC_RBIT)
13251 {
13252 if (speed)
13253 *cost += extra_cost->alu.rev;
13254
13255 return false;
13256 }
13257 break;
13258
13259 case TRUNCATE:
13260
13261 /* Decompose <su>muldi3_highpart. */
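/* The overall RTL shape being matched is:

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI))
                  (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))

   where both extensions must be ZERO_EXTEND or both SIGN_EXTEND.  */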
13262 if (/* (truncate:DI */
13263 mode == DImode
13264 /* (lshiftrt:TI */
13265 && GET_MODE (XEXP (x, 0)) == TImode
13266 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13267 /* (mult:TI */
13268 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13269 /* (ANY_EXTEND:TI (reg:DI))
13270 (ANY_EXTEND:TI (reg:DI))) */
13271 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13272 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13273 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13274 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13275 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13276 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13277 /* (const_int 64) */
13278 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13279 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13280 {
13281 /* UMULH/SMULH. */
13282 if (speed)
13283 *cost += extra_cost->mult[mode == DImode].extend;
13284 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13285 mode, MULT, 0, speed);
13286 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13287 mode, MULT, 1, speed);
13288 return true;
13289 }
13290
13291 /* Fall through. */
13292 default:
13293 break;
13294 }
13295
13296 if (dump_file
13297 && flag_aarch64_verbose_cost)
13298 fprintf (dump_file,
13299 "\nFailed to cost RTX. Assuming default cost.\n");
13300
13301 return true;
13302 }
13303
13304 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
13305 calculated for X. This cost is stored in *COST. Returns true
13306 if the total cost of X was calculated. */
13307 static bool
13308 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13309 int param, int *cost, bool speed)
13310 {
13311 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13312
13313 if (dump_file
13314 && flag_aarch64_verbose_cost)
13315 {
13316 print_rtl_single (dump_file, x);
13317 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13318 speed ? "Hot" : "Cold",
13319 *cost, result ? "final" : "partial");
13320 }
13321
13322 return result;
13323 }
13324
13325 static int
13326 aarch64_register_move_cost (machine_mode mode,
13327 reg_class_t from_i, reg_class_t to_i)
13328 {
13329 enum reg_class from = (enum reg_class) from_i;
13330 enum reg_class to = (enum reg_class) to_i;
13331 const struct cpu_regmove_cost *regmove_cost
13332 = aarch64_tune_params.regmove_cost;
13333
13334 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13335 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13336 || to == STUB_REGS)
13337 to = GENERAL_REGS;
13338
13339 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13340 || from == STUB_REGS)
13341 from = GENERAL_REGS;
13342
13343 /* Make RDFFR very expensive. In particular, if we know that the FFR
13344 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13345 as a way of obtaining a PTRUE. */
13346 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13347 && hard_reg_set_subset_p (reg_class_contents[from_i],
13348 reg_class_contents[FFR_REGS]))
13349 return 80;
13350
13351 /* The cost of moving between GPRs and the stack register is the same as GP2GP. */
13352 if ((from == GENERAL_REGS && to == STACK_REG)
13353 || (to == GENERAL_REGS && from == STACK_REG))
13354 return regmove_cost->GP2GP;
13355
13356 /* To/From the stack register, we move via the gprs. */
13357 if (to == STACK_REG || from == STACK_REG)
13358 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13359 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13360
13361 if (known_eq (GET_MODE_SIZE (mode), 16))
13362 {
13363 /* 128-bit operations on general registers require 2 instructions. */
13364 if (from == GENERAL_REGS && to == GENERAL_REGS)
13365 return regmove_cost->GP2GP * 2;
13366 else if (from == GENERAL_REGS)
13367 return regmove_cost->GP2FP * 2;
13368 else if (to == GENERAL_REGS)
13369 return regmove_cost->FP2GP * 2;
13370
13371 /* When AdvSIMD instructions are disabled it is not possible to move
13372 a 128-bit value directly between Q registers. This is handled in
13373 secondary reload. A general register is used as a scratch to move
13374 the upper DI value and the lower DI value is moved directly,
13375 hence the cost is the sum of three moves. */
13376 if (! TARGET_SIMD)
13377 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13378
13379 return regmove_cost->FP2FP;
13380 }
13381
13382 if (from == GENERAL_REGS && to == GENERAL_REGS)
13383 return regmove_cost->GP2GP;
13384 else if (from == GENERAL_REGS)
13385 return regmove_cost->GP2FP;
13386 else if (to == GENERAL_REGS)
13387 return regmove_cost->FP2GP;
13388
13389 return regmove_cost->FP2FP;
13390 }
13391
13392 static int
13393 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13394 reg_class_t rclass ATTRIBUTE_UNUSED,
13395 bool in ATTRIBUTE_UNUSED)
13396 {
13397 return aarch64_tune_params.memmov_cost;
13398 }
13399
13400 /* Implement TARGET_INIT_BUILTINS. */
13401 static void
13402 aarch64_init_builtins ()
13403 {
13404 aarch64_general_init_builtins ();
13405 aarch64_sve::init_builtins ();
13406 }
13407
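/* The built-in hooks below all decode DECL_MD_FUNCTION_CODE in the same
   way: "code & AARCH64_BUILTIN_CLASS" selects the class of built-in
   (general or SVE) and "code >> AARCH64_BUILTIN_SHIFT" recovers the
   class-specific subcode that is passed on to the corresponding
   aarch64_general_* or aarch64_sve:: handler.  */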
13408 /* Implement TARGET_FOLD_BUILTIN. */
13409 static tree
13410 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13411 {
13412 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13413 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13414 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13415 switch (code & AARCH64_BUILTIN_CLASS)
13416 {
13417 case AARCH64_BUILTIN_GENERAL:
13418 return aarch64_general_fold_builtin (subcode, type, nargs, args);
13419
13420 case AARCH64_BUILTIN_SVE:
13421 return NULL_TREE;
13422 }
13423 gcc_unreachable ();
13424 }
13425
13426 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13427 static bool
13428 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13429 {
13430 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13431 tree fndecl = gimple_call_fndecl (stmt);
13432 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13433 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13434 gimple *new_stmt = NULL;
13435 switch (code & AARCH64_BUILTIN_CLASS)
13436 {
13437 case AARCH64_BUILTIN_GENERAL:
13438 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13439 break;
13440
13441 case AARCH64_BUILTIN_SVE:
13442 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13443 break;
13444 }
13445
13446 if (!new_stmt)
13447 return false;
13448
13449 gsi_replace (gsi, new_stmt, true);
13450 return true;
13451 }
13452
13453 /* Implement TARGET_EXPAND_BUILTIN. */
13454 static rtx
13455 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13456 {
13457 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13458 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13459 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13460 switch (code & AARCH64_BUILTIN_CLASS)
13461 {
13462 case AARCH64_BUILTIN_GENERAL:
13463 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13464
13465 case AARCH64_BUILTIN_SVE:
13466 return aarch64_sve::expand_builtin (subcode, exp, target);
13467 }
13468 gcc_unreachable ();
13469 }
13470
13471 /* Implement TARGET_BUILTIN_DECL. */
13472 static tree
13473 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13474 {
13475 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13476 switch (code & AARCH64_BUILTIN_CLASS)
13477 {
13478 case AARCH64_BUILTIN_GENERAL:
13479 return aarch64_general_builtin_decl (subcode, initialize_p);
13480
13481 case AARCH64_BUILTIN_SVE:
13482 return aarch64_sve::builtin_decl (subcode, initialize_p);
13483 }
13484 gcc_unreachable ();
13485 }
13486
13487 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13488 to optimize 1.0/sqrt. */
13489
13490 static bool
13491 use_rsqrt_p (machine_mode mode)
13492 {
13493 return (!flag_trapping_math
13494 && flag_unsafe_math_optimizations
13495 && ((aarch64_tune_params.approx_modes->recip_sqrt
13496 & AARCH64_APPROX_MODE (mode))
13497 || flag_mrecip_low_precision_sqrt));
13498 }
13499
13500 /* Function to decide when to use the approximate reciprocal square root
13501 builtin. */
13502
13503 static tree
13504 aarch64_builtin_reciprocal (tree fndecl)
13505 {
13506 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13507
13508 if (!use_rsqrt_p (mode))
13509 return NULL_TREE;
13510 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13511 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13512 switch (code & AARCH64_BUILTIN_CLASS)
13513 {
13514 case AARCH64_BUILTIN_GENERAL:
13515 return aarch64_general_builtin_rsqrt (subcode);
13516
13517 case AARCH64_BUILTIN_SVE:
13518 return NULL_TREE;
13519 }
13520 gcc_unreachable ();
13521 }
13522
13523 /* Emit code to perform the floating-point operation:
13524
13525 DST = SRC1 * SRC2
13526
13527 where all three operands are already known to be registers.
13528 If the operation is an SVE one, PTRUE is a suitable all-true
13529 predicate. */
13530
13531 static void
13532 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13533 {
13534 if (ptrue)
13535 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13536 dst, ptrue, src1, src2,
13537 gen_int_mode (SVE_RELAXED_GP, SImode)));
13538 else
13539 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13540 }
13541
13542 /* Emit instruction sequence to compute either the approximate square root
13543 or its approximate reciprocal, depending on the flag RECP, and return
13544 whether the sequence was emitted or not. */
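/* In outline, the code below performs a Newton-Raphson iteration for the
   reciprocal square root: FRSQRTE supplies an initial estimate x of
   1/sqrt(d), and each FRSQRTS step computes (3 - d * x * x) / 2, giving
   the recurrence x' = x * (3 - d * x * x) / 2.  For the non-reciprocal
   case the refined estimate is finally multiplied by the source, since
   sqrt(d) = d * (1/sqrt(d)).  */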
13545
13546 bool
13547 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13548 {
13549 machine_mode mode = GET_MODE (dst);
13550
13551 if (GET_MODE_INNER (mode) == HFmode)
13552 {
13553 gcc_assert (!recp);
13554 return false;
13555 }
13556
13557 if (!recp)
13558 {
13559 if (!(flag_mlow_precision_sqrt
13560 || (aarch64_tune_params.approx_modes->sqrt
13561 & AARCH64_APPROX_MODE (mode))))
13562 return false;
13563
13564 if (!flag_finite_math_only
13565 || flag_trapping_math
13566 || !flag_unsafe_math_optimizations
13567 || optimize_function_for_size_p (cfun))
13568 return false;
13569 }
13570 else
13571 /* Caller assumes we cannot fail. */
13572 gcc_assert (use_rsqrt_p (mode));
13573
13574 rtx pg = NULL_RTX;
13575 if (aarch64_sve_mode_p (mode))
13576 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13577 machine_mode mmsk = (VECTOR_MODE_P (mode)
13578 ? related_int_vector_mode (mode).require ()
13579 : int_mode_for_mode (mode).require ());
13580 rtx xmsk = NULL_RTX;
13581 if (!recp)
13582 {
13583 /* When calculating the approximate square root, compare the
13584 argument with 0.0 and create a mask. */
13585 rtx zero = CONST0_RTX (mode);
13586 if (pg)
13587 {
13588 xmsk = gen_reg_rtx (GET_MODE (pg));
13589 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13590 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13591 xmsk, pg, hint, src, zero));
13592 }
13593 else
13594 {
13595 xmsk = gen_reg_rtx (mmsk);
13596 emit_insn (gen_rtx_SET (xmsk,
13597 gen_rtx_NEG (mmsk,
13598 gen_rtx_EQ (mmsk, src, zero))));
13599 }
13600 }
13601
13602 /* Estimate the approximate reciprocal square root. */
13603 rtx xdst = gen_reg_rtx (mode);
13604 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13605
13606 /* Iterate over the series twice for SF and thrice for DF. */
13607 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13608
13609 /* Optionally perform one fewer iteration of the series, trading some
13610 accuracy for faster performance. */
13611 if ((recp && flag_mrecip_low_precision_sqrt)
13612 || (!recp && flag_mlow_precision_sqrt))
13613 iterations--;
13614
13615 /* Iterate over the series to calculate the approximate reciprocal square
13616 root. */
13617 rtx x1 = gen_reg_rtx (mode);
13618 while (iterations--)
13619 {
13620 rtx x2 = gen_reg_rtx (mode);
13621 aarch64_emit_mult (x2, pg, xdst, xdst);
13622
13623 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13624
13625 if (iterations > 0)
13626 aarch64_emit_mult (xdst, pg, xdst, x1);
13627 }
13628
13629 if (!recp)
13630 {
13631 if (pg)
13632 /* Multiply nonzero source values by the corresponding intermediate
13633 result elements, so that the final calculation is the approximate
13634 square root rather than its reciprocal. Select a zero result for
13635 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13636 otherwise. */
13637 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13638 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13639 else
13640 {
13641 /* Qualify the approximate reciprocal square root when the
13642 argument is 0.0 by squashing the intermediate result to 0.0. */
13643 rtx xtmp = gen_reg_rtx (mmsk);
13644 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13645 gen_rtx_SUBREG (mmsk, xdst, 0)));
13646 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13647
13648 /* Calculate the approximate square root. */
13649 aarch64_emit_mult (xdst, pg, xdst, src);
13650 }
13651 }
13652
13653 /* Finalize the approximation. */
13654 aarch64_emit_mult (dst, pg, xdst, x1);
13655
13656 return true;
13657 }
13658
13659 /* Emit the instruction sequence to compute the approximation for the division
13660 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
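/* In outline, the code below performs a Newton-Raphson iteration for the
   reciprocal: FRECPE supplies an initial estimate x of 1/DEN, and each
   FRECPS step computes 2 - DEN * x, giving the recurrence
   x' = x * (2 - DEN * x).  The refined reciprocal is then multiplied by
   NUM (unless NUM is 1.0) to form the quotient.  */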
13661
13662 bool
13663 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13664 {
13665 machine_mode mode = GET_MODE (quo);
13666
13667 if (GET_MODE_INNER (mode) == HFmode)
13668 return false;
13669
13670 bool use_approx_division_p = (flag_mlow_precision_div
13671 || (aarch64_tune_params.approx_modes->division
13672 & AARCH64_APPROX_MODE (mode)));
13673
13674 if (!flag_finite_math_only
13675 || flag_trapping_math
13676 || !flag_unsafe_math_optimizations
13677 || optimize_function_for_size_p (cfun)
13678 || !use_approx_division_p)
13679 return false;
13680
13681 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13682 return false;
13683
13684 rtx pg = NULL_RTX;
13685 if (aarch64_sve_mode_p (mode))
13686 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13687
13688 /* Estimate the approximate reciprocal. */
13689 rtx xrcp = gen_reg_rtx (mode);
13690 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13691
13692 /* Iterate over the series twice for SF and thrice for DF. */
13693 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13694
13695 /* Optionally perform fewer iterations of the series, trading some
13696 accuracy for faster performance. The default is 2 for DF and 1 for SF. */
13697 if (flag_mlow_precision_div)
13698 iterations = (GET_MODE_INNER (mode) == DFmode
13699 ? aarch64_double_recp_precision
13700 : aarch64_float_recp_precision);
13701
13702 /* Iterate over the series to calculate the approximate reciprocal. */
13703 rtx xtmp = gen_reg_rtx (mode);
13704 while (iterations--)
13705 {
13706 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13707
13708 if (iterations > 0)
13709 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13710 }
13711
13712 if (num != CONST1_RTX (mode))
13713 {
13714 /* As the approximate reciprocal of DEN is already calculated, only
13715 calculate the approximate division when NUM is not 1.0. */
13716 rtx xnum = force_reg (mode, num);
13717 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13718 }
13719
13720 /* Finalize the approximation. */
13721 aarch64_emit_mult (quo, pg, xrcp, xtmp);
13722 return true;
13723 }
13724
13725 /* Return the number of instructions that can be issued per cycle. */
13726 static int
13727 aarch64_sched_issue_rate (void)
13728 {
13729 return aarch64_tune_params.issue_rate;
13730 }
13731
13732 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13733 static int
13734 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13735 {
13736 if (DEBUG_INSN_P (insn))
13737 return more;
13738
13739 rtx_code code = GET_CODE (PATTERN (insn));
13740 if (code == USE || code == CLOBBER)
13741 return more;
13742
13743 if (get_attr_type (insn) == TYPE_NO_INSN)
13744 return more;
13745
13746 return more - 1;
13747 }
13748
13749 static int
13750 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13751 {
13752 int issue_rate = aarch64_sched_issue_rate ();
13753
13754 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13755 }
13756
13757
13758 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13759 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13760 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13761
13762 static int
13763 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13764 int ready_index)
13765 {
13766 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13767 }
13768
13769
13770 /* Vectorizer cost model target hooks. */
13771
13772 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13773 static int
13774 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13775 tree vectype,
13776 int misalign ATTRIBUTE_UNUSED)
13777 {
13778 unsigned elements;
13779 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13780 bool fp = false;
13781
13782 if (vectype != NULL)
13783 fp = FLOAT_TYPE_P (vectype);
13784
13785 const simd_vec_cost *simd_costs;
13786 if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
13787 && costs->sve != NULL)
13788 simd_costs = costs->sve;
13789 else
13790 simd_costs = costs->advsimd;
13791
13792 switch (type_of_cost)
13793 {
13794 case scalar_stmt:
13795 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13796
13797 case scalar_load:
13798 return costs->scalar_load_cost;
13799
13800 case scalar_store:
13801 return costs->scalar_store_cost;
13802
13803 case vector_stmt:
13804 return fp ? simd_costs->fp_stmt_cost
13805 : simd_costs->int_stmt_cost;
13806
13807 case vector_load:
13808 return simd_costs->align_load_cost;
13809
13810 case vector_store:
13811 return simd_costs->store_cost;
13812
13813 case vec_to_scalar:
13814 return simd_costs->vec_to_scalar_cost;
13815
13816 case scalar_to_vec:
13817 return simd_costs->scalar_to_vec_cost;
13818
13819 case unaligned_load:
13820 case vector_gather_load:
13821 return simd_costs->unalign_load_cost;
13822
13823 case unaligned_store:
13824 case vector_scatter_store:
13825 return simd_costs->unalign_store_cost;
13826
13827 case cond_branch_taken:
13828 return costs->cond_taken_branch_cost;
13829
13830 case cond_branch_not_taken:
13831 return costs->cond_not_taken_branch_cost;
13832
13833 case vec_perm:
13834 return simd_costs->permute_cost;
13835
13836 case vec_promote_demote:
13837 return fp ? simd_costs->fp_stmt_cost
13838 : simd_costs->int_stmt_cost;
13839
13840 case vec_construct:
13841 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
13842 return elements / 2 + 1;
13843
13844 default:
13845 gcc_unreachable ();
13846 }
13847 }
13848
13849 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13850 vectors would produce a series of LDP or STP operations. KIND is the
13851 kind of statement that STMT_INFO represents. */
13852 static bool
13853 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13854 stmt_vec_info stmt_info)
13855 {
13856 switch (kind)
13857 {
13858 case vector_load:
13859 case vector_store:
13860 case unaligned_load:
13861 case unaligned_store:
13862 break;
13863
13864 default:
13865 return false;
13866 }
13867
13868 if (aarch64_tune_params.extra_tuning_flags
13869 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13870 return false;
13871
13872 return is_gimple_assign (stmt_info->stmt);
13873 }
13874
13875 /* Return true if STMT_INFO extends the result of a load. */
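/* For example (illustrative gimple):

     _1 = *short_ptr_2;
     x_3 = (int) _1;

   Here the conversion statement extends the result of a load, so callers
   such as aarch64_sve_adjust_stmt_cost below can assume that the
   extension folds into an extending load form.  */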
13876 static bool
13877 aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
13878 {
13879 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13880 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13881 return false;
13882
13883 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13884 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13885 tree rhs_type = TREE_TYPE (rhs);
13886 if (!INTEGRAL_TYPE_P (lhs_type)
13887 || !INTEGRAL_TYPE_P (rhs_type)
13888 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13889 return false;
13890
13891 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
13892 return (def_stmt_info
13893 && STMT_VINFO_DATA_REF (def_stmt_info)
13894 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13895 }
13896
13897 /* Return true if STMT_INFO is an integer truncation. */
13898 static bool
13899 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13900 {
13901 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13902 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13903 return false;
13904
13905 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13906 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13907 return (INTEGRAL_TYPE_P (lhs_type)
13908 && INTEGRAL_TYPE_P (rhs_type)
13909 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13910 }
13911
13912 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13913 for STMT_INFO, which has cost kind KIND and which when vectorized would
13914 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
13915 targets. */
13916 static unsigned int
13917 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
13918 stmt_vec_info stmt_info, tree vectype,
13919 unsigned int stmt_cost)
13920 {
13921 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13922 vector register size or number of units. Integer promotions of this
13923 type therefore map to SXT[BHW] or UXT[BHW].
13924
13925 Most loads have extending forms that can do the sign or zero extension
13926 on the fly. Optimistically assume that a load followed by an extension
13927 will fold to this form during combine, and that the extension therefore
13928 comes for free. */
13929 if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
13930 stmt_cost = 0;
13931
13932 /* For similar reasons, vector_stmt integer truncations are no-ops,
13933 because we can just ignore the unused upper bits of the source. */
13934 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13935 stmt_cost = 0;
13936
13937 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13938 but there are no equivalent instructions for SVE. This means that
13939 (all other things being equal) 128-bit SVE needs twice as many load
13940 and store instructions as Advanced SIMD in order to process vector pairs.
13941
13942 Also, scalar code can often use LDP and STP to access pairs of values,
13943 so it is too simplistic to say that one SVE load or store replaces
13944 VF scalar loads and stores.
13945
13946 Ideally we would account for this in the scalar and Advanced SIMD
13947 costs by making suitable load/store pairs as cheap as a single
13948 load/store. However, that would be a very invasive change and in
13949 practice it tends to stress other parts of the cost model too much.
13950 E.g. stores of scalar constants currently count just a store,
13951 whereas stores of vector constants count a store and a vec_init.
13952 This is an artificial distinction for AArch64, where stores of
13953 nonzero scalar constants need the same kind of register invariant
13954 as vector stores.
13955
13956 An alternative would be to double the cost of any SVE loads and stores
13957 that could be paired in Advanced SIMD (and possibly also paired in
13958 scalar code). But this tends to stress other parts of the cost model
13959 in the same way. It also means that we can fall back to Advanced SIMD
13960 even if full-loop predication would have been useful.
13961
13962 Here we go for a more conservative version: double the costs of SVE
13963 loads and stores if one iteration of the scalar loop processes enough
13964 elements for it to use a whole number of Advanced SIMD LDP or STP
13965 instructions. This makes it very likely that the VF would be 1 for
13966 Advanced SIMD, and so no epilogue should be needed. */
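/* As a worked example (sizes illustrative): a grouped access of four
   DImode elements per scalar iteration gives count * elt_bits
   == 4 * 64 == 256 bits, a whole number of 128-bit Advanced SIMD
   register pairs, so, provided Q-register LDP/STP is not disabled by
   the tuning flags, the SVE load/store cost is doubled.  */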
13967 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13968 {
13969 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13970 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13971 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13972 if (multiple_p (count * elt_bits, 256)
13973 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13974 stmt_cost *= 2;
13975 }
13976
13977 return stmt_cost;
13978 }
13979
13980 /* Implement targetm.vectorize.add_stmt_cost. */
13981 static unsigned
13982 aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
13983 enum vect_cost_for_stmt kind,
13984 struct _stmt_vec_info *stmt_info, tree vectype,
13985 int misalign, enum vect_cost_model_location where)
13986 {
13987 unsigned *cost = (unsigned *) data;
13988 unsigned retval = 0;
13989
13990 if (flag_vect_cost_model)
13991 {
13992 int stmt_cost =
13993 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13994
13995 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13996 stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
13997 vectype, stmt_cost);
13998
13999 /* Statements in an inner loop relative to the loop being
14000 vectorized are weighted more heavily. The value here is
14001 arbitrary and could potentially be improved with analysis. */
14002 if (where == vect_body && stmt_info
14003 && stmt_in_inner_loop_p (vinfo, stmt_info))
14004 count *= 50; /* FIXME */
14005
14006 retval = (unsigned) (count * stmt_cost);
14007 cost[where] += retval;
14008 }
14009
14010 return retval;
14011 }
14012
14013 static void initialize_aarch64_code_model (struct gcc_options *);
14014
14015 /* Parse the TO_PARSE string and put the architecture struct that it
14016 selects into RES and the architectural features into ISA_FLAGS.
14017 Return an aarch64_parse_opt_result describing the parse result.
14018 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
14019 When the TO_PARSE string contains an invalid extension,
14020 a copy of the string is created and stored in INVALID_EXTENSION. */
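/* Illustrative example: for a TO_PARSE of "armv8.2-a+sve" the architecture
part is "armv8.2-a" and the remaining "+sve" is handed on to
aarch64_parse_extension. */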
14021
14022 static enum aarch64_parse_opt_result
14023 aarch64_parse_arch (const char *to_parse, const struct processor **res,
14024 uint64_t *isa_flags, std::string *invalid_extension)
14025 {
14026 const char *ext;
14027 const struct processor *arch;
14028 size_t len;
14029
14030 ext = strchr (to_parse, '+');
14031
14032 if (ext != NULL)
14033 len = ext - to_parse;
14034 else
14035 len = strlen (to_parse);
14036
14037 if (len == 0)
14038 return AARCH64_PARSE_MISSING_ARG;
14039
14040
14041 /* Loop through the list of supported ARCHes to find a match. */
14042 for (arch = all_architectures; arch->name != NULL; arch++)
14043 {
14044 if (strlen (arch->name) == len
14045 && strncmp (arch->name, to_parse, len) == 0)
14046 {
14047 uint64_t isa_temp = arch->flags;
14048
14049 if (ext != NULL)
14050 {
14051 /* TO_PARSE string contains at least one extension. */
14052 enum aarch64_parse_opt_result ext_res
14053 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
14054
14055 if (ext_res != AARCH64_PARSE_OK)
14056 return ext_res;
14057 }
14058 /* Extension parsing was successful. Confirm the result
14059 arch and ISA flags. */
14060 *res = arch;
14061 *isa_flags = isa_temp;
14062 return AARCH64_PARSE_OK;
14063 }
14064 }
14065
14066 /* ARCH name not found in list. */
14067 return AARCH64_PARSE_INVALID_ARG;
14068 }
14069
14070 /* Parse the TO_PARSE string and put the result tuning in RES and the
14071 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
14072 describing the parse result. If there is an error parsing, RES and
14073 ISA_FLAGS are left unchanged.
14074 When the TO_PARSE string contains an invalid extension,
14075 a copy of the string is created and stored in INVALID_EXTENSION. */
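/* Illustrative example: a TO_PARSE of "cortex-a57+nofp" names the Cortex-A57
core with the floating-point extension disabled. */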
14076
14077 static enum aarch64_parse_opt_result
14078 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
14079 uint64_t *isa_flags, std::string *invalid_extension)
14080 {
14081 const char *ext;
14082 const struct processor *cpu;
14083 size_t len;
14084
14085 ext = strchr (to_parse, '+');
14086
14087 if (ext != NULL)
14088 len = ext - to_parse;
14089 else
14090 len = strlen (to_parse);
14091
14092 if (len == 0)
14093 return AARCH64_PARSE_MISSING_ARG;
14094
14095
14096 /* Loop through the list of supported CPUs to find a match. */
14097 for (cpu = all_cores; cpu->name != NULL; cpu++)
14098 {
14099 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
14100 {
14101 uint64_t isa_temp = cpu->flags;
14102
14103
14104 if (ext != NULL)
14105 {
14106 /* TO_PARSE string contains at least one extension. */
14107 enum aarch64_parse_opt_result ext_res
14108 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
14109
14110 if (ext_res != AARCH64_PARSE_OK)
14111 return ext_res;
14112 }
14113 /* Extension parsing was successful. Confirm the result
14114 cpu and ISA flags. */
14115 *res = cpu;
14116 *isa_flags = isa_temp;
14117 return AARCH64_PARSE_OK;
14118 }
14119 }
14120
14121 /* CPU name not found in list. */
14122 return AARCH64_PARSE_INVALID_ARG;
14123 }
14124
14125 /* Parse the TO_PARSE string and put the cpu it selects into RES.
14126 Return an aarch64_parse_opt_result describing the parse result.
14127 If parsing fails, RES is left unchanged. */
14128
14129 static enum aarch64_parse_opt_result
14130 aarch64_parse_tune (const char *to_parse, const struct processor **res)
14131 {
14132 const struct processor *cpu;
14133
14134 /* Loop through the list of supported CPUs to find a match. */
14135 for (cpu = all_cores; cpu->name != NULL; cpu++)
14136 {
14137 if (strcmp (cpu->name, to_parse) == 0)
14138 {
14139 *res = cpu;
14140 return AARCH64_PARSE_OK;
14141 }
14142 }
14143
14144 /* CPU name not found in list. */
14145 return AARCH64_PARSE_INVALID_ARG;
14146 }
14147
14148 /* Parse TOKEN, which has length LENGTH, to see if it is an option
14149 described in FLAG. If it is, return the index bit for that fusion type.
14150 If not, error (printing OPTION_NAME) and return zero. */
14151
14152 static unsigned int
14153 aarch64_parse_one_option_token (const char *token,
14154 size_t length,
14155 const struct aarch64_flag_desc *flag,
14156 const char *option_name)
14157 {
14158 for (; flag->name != NULL; flag++)
14159 {
14160 if (length == strlen (flag->name)
14161 && !strncmp (flag->name, token, length))
14162 return flag->flag;
14163 }
14164
14165 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
14166 return 0;
14167 }
14168
14169 /* Parse OPTION which is a comma-separated list of flags to enable.
14170 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14171 default state we inherit from the CPU tuning structures. OPTION_NAME
14172 gives the top-level option we are parsing in the -moverride string,
14173 for use in error messages. */
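/* The OPTION string is a '.'-separated list of flag names, for example
"adrp+add.cmp+branch" (illustrative; the valid names come from the FLAGS
table, e.g. aarch64_fusible_pairs or aarch64_tuning_flags). */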
14174
14175 static unsigned int
14176 aarch64_parse_boolean_options (const char *option,
14177 const struct aarch64_flag_desc *flags,
14178 unsigned int initial_state,
14179 const char *option_name)
14180 {
14181 const char separator = '.';
14182 const char* specs = option;
14183 const char* ntoken = option;
14184 unsigned int found_flags = initial_state;
14185
14186 while ((ntoken = strchr (specs, separator)))
14187 {
14188 size_t token_length = ntoken - specs;
14189 unsigned token_ops = aarch64_parse_one_option_token (specs,
14190 token_length,
14191 flags,
14192 option_name);
14193 /* If we find "none" (or, for simplicity's sake, an error) anywhere
14194 in the token stream, reset the supported operations. So:
14195
14196 adrp+add.cmp+branch.none.adrp+add
14197
14198 would have the result of turning on only adrp+add fusion. */
14199 if (!token_ops)
14200 found_flags = 0;
14201
14202 found_flags |= token_ops;
14203 specs = ++ntoken;
14204 }
14205
14206 /* If the string ended with the separator, it is ill-formed. */
14207 if (!(*specs))
14208 {
14209 error ("%s string ill-formed", option_name);
14210 return 0;
14211 }
14212
14213 /* We still have one more token to parse. */
14214 size_t token_length = strlen (specs);
14215 unsigned token_ops = aarch64_parse_one_option_token (specs,
14216 token_length,
14217 flags,
14218 option_name);
14219 if (!token_ops)
14220 found_flags = 0;
14221
14222 found_flags |= token_ops;
14223 return found_flags;
14224 }
14225
14226 /* Support for overriding instruction fusion. */
14227
14228 static void
14229 aarch64_parse_fuse_string (const char *fuse_string,
14230 struct tune_params *tune)
14231 {
14232 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14233 aarch64_fusible_pairs,
14234 tune->fusible_ops,
14235 "fuse=");
14236 }
14237
14238 /* Support for overriding other tuning flags. */
14239
14240 static void
14241 aarch64_parse_tune_string (const char *tune_string,
14242 struct tune_params *tune)
14243 {
14244 tune->extra_tuning_flags
14245 = aarch64_parse_boolean_options (tune_string,
14246 aarch64_tuning_flags,
14247 tune->extra_tuning_flags,
14248 "tune=");
14249 }
14250
14251 /* Parse the sve_width tuning moverride string in TUNE_STRING.
14252 Accept the valid SVE vector widths allowed by
14253 aarch64_sve_vector_bits_enum and use it to override sve_width
14254 in TUNE. */
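/* For example, -moverride=sve_width=256 sets TUNE->sve_width to SVE_256
(an illustrative value; any of 128, 256, 512, 1024 or 2048 is accepted). */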
14255
14256 static void
14257 aarch64_parse_sve_width_string (const char *tune_string,
14258 struct tune_params *tune)
14259 {
14260 int width = -1;
14261
14262 int n = sscanf (tune_string, "%d", &width);
14263 if (n == EOF)
14264 {
14265 error ("invalid format for sve_width");
14266 return;
14267 }
14268 switch (width)
14269 {
14270 case SVE_128:
14271 case SVE_256:
14272 case SVE_512:
14273 case SVE_1024:
14274 case SVE_2048:
14275 break;
14276 default:
14277 error ("invalid sve_width value: %d", width);
14278 }
14279 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14280 }
14281
14282 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
14283 we understand. If it is, extract the option string and hand it off to
14284 the appropriate function. */
14285
14286 void
14287 aarch64_parse_one_override_token (const char* token,
14288 size_t length,
14289 struct tune_params *tune)
14290 {
14291 const struct aarch64_tuning_override_function *fn
14292 = aarch64_tuning_override_functions;
14293
14294 const char *option_part = strchr (token, '=');
14295 if (!option_part)
14296 {
14297 error ("tuning string missing in option (%s)", token);
14298 return;
14299 }
14300
14301 /* Get the length of the option name. */
14302 length = option_part - token;
14303 /* Skip the '=' to get to the option string. */
14304 option_part++;
14305
14306 for (; fn->name != NULL; fn++)
14307 {
14308 if (!strncmp (fn->name, token, length))
14309 {
14310 fn->parse_override (option_part, tune);
14311 return;
14312 }
14313 }
14314
14315 error ("unknown tuning option (%s)", token);
14316 return;
14317 }
14318
14319 /* Validate and clamp the TLS size according to the code model in OPTS. */
14320
14321 static void
14322 initialize_aarch64_tls_size (struct gcc_options *opts)
14323 {
14324 if (aarch64_tls_size == 0)
14325 aarch64_tls_size = 24;
14326
14327 switch (opts->x_aarch64_cmodel_var)
14328 {
14329 case AARCH64_CMODEL_TINY:
14330 /* The default and the maximum TLS size allowed under tiny are both 1M, which
14331 needs two instructions to address, so we clamp the size to 24 bits. */
14332 if (aarch64_tls_size > 24)
14333 aarch64_tls_size = 24;
14334 break;
14335 case AARCH64_CMODEL_SMALL:
14336 /* The maximum TLS size allowed under small is 4G. */
14337 if (aarch64_tls_size > 32)
14338 aarch64_tls_size = 32;
14339 break;
14340 case AARCH64_CMODEL_LARGE:
14341 /* The maximum TLS size allowed under large is 16E.
14342 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
14343 if (aarch64_tls_size > 48)
14344 aarch64_tls_size = 48;
14345 break;
14346 default:
14347 gcc_unreachable ();
14348 }
14349
14350 return;
14351 }
14352
14353 /* Parse STRING looking for options in the format:
14354 string :: option:string
14355 option :: name=substring
14356 name :: {a-z}
14357 substring :: defined by option. */
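/* For example, "fuse=adrp+add:sve_width=256" contains two options separated
by ':', each of which is handed to its own parser (the values shown are
purely illustrative). */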
14358
14359 static void
14360 aarch64_parse_override_string (const char* input_string,
14361 struct tune_params* tune)
14362 {
14363 const char separator = ':';
14364 size_t string_length = strlen (input_string) + 1;
14365 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14366 char *string = string_root;
14367 strncpy (string, input_string, string_length);
14368 string[string_length - 1] = '\0';
14369
14370 char* ntoken = string;
14371
14372 while ((ntoken = strchr (string, separator)))
14373 {
14374 size_t token_length = ntoken - string;
14375 /* Make this substring look like a string. */
14376 *ntoken = '\0';
14377 aarch64_parse_one_override_token (string, token_length, tune);
14378 string = ++ntoken;
14379 }
14380
14381 /* One last option to parse. */
14382 aarch64_parse_one_override_token (string, strlen (string), tune);
14383 free (string_root);
14384 }
14385
14386
14387 static void
14388 aarch64_override_options_after_change_1 (struct gcc_options *opts)
14389 {
14390 if (accepted_branch_protection_string)
14391 {
14392 opts->x_aarch64_branch_protection_string
14393 = xstrdup (accepted_branch_protection_string);
14394 }
14395
14396 /* PR 70044: We have to be careful about being called multiple times for the
14397 same function. This means all changes should be repeatable. */
14398
14399 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14400 Disable the frame pointer flag so the mid-end will not use a frame
14401 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14402 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14403 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14404 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14405 if (opts->x_flag_omit_frame_pointer == 0)
14406 opts->x_flag_omit_frame_pointer = 2;
14407
14408 /* If not optimizing for size, set the default
14409 alignment to what the target wants. */
14410 if (!opts->x_optimize_size)
14411 {
14412 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14413 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14414 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14415 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14416 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14417 opts->x_str_align_functions = aarch64_tune_params.function_align;
14418 }
14419
14420 /* We default to no pc-relative literal loads. */
14421
14422 aarch64_pcrelative_literal_loads = false;
14423
14424 /* If -mpc-relative-literal-loads is set on the command line, this
14425 implies that the user asked for PC relative literal loads. */
14426 if (opts->x_pcrelative_literal_loads == 1)
14427 aarch64_pcrelative_literal_loads = true;
14428
14429 /* In the tiny memory model it makes no sense to disallow PC relative
14430 literal pool loads. */
14431 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14432 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14433 aarch64_pcrelative_literal_loads = true;
14434
14435 /* When enabling the lower precision Newton series for the square root, also
14436 enable it for the reciprocal square root, since the latter is an
14437 intermediary step for the former. */
14438 if (flag_mlow_precision_sqrt)
14439 flag_mrecip_low_precision_sqrt = true;
14440 }
14441
14442 /* 'Unpack' the internal tuning structs and update the options
14443 in OPTS. The caller must have set up selected_tune and selected_arch
14444 as all the other target-specific codegen decisions are
14445 derived from them. */
14446
14447 void
14448 aarch64_override_options_internal (struct gcc_options *opts)
14449 {
14450 aarch64_tune_flags = selected_tune->flags;
14451 aarch64_tune = selected_tune->sched_core;
14452 /* Make a copy of the tuning parameters attached to the core, which
14453 we may later overwrite. */
14454 aarch64_tune_params = *(selected_tune->tune);
14455 aarch64_architecture_version = selected_arch->architecture_version;
14456
14457 if (opts->x_aarch64_override_tune_string)
14458 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14459 &aarch64_tune_params);
14460
14461 /* This target defaults to strict volatile bitfields. */
14462 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14463 opts->x_flag_strict_volatile_bitfields = 1;
14464
14465 if (aarch64_stack_protector_guard == SSP_GLOBAL
14466 && opts->x_aarch64_stack_protector_guard_offset_str)
14467 {
14468 error ("incompatible options %<-mstack-protector-guard=global%> and "
14469 "%<-mstack-protector-guard-offset=%s%>",
14470 aarch64_stack_protector_guard_offset_str);
14471 }
14472
14473 if (aarch64_stack_protector_guard == SSP_SYSREG
14474 && !(opts->x_aarch64_stack_protector_guard_offset_str
14475 && opts->x_aarch64_stack_protector_guard_reg_str))
14476 {
14477 error ("both %<-mstack-protector-guard-offset%> and "
14478 "%<-mstack-protector-guard-reg%> must be used "
14479 "with %<-mstack-protector-guard=sysreg%>");
14480 }
14481
14482 if (opts->x_aarch64_stack_protector_guard_reg_str)
14483 {
14484 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14485 error ("specify a system register with a small string length");
14486 }
14487
14488 if (opts->x_aarch64_stack_protector_guard_offset_str)
14489 {
14490 char *end;
14491 const char *str = aarch64_stack_protector_guard_offset_str;
14492 errno = 0;
14493 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14494 if (!*str || *end || errno)
14495 error ("%qs is not a valid offset in %qs", str,
14496 "-mstack-protector-guard-offset=");
14497 aarch64_stack_protector_guard_offset = offs;
14498 }
14499
14500 initialize_aarch64_code_model (opts);
14501 initialize_aarch64_tls_size (opts);
14502
14503 int queue_depth = 0;
14504 switch (aarch64_tune_params.autoprefetcher_model)
14505 {
14506 case tune_params::AUTOPREFETCHER_OFF:
14507 queue_depth = -1;
14508 break;
14509 case tune_params::AUTOPREFETCHER_WEAK:
14510 queue_depth = 0;
14511 break;
14512 case tune_params::AUTOPREFETCHER_STRONG:
14513 queue_depth = max_insn_queue_index + 1;
14514 break;
14515 default:
14516 gcc_unreachable ();
14517 }
14518
14519 /* We don't mind passing in global_options_set here as we don't use
14520 the *options_set structs anyway. */
14521 SET_OPTION_IF_UNSET (opts, &global_options_set,
14522 param_sched_autopref_queue_depth, queue_depth);
14523
14524 /* If only Advanced SIMD is to be used for autovectorization, disable the
14525 SVE vs. Advanced SIMD cost comparison. */
14526 if (aarch64_autovec_preference == 1)
14527 SET_OPTION_IF_UNSET (opts, &global_options_set,
14528 aarch64_sve_compare_costs, 0);
14529
14530 /* Set up parameters to be used in the prefetching algorithm. Do not
14531 override the defaults unless we are tuning for a core we have
14532 researched values for. */
14533 if (aarch64_tune_params.prefetch->num_slots > 0)
14534 SET_OPTION_IF_UNSET (opts, &global_options_set,
14535 param_simultaneous_prefetches,
14536 aarch64_tune_params.prefetch->num_slots);
14537 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14538 SET_OPTION_IF_UNSET (opts, &global_options_set,
14539 param_l1_cache_size,
14540 aarch64_tune_params.prefetch->l1_cache_size);
14541 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14542 SET_OPTION_IF_UNSET (opts, &global_options_set,
14543 param_l1_cache_line_size,
14544 aarch64_tune_params.prefetch->l1_cache_line_size);
14545 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14546 SET_OPTION_IF_UNSET (opts, &global_options_set,
14547 param_l2_cache_size,
14548 aarch64_tune_params.prefetch->l2_cache_size);
14549 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14550 SET_OPTION_IF_UNSET (opts, &global_options_set,
14551 param_prefetch_dynamic_strides, 0);
14552 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14553 SET_OPTION_IF_UNSET (opts, &global_options_set,
14554 param_prefetch_minimum_stride,
14555 aarch64_tune_params.prefetch->minimum_stride);
14556
14557 /* Use the alternative scheduling-pressure algorithm by default. */
14558 SET_OPTION_IF_UNSET (opts, &global_options_set,
14559 param_sched_pressure_algorithm,
14560 SCHED_PRESSURE_MODEL);
14561
14562 /* Validate the guard size. */
14563 int guard_size = param_stack_clash_protection_guard_size;
14564
14565 if (guard_size != 12 && guard_size != 16)
14566 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14567 "size. Given value %d (%llu KB) is out of range",
14568 guard_size, (1ULL << guard_size) / 1024ULL);
14569
14570 /* Enforce that the probing interval is the same as the guard size so the
14571 mid-end does the right thing. */
14572 SET_OPTION_IF_UNSET (opts, &global_options_set,
14573 param_stack_clash_protection_probe_interval,
14574 guard_size);
14575
14576 /* The maybe_set calls won't update the value if the user has explicitly set
14577 one, which means we need to validate that the probing interval and guard size
14578 are equal. */
14579 int probe_interval
14580 = param_stack_clash_protection_probe_interval;
14581 if (guard_size != probe_interval)
14582 error ("stack clash guard size %<%d%> must be equal to probing interval "
14583 "%<%d%>", guard_size, probe_interval);
14584
14585 /* Enable software prefetching at the specified optimization level for
14586 CPUs that have prefetch tuning. Lower the optimization level
14587 threshold by 1 when profiling is enabled. */
14588 if (opts->x_flag_prefetch_loop_arrays < 0
14589 && !opts->x_optimize_size
14590 && aarch64_tune_params.prefetch->default_opt_level >= 0
14591 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14592 opts->x_flag_prefetch_loop_arrays = 1;
14593
14594 if (opts->x_aarch64_arch_string == NULL)
14595 opts->x_aarch64_arch_string = selected_arch->name;
14596 if (opts->x_aarch64_cpu_string == NULL)
14597 opts->x_aarch64_cpu_string = selected_cpu->name;
14598 if (opts->x_aarch64_tune_string == NULL)
14599 opts->x_aarch64_tune_string = selected_tune->name;
14600
14601 aarch64_override_options_after_change_1 (opts);
14602 }
14603
14604 /* Print a hint with a suggestion for a core or architecture name that
14605 most closely resembles what the user passed in STR. ARCH is true if
14606 the user is asking for an architecture name. ARCH is false if the user
14607 is asking for a core name. */
14608
14609 static void
14610 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14611 {
14612 auto_vec<const char *> candidates;
14613 const struct processor *entry = arch ? all_architectures : all_cores;
14614 for (; entry->name != NULL; entry++)
14615 candidates.safe_push (entry->name);
14616
14617 #ifdef HAVE_LOCAL_CPU_DETECT
14618 /* Also add "native" as a possible value. */
14619 if (arch)
14620 candidates.safe_push ("native");
14621 #endif
14622
14623 char *s;
14624 const char *hint = candidates_list_and_hint (str, s, candidates);
14625 if (hint)
14626 inform (input_location, "valid arguments are: %s;"
14627 " did you mean %qs?", s, hint);
14628 else
14629 inform (input_location, "valid arguments are: %s", s);
14630
14631 XDELETEVEC (s);
14632 }
14633
14634 /* Print a hint with a suggestion for a core name that most closely resembles
14635 what the user passed in STR. */
14636
14637 inline static void
14638 aarch64_print_hint_for_core (const char *str)
14639 {
14640 aarch64_print_hint_for_core_or_arch (str, false);
14641 }
14642
14643 /* Print a hint with a suggestion for an architecture name that most closely
14644 resembles what the user passed in STR. */
14645
14646 inline static void
14647 aarch64_print_hint_for_arch (const char *str)
14648 {
14649 aarch64_print_hint_for_core_or_arch (str, true);
14650 }
14651
14652
14653 /* Print a hint with a suggestion for an extension name
14654 that most closely resembles what the user passed in STR. */
14655
14656 void
14657 aarch64_print_hint_for_extensions (const std::string &str)
14658 {
14659 auto_vec<const char *> candidates;
14660 aarch64_get_all_extension_candidates (&candidates);
14661 char *s;
14662 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14663 if (hint)
14664 inform (input_location, "valid arguments are: %s;"
14665 " did you mean %qs?", s, hint);
14666 else
14667 inform (input_location, "valid arguments are: %s", s);
14668
14669 XDELETEVEC (s);
14670 }
14671
14672 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14673 specified in STR and report errors as appropriate. Put the results, if
14674 they are valid, in RES and ISA_FLAGS. Return whether the option is
14675 valid. */
14676
14677 static bool
14678 aarch64_validate_mcpu (const char *str, const struct processor **res,
14679 uint64_t *isa_flags)
14680 {
14681 std::string invalid_extension;
14682 enum aarch64_parse_opt_result parse_res
14683 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14684
14685 if (parse_res == AARCH64_PARSE_OK)
14686 return true;
14687
14688 switch (parse_res)
14689 {
14690 case AARCH64_PARSE_MISSING_ARG:
14691 error ("missing cpu name in %<-mcpu=%s%>", str);
14692 break;
14693 case AARCH64_PARSE_INVALID_ARG:
14694 error ("unknown value %qs for %<-mcpu%>", str);
14695 aarch64_print_hint_for_core (str);
14696 break;
14697 case AARCH64_PARSE_INVALID_FEATURE:
14698 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14699 invalid_extension.c_str (), str);
14700 aarch64_print_hint_for_extensions (invalid_extension);
14701 break;
14702 default:
14703 gcc_unreachable ();
14704 }
14705
14706 return false;
14707 }
14708
14709 /* Straight line speculation indicators. */
14710 enum aarch64_sls_hardening_type
14711 {
14712 SLS_NONE = 0,
14713 SLS_RETBR = 1,
14714 SLS_BLR = 2,
14715 SLS_ALL = 3,
14716 };
14717 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14718
14719 /* Return whether we should mitigate Straight Line Speculation for the RET
14720 and BR instructions. */
14721 bool
14722 aarch64_harden_sls_retbr_p (void)
14723 {
14724 return aarch64_sls_hardening & SLS_RETBR;
14725 }
14726
14727 /* Return whether we should mitigate Straight Line Speculation for the BLR
14728 instruction. */
14729 bool
14730 aarch64_harden_sls_blr_p (void)
14731 {
14732 return aarch64_sls_hardening & SLS_BLR;
14733 }
14734
14735 /* For now we only allow setting these options globally; in the future we may
14736 allow setting them per function. */
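/* The accepted values are "none", "all", or a comma-separated list of
"retbr" and "blr", e.g. -mharden-sls=retbr,blr. */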
14737 static void
14738 aarch64_validate_sls_mitigation (const char *const_str)
14739 {
14740 char *token_save = NULL;
14741 char *str = NULL;
14742
14743 if (strcmp (const_str, "none") == 0)
14744 {
14745 aarch64_sls_hardening = SLS_NONE;
14746 return;
14747 }
14748 if (strcmp (const_str, "all") == 0)
14749 {
14750 aarch64_sls_hardening = SLS_ALL;
14751 return;
14752 }
14753
14754 char *str_root = xstrdup (const_str);
14755 str = strtok_r (str_root, ",", &token_save);
14756 if (!str)
14757 error ("invalid argument given to %<-mharden-sls=%>");
14758
14759 int temp = SLS_NONE;
14760 while (str)
14761 {
14762 if (strcmp (str, "blr") == 0)
14763 temp |= SLS_BLR;
14764 else if (strcmp (str, "retbr") == 0)
14765 temp |= SLS_RETBR;
14766 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14767 {
14768 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14769 break;
14770 }
14771 else
14772 {
14773 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14774 break;
14775 }
14776 str = strtok_r (NULL, ",", &token_save);
14777 }
14778 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14779 free (str_root);
14780 }
14781
14782 /* Parses CONST_STR for branch protection features specified in
14783 aarch64_branch_protect_types, and sets any global variables required. Returns
14784 the parsing result and assigns LAST_STR to the last processed token from
14785 CONST_STR so that it can be used for error reporting. */
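/* For example, a CONST_STR of "pac-ret+leaf" selects the "pac-ret" type
together with its "leaf" subtype (illustrative values; the valid names are
defined in aarch64_branch_protect_types). */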
14786
14787 static enum
14788 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
14789 char** last_str)
14790 {
14791 char *str_root = xstrdup (const_str);
14792 char* token_save = NULL;
14793 char *str = strtok_r (str_root, "+", &token_save);
14794 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14795 if (!str)
14796 res = AARCH64_PARSE_MISSING_ARG;
14797 else
14798 {
14799 char *next_str = strtok_r (NULL, "+", &token_save);
14800 /* Reset the branch protection features to their defaults. */
14801 aarch64_handle_no_branch_protection (NULL, NULL);
14802
14803 while (str && res == AARCH64_PARSE_OK)
14804 {
14805 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14806 bool found = false;
14807 /* Search for this type. */
14808 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14809 {
14810 if (strcmp (str, type->name) == 0)
14811 {
14812 found = true;
14813 res = type->handler (str, next_str);
14814 str = next_str;
14815 next_str = strtok_r (NULL, "+", &token_save);
14816 }
14817 else
14818 type++;
14819 }
14820 if (found && res == AARCH64_PARSE_OK)
14821 {
14822 bool found_subtype = true;
14823 /* Loop through each token until we find one that isn't a
14824 subtype. */
14825 while (found_subtype)
14826 {
14827 found_subtype = false;
14828 const aarch64_branch_protect_type *subtype = type->subtypes;
14829 /* Search for the subtype. */
14830 while (str && subtype && subtype->name && !found_subtype
14831 && res == AARCH64_PARSE_OK)
14832 {
14833 if (strcmp (str, subtype->name) == 0)
14834 {
14835 found_subtype = true;
14836 res = subtype->handler (str, next_str);
14837 str = next_str;
14838 next_str = strtok_r (NULL, "+", &token_save);
14839 }
14840 else
14841 subtype++;
14842 }
14843 }
14844 }
14845 else if (!found)
14846 res = AARCH64_PARSE_INVALID_ARG;
14847 }
14848 }
14849 /* Copy the last processed token into the argument to pass it back.
14850 Used by option and attribute validation to print the offending token. */
14851 if (last_str)
14852 {
14853 if (str) strcpy (*last_str, str);
14854 else *last_str = NULL;
14855 }
14856 if (res == AARCH64_PARSE_OK)
14857 {
14858 /* If needed, allocate the accepted string, then copy in const_str.
14859 Used by aarch64_override_options_after_change_1. */
14860 if (!accepted_branch_protection_string)
14861 accepted_branch_protection_string = (char *) xmalloc (
14862 BRANCH_PROTECT_STR_MAX
14863 + 1);
14864 strncpy (accepted_branch_protection_string, const_str,
14865 BRANCH_PROTECT_STR_MAX + 1);
14866 /* Forcibly null-terminate. */
14867 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14868 }
14869 return res;
14870 }
14871
14872 static bool
14873 aarch64_validate_mbranch_protection (const char *const_str)
14874 {
14875 char *str = (char *) xmalloc (strlen (const_str) + 1);
14876 enum aarch64_parse_opt_result res =
14877 aarch64_parse_branch_protection (const_str, &str);
14878 if (res == AARCH64_PARSE_INVALID_ARG)
14879 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14880 else if (res == AARCH64_PARSE_MISSING_ARG)
14881 error ("missing argument for %<-mbranch-protection=%>");
14882 free (str);
14883 return res == AARCH64_PARSE_OK;
14884 }
14885
14886 /* Validate a command-line -march option. Parse the arch and extensions
14887 (if any) specified in STR and report errors as appropriate. Put the
14888 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14889 option is valid. */
14890
14891 static bool
14892 aarch64_validate_march (const char *str, const struct processor **res,
14893 uint64_t *isa_flags)
14894 {
14895 std::string invalid_extension;
14896 enum aarch64_parse_opt_result parse_res
14897 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14898
14899 if (parse_res == AARCH64_PARSE_OK)
14900 return true;
14901
14902 switch (parse_res)
14903 {
14904 case AARCH64_PARSE_MISSING_ARG:
14905 error ("missing arch name in %<-march=%s%>", str);
14906 break;
14907 case AARCH64_PARSE_INVALID_ARG:
14908 error ("unknown value %qs for %<-march%>", str);
14909 aarch64_print_hint_for_arch (str);
14910 break;
14911 case AARCH64_PARSE_INVALID_FEATURE:
14912 error ("invalid feature modifier %qs in %<-march=%s%>",
14913 invalid_extension.c_str (), str);
14914 aarch64_print_hint_for_extensions (invalid_extension);
14915 break;
14916 default:
14917 gcc_unreachable ();
14918 }
14919
14920 return false;
14921 }
14922
14923 /* Validate a command-line -mtune option. Parse the cpu
14924 specified in STR and report errors as appropriate. Put the
14925 result, if it is valid, in RES. Return whether the option is
14926 valid. */
14927
14928 static bool
14929 aarch64_validate_mtune (const char *str, const struct processor **res)
14930 {
14931 enum aarch64_parse_opt_result parse_res
14932 = aarch64_parse_tune (str, res);
14933
14934 if (parse_res == AARCH64_PARSE_OK)
14935 return true;
14936
14937 switch (parse_res)
14938 {
14939 case AARCH64_PARSE_MISSING_ARG:
14940 error ("missing cpu name in %<-mtune=%s%>", str);
14941 break;
14942 case AARCH64_PARSE_INVALID_ARG:
14943 error ("unknown value %qs for %<-mtune%>", str);
14944 aarch64_print_hint_for_core (str);
14945 break;
14946 default:
14947 gcc_unreachable ();
14948 }
14949 return false;
14950 }
14951
14952 /* Return the CPU corresponding to the enum CPU.
14953 If it doesn't specify a CPU, return the default. */
14954
14955 static const struct processor *
14956 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14957 {
14958 if (cpu != aarch64_none)
14959 return &all_cores[cpu];
14960
14961 /* The & 0x3f is to extract the bottom 6 bits that encode the
14962 default cpu as selected by the --with-cpu GCC configure option
14963 in config.gcc.
14964 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14965 flags mechanism should be reworked to make it more sane. */
14966 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14967 }
14968
14969 /* Return the architecture corresponding to the enum ARCH.
14970 If it doesn't specify a valid architecture, return the default. */
14971
14972 static const struct processor *
14973 aarch64_get_arch (enum aarch64_arch arch)
14974 {
14975 if (arch != aarch64_no_arch)
14976 return &all_architectures[arch];
14977
14978 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14979
14980 return &all_architectures[cpu->arch];
14981 }
14982
14983 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
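/* For the fixed-length cases below this is simply bits / 64; for example
-msve-vector-bits=256 gives a VG of 4. */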
14984
14985 static poly_uint16
14986 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14987 {
14988 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14989 on big-endian targets, so we would need to forbid subregs that convert
14990 from one to the other. By default a reinterpret sequence would then
14991 involve a store to memory in one mode and a load back in the other.
14992 Even if we optimize that sequence using reverse instructions,
14993 it would still be a significant potential overhead.
14994
14995 For now, it seems better to generate length-agnostic code for that
14996 case instead. */
14997 if (value == SVE_SCALABLE
14998 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14999 return poly_uint16 (2, 2);
15000 else
15001 return (int) value / 64;
15002 }
15003
15004 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
15005 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
15006 tuning structs. In particular it must set selected_tune and
15007 aarch64_isa_flags that define the available ISA features and tuning
15008 decisions. It must also set selected_arch as this will be used to
15009 output the .arch asm tags for each function. */
15010
15011 static void
15012 aarch64_override_options (void)
15013 {
15014 uint64_t cpu_isa = 0;
15015 uint64_t arch_isa = 0;
15016 aarch64_isa_flags = 0;
15017
15018 bool valid_cpu = true;
15019 bool valid_tune = true;
15020 bool valid_arch = true;
15021
15022 selected_cpu = NULL;
15023 selected_arch = NULL;
15024 selected_tune = NULL;
15025
15026 if (aarch64_harden_sls_string)
15027 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
15028
15029 if (aarch64_branch_protection_string)
15030 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
15031
15032 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
15033 If either of -march or -mtune is given, they override their
15034 respective component of -mcpu. */
15035 if (aarch64_cpu_string)
15036 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
15037 &cpu_isa);
15038
15039 if (aarch64_arch_string)
15040 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
15041 &arch_isa);
15042
15043 if (aarch64_tune_string)
15044 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
15045
15046 #ifdef SUBTARGET_OVERRIDE_OPTIONS
15047 SUBTARGET_OVERRIDE_OPTIONS;
15048 #endif
15049
15050 /* If the user did not specify a processor, choose the default
15051 one for them. This will be the CPU set during configuration using
15052 --with-cpu, otherwise it is "generic". */
15053 if (!selected_cpu)
15054 {
15055 if (selected_arch)
15056 {
15057 selected_cpu = &all_cores[selected_arch->ident];
15058 aarch64_isa_flags = arch_isa;
15059 explicit_arch = selected_arch->arch;
15060 }
15061 else
15062 {
15063 /* Get default configure-time CPU. */
15064 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
15065 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
15066 }
15067
15068 if (selected_tune)
15069 explicit_tune_core = selected_tune->ident;
15070 }
15071 /* If both -mcpu and -march are specified check that they are architecturally
15072 compatible, warn if they're not and prefer the -march ISA flags. */
15073 else if (selected_arch)
15074 {
15075 if (selected_arch->arch != selected_cpu->arch)
15076 {
15077 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
15078 aarch64_cpu_string,
15079 aarch64_arch_string);
15080 }
15081 aarch64_isa_flags = arch_isa;
15082 explicit_arch = selected_arch->arch;
15083 explicit_tune_core = selected_tune ? selected_tune->ident
15084 : selected_cpu->ident;
15085 }
15086 else
15087 {
15088 /* -mcpu but no -march. */
15089 aarch64_isa_flags = cpu_isa;
15090 explicit_tune_core = selected_tune ? selected_tune->ident
15091 : selected_cpu->ident;
15092 gcc_assert (selected_cpu);
15093 selected_arch = &all_architectures[selected_cpu->arch];
15094 explicit_arch = selected_arch->arch;
15095 }
15096
15097 /* Set the arch as well, since we will need it when outputting
15098 the .arch directive in assembly. */
15099 if (!selected_arch)
15100 {
15101 gcc_assert (selected_cpu);
15102 selected_arch = &all_architectures[selected_cpu->arch];
15103 }
15104
15105 if (!selected_tune)
15106 selected_tune = selected_cpu;
15107
15108 if (aarch64_enable_bti == 2)
15109 {
15110 #ifdef TARGET_ENABLE_BTI
15111 aarch64_enable_bti = 1;
15112 #else
15113 aarch64_enable_bti = 0;
15114 #endif
15115 }
15116
15117 /* Return address signing is currently not supported for ILP32 targets. For
15118 LP64 targets use the configured option in the absence of a command-line
15119 option for -mbranch-protection. */
15120 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
15121 {
15122 #ifdef TARGET_ENABLE_PAC_RET
15123 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
15124 #else
15125 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
15126 #endif
15127 }
15128
15129 #ifndef HAVE_AS_MABI_OPTION
15130 /* The compiler may have been configured with 2.23.* binutils, which does
15131 not have support for ILP32. */
15132 if (TARGET_ILP32)
15133 error ("assembler does not support %<-mabi=ilp32%>");
15134 #endif
15135
15136 /* Convert -msve-vector-bits to a VG count. */
15137 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
15138
15139 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
15140 sorry ("return address signing is only supported for %<-mabi=lp64%>");
15141
15142 /* Make sure we properly set up the explicit options. */
15143 if ((aarch64_cpu_string && valid_cpu)
15144 || (aarch64_tune_string && valid_tune))
15145 gcc_assert (explicit_tune_core != aarch64_none);
15146
15147 if ((aarch64_cpu_string && valid_cpu)
15148 || (aarch64_arch_string && valid_arch))
15149 gcc_assert (explicit_arch != aarch64_no_arch);
15150
15151 /* The pass to insert speculation tracking runs before
15152 shrink-wrapping and the latter does not know how to update the
15153 tracking status. So disable it in this case. */
15154 if (aarch64_track_speculation)
15155 flag_shrink_wrap = 0;
15156
15157 aarch64_override_options_internal (&global_options);
15158
15159 /* Save these options as the default ones in case we push and pop them later
15160 while processing functions with potential target attributes. */
15161 target_option_default_node = target_option_current_node
15162 = build_target_option_node (&global_options, &global_options_set);
15163 }
15164
15165 /* Implement targetm.override_options_after_change. */
15166
15167 static void
15168 aarch64_override_options_after_change (void)
15169 {
15170 aarch64_override_options_after_change_1 (&global_options);
15171 }
15172
15173 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
15174 static char *
15175 aarch64_offload_options (void)
15176 {
15177 if (TARGET_ILP32)
15178 return xstrdup ("-foffload-abi=ilp32");
15179 else
15180 return xstrdup ("-foffload-abi=lp64");
15181 }
15182
15183 static struct machine_function *
15184 aarch64_init_machine_status (void)
15185 {
15186 struct machine_function *machine;
15187 machine = ggc_cleared_alloc<machine_function> ();
15188 return machine;
15189 }
15190
15191 void
15192 aarch64_init_expanders (void)
15193 {
15194 init_machine_status = aarch64_init_machine_status;
15195 }
15196
15197 /* Select the code model to use based on the command-line options in OPTS. */
15198 static void
15199 initialize_aarch64_code_model (struct gcc_options *opts)
15200 {
15201 aarch64_cmodel = opts->x_aarch64_cmodel_var;
15202 switch (opts->x_aarch64_cmodel_var)
15203 {
15204 case AARCH64_CMODEL_TINY:
15205 if (opts->x_flag_pic)
15206 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15207 break;
15208 case AARCH64_CMODEL_SMALL:
15209 if (opts->x_flag_pic)
15210 {
15211 #ifdef HAVE_AS_SMALL_PIC_RELOCS
15212 aarch64_cmodel = (flag_pic == 2
15213 ? AARCH64_CMODEL_SMALL_PIC
15214 : AARCH64_CMODEL_SMALL_SPIC);
15215 #else
15216 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
15217 #endif
15218 }
15219 break;
15220 case AARCH64_CMODEL_LARGE:
15221 if (opts->x_flag_pic)
15222 sorry ("code model %qs with %<-f%s%>", "large",
15223 opts->x_flag_pic > 1 ? "PIC" : "pic");
15224 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15225 sorry ("code model %qs not supported in ilp32 mode", "large");
15226 break;
15227 case AARCH64_CMODEL_TINY_PIC:
15228 case AARCH64_CMODEL_SMALL_PIC:
15229 case AARCH64_CMODEL_SMALL_SPIC:
15230 gcc_unreachable ();
15231 }
15232 }
15233
15234 /* Implement TARGET_OPTION_SAVE. */
15235
15236 static void
15237 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
15238 struct gcc_options */* opts_set */)
15239 {
15240 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
15241 ptr->x_aarch64_branch_protection_string
15242 = opts->x_aarch64_branch_protection_string;
15243 }
15244
15245 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
15246 using the information saved in PTR. */
15247
15248 static void
15249 aarch64_option_restore (struct gcc_options *opts,
15250 struct gcc_options */* opts_set */,
15251 struct cl_target_option *ptr)
15252 {
15253 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15254 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15255 opts->x_explicit_arch = ptr->x_explicit_arch;
15256 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15257 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
15258 opts->x_aarch64_branch_protection_string
15259 = ptr->x_aarch64_branch_protection_string;
15260 if (opts->x_aarch64_branch_protection_string)
15261 {
15262 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15263 NULL);
15264 }
15265
15266 aarch64_override_options_internal (opts);
15267 }
15268
15269 /* Implement TARGET_OPTION_PRINT. */
15270
15271 static void
15272 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15273 {
15274 const struct processor *cpu
15275 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15276 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
15277 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
15278 std::string extension
15279 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
15280
15281 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
15282 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15283 arch->name, extension.c_str ());
15284 }
15285
15286 static GTY(()) tree aarch64_previous_fndecl;
15287
15288 void
15289 aarch64_reset_previous_fndecl (void)
15290 {
15291 aarch64_previous_fndecl = NULL;
15292 }
15293
15294 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15295 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15296 make sure optab availability predicates are recomputed when necessary. */
15297
15298 void
15299 aarch64_save_restore_target_globals (tree new_tree)
15300 {
15301 if (TREE_TARGET_GLOBALS (new_tree))
15302 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15303 else if (new_tree == target_option_default_node)
15304 restore_target_globals (&default_target_globals);
15305 else
15306 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15307 }
15308
15309 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
15310 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15311 of the function, if such exists. This function may be called multiple
15312 times on a single function so use aarch64_previous_fndecl to avoid
15313 setting up identical state. */
15314
15315 static void
15316 aarch64_set_current_function (tree fndecl)
15317 {
15318 if (!fndecl || fndecl == aarch64_previous_fndecl)
15319 return;
15320
15321 tree old_tree = (aarch64_previous_fndecl
15322 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15323 : NULL_TREE);
15324
15325 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15326
15327 /* If current function has no attributes but the previous one did,
15328 use the default node. */
15329 if (!new_tree && old_tree)
15330 new_tree = target_option_default_node;
15331
15332 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
15333 the default have been handled by aarch64_save_restore_target_globals from
15334 aarch64_pragma_target_parse. */
15335 if (old_tree == new_tree)
15336 return;
15337
15338 aarch64_previous_fndecl = fndecl;
15339
15340 /* First set the target options. */
15341 cl_target_option_restore (&global_options, &global_options_set,
15342 TREE_TARGET_OPTION (new_tree));
15343
15344 aarch64_save_restore_target_globals (new_tree);
15345 }
15346
15347 /* Enum describing the various ways we can handle attributes.
15348 In many cases we can reuse the generic option handling machinery. */
15349
15350 enum aarch64_attr_opt_type
15351 {
15352 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
15353 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
15354 aarch64_attr_enum, /* Attribute sets an enum variable. */
15355 aarch64_attr_custom /* Attribute requires a custom handling function. */
15356 };
15357
15358 /* All the information needed to handle a target attribute.
15359 NAME is the name of the attribute.
15360 ATTR_TYPE specifies the type of behavior of the attribute as described
15361 in the definition of enum aarch64_attr_opt_type.
15362 ALLOW_NEG is true if the attribute supports a "no-" form.
15363 HANDLER is the function that takes the attribute string as an argument.
15364 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
15365 OPT_NUM is the enum specifying the option that the attribute modifies.
15366 This is needed for attributes that mirror the behavior of a command-line
15367 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15368 aarch64_attr_enum. */
15369
15370 struct aarch64_attribute_info
15371 {
15372 const char *name;
15373 enum aarch64_attr_opt_type attr_type;
15374 bool allow_neg;
15375 bool (*handler) (const char *);
15376 enum opt_code opt_num;
15377 };
15378
15379 /* Handle the ARCH_STR argument to the arch= target attribute. */
15380
15381 static bool
15382 aarch64_handle_attr_arch (const char *str)
15383 {
15384 const struct processor *tmp_arch = NULL;
15385 std::string invalid_extension;
15386 enum aarch64_parse_opt_result parse_res
15387 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
15388
15389 if (parse_res == AARCH64_PARSE_OK)
15390 {
15391 gcc_assert (tmp_arch);
15392 selected_arch = tmp_arch;
15393 explicit_arch = selected_arch->arch;
15394 return true;
15395 }
15396
15397 switch (parse_res)
15398 {
15399 case AARCH64_PARSE_MISSING_ARG:
15400 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
15401 break;
15402 case AARCH64_PARSE_INVALID_ARG:
15403 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
15404 aarch64_print_hint_for_arch (str);
15405 break;
15406 case AARCH64_PARSE_INVALID_FEATURE:
15407 error ("invalid feature modifier %s of value (\"%s\") in "
15408 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15409 aarch64_print_hint_for_extensions (invalid_extension);
15410 break;
15411 default:
15412 gcc_unreachable ();
15413 }
15414
15415 return false;
15416 }
15417
15418 /* Handle the argument CPU_STR to the cpu= target attribute. */
15419
15420 static bool
15421 aarch64_handle_attr_cpu (const char *str)
15422 {
15423 const struct processor *tmp_cpu = NULL;
15424 std::string invalid_extension;
15425 enum aarch64_parse_opt_result parse_res
15426 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
15427
15428 if (parse_res == AARCH64_PARSE_OK)
15429 {
15430 gcc_assert (tmp_cpu);
15431 selected_tune = tmp_cpu;
15432 explicit_tune_core = selected_tune->ident;
15433
15434 selected_arch = &all_architectures[tmp_cpu->arch];
15435 explicit_arch = selected_arch->arch;
15436 return true;
15437 }
15438
15439 switch (parse_res)
15440 {
15441 case AARCH64_PARSE_MISSING_ARG:
15442 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
15443 break;
15444 case AARCH64_PARSE_INVALID_ARG:
15445 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
15446 aarch64_print_hint_for_core (str);
15447 break;
15448 case AARCH64_PARSE_INVALID_FEATURE:
15449 error ("invalid feature modifier %s of value (\"%s\") in "
15450 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15451 aarch64_print_hint_for_extensions (invalid_extension);
15452 break;
15453 default:
15454 gcc_unreachable ();
15455 }
15456
15457 return false;
15458 }
15459
15460 /* Handle the argument STR to the branch-protection= attribute. */
15461
15462 static bool
15463 aarch64_handle_attr_branch_protection (const char* str)
15464 {
15465 char *err_str = (char *) xmalloc (strlen (str) + 1);
15466 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15467 &err_str);
15468 bool success = false;
15469 switch (res)
15470 {
15471 case AARCH64_PARSE_MISSING_ARG:
15472 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15473 " attribute");
15474 break;
15475 case AARCH64_PARSE_INVALID_ARG:
15476 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15477 "=\")%> pragma or attribute", err_str);
15478 break;
15479 case AARCH64_PARSE_OK:
15480 success = true;
15481 /* Fall through. */
15482 case AARCH64_PARSE_INVALID_FEATURE:
15483 break;
15484 default:
15485 gcc_unreachable ();
15486 }
15487 free (err_str);
15488 return success;
15489 }
15490
15491 /* Handle the argument STR to the tune= target attribute. */
15492
15493 static bool
15494 aarch64_handle_attr_tune (const char *str)
15495 {
15496 const struct processor *tmp_tune = NULL;
15497 enum aarch64_parse_opt_result parse_res
15498 = aarch64_parse_tune (str, &tmp_tune);
15499
15500 if (parse_res == AARCH64_PARSE_OK)
15501 {
15502 gcc_assert (tmp_tune);
15503 selected_tune = tmp_tune;
15504 explicit_tune_core = selected_tune->ident;
15505 return true;
15506 }
15507
15508 switch (parse_res)
15509 {
15510 case AARCH64_PARSE_INVALID_ARG:
15511 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15512 aarch64_print_hint_for_core (str);
15513 break;
15514 default:
15515 gcc_unreachable ();
15516 }
15517
15518 return false;
15519 }
15520
15521 /* Parse an architecture extensions target attribute string specified in STR.
15522 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15523 if successful. Update aarch64_isa_flags to reflect the ISA features
15524 modified. */
15525
15526 static bool
15527 aarch64_handle_attr_isa_flags (char *str)
15528 {
15529 enum aarch64_parse_opt_result parse_res;
15530 uint64_t isa_flags = aarch64_isa_flags;
15531
15532 /* We allow "+nothing" in the beginning to clear out all architectural
15533 features if the user wants to handpick specific features. */
15534 if (strncmp ("+nothing", str, 8) == 0)
15535 {
15536 isa_flags = 0;
15537 str += 8;
15538 }
15539
15540 std::string invalid_extension;
15541 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15542
15543 if (parse_res == AARCH64_PARSE_OK)
15544 {
15545 aarch64_isa_flags = isa_flags;
15546 return true;
15547 }
15548
15549 switch (parse_res)
15550 {
15551 case AARCH64_PARSE_MISSING_ARG:
15552 error ("missing value in %<target()%> pragma or attribute");
15553 break;
15554
15555 case AARCH64_PARSE_INVALID_FEATURE:
15556 error ("invalid feature modifier %s of value (\"%s\") in "
15557 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15558 break;
15559
15560 default:
15561 gcc_unreachable ();
15562 }
15563
15564 return false;
15565 }
15566
15567 /* The target attributes that we support. On top of these we also support just
15568 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15569 handled explicitly in aarch64_process_one_target_attr. */
15570
15571 static const struct aarch64_attribute_info aarch64_attributes[] =
15572 {
15573 { "general-regs-only", aarch64_attr_mask, false, NULL,
15574 OPT_mgeneral_regs_only },
15575 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15576 OPT_mfix_cortex_a53_835769 },
15577 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15578 OPT_mfix_cortex_a53_843419 },
15579 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15580 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15581 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15582 OPT_momit_leaf_frame_pointer },
15583 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15584 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15585 OPT_march_ },
15586 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15587 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15588 OPT_mtune_ },
15589 { "branch-protection", aarch64_attr_custom, false,
15590 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15591 { "sign-return-address", aarch64_attr_enum, false, NULL,
15592 OPT_msign_return_address_ },
15593 { "outline-atomics", aarch64_attr_bool, true, NULL,
15594 OPT_moutline_atomics},
15595 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15596 };
15597
15598 /* Parse ARG_STR, which contains the definition of one target attribute.
15599 Report appropriate errors if any, and return true if the attribute is valid. */
15600
15601 static bool
15602 aarch64_process_one_target_attr (char *arg_str)
15603 {
15604 bool invert = false;
15605
15606 size_t len = strlen (arg_str);
15607
15608 if (len == 0)
15609 {
15610 error ("malformed %<target()%> pragma or attribute");
15611 return false;
15612 }
15613
15614 char *str_to_check = (char *) alloca (len + 1);
15615 strcpy (str_to_check, arg_str);
15616
15617 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15618 It is easier to detect and handle it explicitly here rather than going
15619 through the machinery for the rest of the target attributes in this
15620 function. */
15621 if (*str_to_check == '+')
15622 return aarch64_handle_attr_isa_flags (str_to_check);
15623
15624 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15625 {
15626 invert = true;
15627 str_to_check += 3;
15628 }
15629 char *arg = strchr (str_to_check, '=');
15630
15631 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15632 and point ARG to "foo". */
15633 if (arg)
15634 {
15635 *arg = '\0';
15636 arg++;
15637 }
15638 const struct aarch64_attribute_info *p_attr;
15639 bool found = false;
15640 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15641 {
15642 /* If the names don't match up, or the user has given an argument
15643 to an attribute that doesn't accept one, or didn't give an argument
15644 to an attribute that expects one, fail to match. */
15645 if (strcmp (str_to_check, p_attr->name) != 0)
15646 continue;
15647
15648 found = true;
15649 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15650 || p_attr->attr_type == aarch64_attr_enum;
15651
15652 if (attr_need_arg_p ^ (arg != NULL))
15653 {
15654 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15655 return false;
15656 }
15657
15658 /* If the name matches but the attribute does not allow "no-" versions
15659 then we can't match. */
15660 if (invert && !p_attr->allow_neg)
15661 {
15662 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15663 return false;
15664 }
15665
15666 switch (p_attr->attr_type)
15667 {
15668 /* Has a custom handler registered.
15669 For example, cpu=, arch=, tune=. */
15670 case aarch64_attr_custom:
15671 gcc_assert (p_attr->handler);
15672 if (!p_attr->handler (arg))
15673 return false;
15674 break;
15675
15676 /* Either set or unset a boolean option. */
15677 case aarch64_attr_bool:
15678 {
15679 struct cl_decoded_option decoded;
15680
15681 generate_option (p_attr->opt_num, NULL, !invert,
15682 CL_TARGET, &decoded);
15683 aarch64_handle_option (&global_options, &global_options_set,
15684 &decoded, input_location);
15685 break;
15686 }
15687 /* Set or unset a bit in the target_flags. aarch64_handle_option
15688 should know what mask to apply given the option number. */
15689 case aarch64_attr_mask:
15690 {
15691 struct cl_decoded_option decoded;
15692 /* We only need to specify the option number.
15693 aarch64_handle_option will know which mask to apply. */
15694 decoded.opt_index = p_attr->opt_num;
15695 decoded.value = !invert;
15696 aarch64_handle_option (&global_options, &global_options_set,
15697 &decoded, input_location);
15698 break;
15699 }
15700 /* Use the option setting machinery to set an option to an enum. */
15701 case aarch64_attr_enum:
15702 {
15703 gcc_assert (arg);
15704 bool valid;
15705 int value;
15706 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15707 &value, CL_TARGET);
15708 if (valid)
15709 {
15710 set_option (&global_options, NULL, p_attr->opt_num, value,
15711 NULL, DK_UNSPECIFIED, input_location,
15712 global_dc);
15713 }
15714 else
15715 {
15716 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15717 }
15718 break;
15719 }
15720 default:
15721 gcc_unreachable ();
15722 }
15723 }
15724
15725   /* If we reached this point we have either found an attribute and
15726      validated it or matched none at all.  If we matched an attribute but its
15727      arguments were malformed we will have returned false already. */
15728 return found;
15729 }
15730
15731 /* Count how many times the character C appears in
15732 NULL-terminated string STR. */
15733
15734 static unsigned int
15735 num_occurences_in_str (char c, char *str)
15736 {
15737 unsigned int res = 0;
15738 while (*str != '\0')
15739 {
15740 if (*str == c)
15741 res++;
15742
15743 str++;
15744 }
15745
15746 return res;
15747 }
15748
15749 /* Parse the tree in ARGS that contains the target attribute information
15750 and update the global target options space. */
15751
15752 bool
15753 aarch64_process_target_attr (tree args)
15754 {
15755 if (TREE_CODE (args) == TREE_LIST)
15756 {
15757 do
15758 {
15759 tree head = TREE_VALUE (args);
15760 if (head)
15761 {
15762 if (!aarch64_process_target_attr (head))
15763 return false;
15764 }
15765 args = TREE_CHAIN (args);
15766 } while (args);
15767
15768 return true;
15769 }
15770
15771 if (TREE_CODE (args) != STRING_CST)
15772 {
15773 error ("attribute %<target%> argument not a string");
15774 return false;
15775 }
15776
15777 size_t len = strlen (TREE_STRING_POINTER (args));
15778 char *str_to_check = (char *) alloca (len + 1);
15779 strcpy (str_to_check, TREE_STRING_POINTER (args));
15780
15781 if (len == 0)
15782 {
15783 error ("malformed %<target()%> pragma or attribute");
15784 return false;
15785 }
15786
15787   /* Used to catch empty strings between commas, i.e.
15788      attribute ((target ("attr1,,attr2"))). */
15789 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15790
15791 /* Handle multiple target attributes separated by ','. */
15792 char *token = strtok_r (str_to_check, ",", &str_to_check);
15793
15794 unsigned int num_attrs = 0;
15795 while (token)
15796 {
15797 num_attrs++;
15798 if (!aarch64_process_one_target_attr (token))
15799 {
15800 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15801 return false;
15802 }
15803
15804 token = strtok_r (NULL, ",", &str_to_check);
15805 }
15806
15807 if (num_attrs != num_commas + 1)
15808 {
15809 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15810 return false;
15811 }
15812
15813 return true;
15814 }
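
/* A sketch of the accepted syntax (illustrative, not a definitive grammar):
   the string is split on ',' and each token is parsed independently, so

     #pragma GCC target ("arch=armv8-a+simd,no-omit-leaf-frame-pointer")

   processes two attributes, whereas "attr1,,attr2" is rejected because the
   number of parsed tokens (2) does not match the number of commas plus
   one (3).  */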
15815
15816 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15817 process attribute ((target ("..."))). */
15818
15819 static bool
15820 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15821 {
15822 struct cl_target_option cur_target;
15823 bool ret;
15824 tree old_optimize;
15825 tree new_target, new_optimize;
15826 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15827
15828 /* If what we're processing is the current pragma string then the
15829 target option node is already stored in target_option_current_node
15830 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15831 having to re-parse the string. This is especially useful to keep
15832 arm_neon.h compile times down since that header contains a lot
15833 of intrinsics enclosed in pragmas. */
15834 if (!existing_target && args == current_target_pragma)
15835 {
15836 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15837 return true;
15838 }
15839 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15840
15841 old_optimize
15842 = build_optimization_node (&global_options, &global_options_set);
15843 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15844
15845 /* If the function changed the optimization levels as well as setting
15846 target options, start with the optimizations specified. */
15847 if (func_optimize && func_optimize != old_optimize)
15848 cl_optimization_restore (&global_options, &global_options_set,
15849 TREE_OPTIMIZATION (func_optimize));
15850
15851 /* Save the current target options to restore at the end. */
15852 cl_target_option_save (&cur_target, &global_options, &global_options_set);
15853
15854 /* If fndecl already has some target attributes applied to it, unpack
15855 them so that we add this attribute on top of them, rather than
15856 overwriting them. */
15857 if (existing_target)
15858 {
15859 struct cl_target_option *existing_options
15860 = TREE_TARGET_OPTION (existing_target);
15861
15862 if (existing_options)
15863 cl_target_option_restore (&global_options, &global_options_set,
15864 existing_options);
15865 }
15866 else
15867 cl_target_option_restore (&global_options, &global_options_set,
15868 TREE_TARGET_OPTION (target_option_current_node));
15869
15870 ret = aarch64_process_target_attr (args);
15871
15872 /* Set up any additional state. */
15873 if (ret)
15874 {
15875 aarch64_override_options_internal (&global_options);
15876 /* Initialize SIMD builtins if we haven't already.
15877 Set current_target_pragma to NULL for the duration so that
15878 the builtin initialization code doesn't try to tag the functions
15879 being built with the attributes specified by any current pragma, thus
15880 going into an infinite recursion. */
15881 if (TARGET_SIMD)
15882 {
15883 tree saved_current_target_pragma = current_target_pragma;
15884 current_target_pragma = NULL;
15885 aarch64_init_simd_builtins ();
15886 current_target_pragma = saved_current_target_pragma;
15887 }
15888 new_target = build_target_option_node (&global_options,
15889 &global_options_set);
15890 }
15891 else
15892 new_target = NULL;
15893
15894 new_optimize = build_optimization_node (&global_options,
15895 &global_options_set);
15896
15897 if (fndecl && ret)
15898 {
15899 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15900
15901 if (old_optimize != new_optimize)
15902 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15903 }
15904
15905 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
15906
15907 if (old_optimize != new_optimize)
15908 cl_optimization_restore (&global_options, &global_options_set,
15909 TREE_OPTIMIZATION (old_optimize));
15910 return ret;
15911 }
15912
15913 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15914 tri-bool options (yes, no, don't care) and the default value is
15915 DEF, determine whether to reject inlining. */
15916
15917 static bool
15918 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15919 int dont_care, int def)
15920 {
15921 /* If the callee doesn't care, always allow inlining. */
15922 if (callee == dont_care)
15923 return true;
15924
15925 /* If the caller doesn't care, always allow inlining. */
15926 if (caller == dont_care)
15927 return true;
15928
15929 /* Otherwise, allow inlining if either the callee and caller values
15930 agree, or if the callee is using the default value. */
15931 return (callee == caller || callee == def);
15932 }
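
/* Worked example (illustrative only): the callers below pass DONT_CARE == 2.
   With DEF == 0, CALLER == 0 and CALLEE == 1 rejects inlining, since the
   callee neither agrees with the caller nor uses the default, whereas
   CALLEE == 2 ("don't care") is always accepted.  */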
15933
15934 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15935 to inline CALLEE into CALLER based on target-specific info.
15936 Make sure that the caller and callee have compatible architectural
15937 features. Then go through the other possible target attributes
15938 and see if they can block inlining. Try not to reject always_inline
15939 callees unless they are incompatible architecturally. */
15940
15941 static bool
15942 aarch64_can_inline_p (tree caller, tree callee)
15943 {
15944 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15945 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15946
15947 struct cl_target_option *caller_opts
15948 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15949 : target_option_default_node);
15950
15951 struct cl_target_option *callee_opts
15952 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15953 : target_option_default_node);
15954
15955 /* Callee's ISA flags should be a subset of the caller's. */
15956 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15957 != callee_opts->x_aarch64_isa_flags)
15958 return false;
15959
15960 /* Allow non-strict aligned functions inlining into strict
15961 aligned ones. */
15962 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15963 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15964 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15965 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15966 return false;
15967
15968 bool always_inline = lookup_attribute ("always_inline",
15969 DECL_ATTRIBUTES (callee));
15970
15971 /* If the architectural features match up and the callee is always_inline
15972 then the other attributes don't matter. */
15973 if (always_inline)
15974 return true;
15975
15976 if (caller_opts->x_aarch64_cmodel_var
15977 != callee_opts->x_aarch64_cmodel_var)
15978 return false;
15979
15980 if (caller_opts->x_aarch64_tls_dialect
15981 != callee_opts->x_aarch64_tls_dialect)
15982 return false;
15983
15984 /* Honour explicit requests to workaround errata. */
15985 if (!aarch64_tribools_ok_for_inlining_p (
15986 caller_opts->x_aarch64_fix_a53_err835769,
15987 callee_opts->x_aarch64_fix_a53_err835769,
15988 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15989 return false;
15990
15991 if (!aarch64_tribools_ok_for_inlining_p (
15992 caller_opts->x_aarch64_fix_a53_err843419,
15993 callee_opts->x_aarch64_fix_a53_err843419,
15994 2, TARGET_FIX_ERR_A53_843419))
15995 return false;
15996
15997 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15998      caller and callee and they don't match up, reject inlining. */
15999 if (!aarch64_tribools_ok_for_inlining_p (
16000 caller_opts->x_flag_omit_leaf_frame_pointer,
16001 callee_opts->x_flag_omit_leaf_frame_pointer,
16002 2, 1))
16003 return false;
16004
16005 /* If the callee has specific tuning overrides, respect them. */
16006 if (callee_opts->x_aarch64_override_tune_string != NULL
16007 && caller_opts->x_aarch64_override_tune_string == NULL)
16008 return false;
16009
16010 /* If the user specified tuning override strings for the
16011 caller and callee and they don't match up, reject inlining.
16012 We just do a string compare here, we don't analyze the meaning
16013 of the string, as it would be too costly for little gain. */
16014 if (callee_opts->x_aarch64_override_tune_string
16015 && caller_opts->x_aarch64_override_tune_string
16016 && (strcmp (callee_opts->x_aarch64_override_tune_string,
16017 caller_opts->x_aarch64_override_tune_string) != 0))
16018 return false;
16019
16020 return true;
16021 }
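
/* For illustration (an assumed example, not part of this file's logic):
   the ISA-subset rule above means that

     __attribute__ ((target ("+simd"))) int callee (void);
     __attribute__ ((target ("+nosimd"))) int caller (void);

   blocks inlining of CALLEE into CALLER, because the callee requires SIMD
   while the caller does not provide it, even if the callee is marked
   always_inline.  */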
16022
16023 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
16024    hasn't been initialized already. */
16025
16026 unsigned int
16027 aarch64_tlsdesc_abi_id ()
16028 {
16029 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
16030 if (!tlsdesc_abi.initialized_p ())
16031 {
16032 HARD_REG_SET full_reg_clobbers;
16033 CLEAR_HARD_REG_SET (full_reg_clobbers);
16034 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
16035 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
16036 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
16037 SET_HARD_REG_BIT (full_reg_clobbers, regno);
16038 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
16039 }
16040 return tlsdesc_abi.id ();
16041 }
16042
16043 /* Return true if SYMBOL_REF X binds locally. */
16044
16045 static bool
16046 aarch64_symbol_binds_local_p (const_rtx x)
16047 {
16048 return (SYMBOL_REF_DECL (x)
16049 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
16050 : SYMBOL_REF_LOCAL_P (x));
16051 }
16052
16053 /* Return true if SYMBOL_REF X is thread-local. */
16054 static bool
16055 aarch64_tls_symbol_p (rtx x)
16056 {
16057 if (! TARGET_HAVE_TLS)
16058 return false;
16059
16060 x = strip_salt (x);
16061 if (!SYMBOL_REF_P (x))
16062 return false;
16063
16064 return SYMBOL_REF_TLS_MODEL (x) != 0;
16065 }
16066
16067 /* Classify a TLS symbol into one of the TLS kinds. */
16068 enum aarch64_symbol_type
16069 aarch64_classify_tls_symbol (rtx x)
16070 {
16071 enum tls_model tls_kind = tls_symbolic_operand_type (x);
16072
16073 switch (tls_kind)
16074 {
16075 case TLS_MODEL_GLOBAL_DYNAMIC:
16076 case TLS_MODEL_LOCAL_DYNAMIC:
16077 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
16078
16079 case TLS_MODEL_INITIAL_EXEC:
16080 switch (aarch64_cmodel)
16081 {
16082 case AARCH64_CMODEL_TINY:
16083 case AARCH64_CMODEL_TINY_PIC:
16084 return SYMBOL_TINY_TLSIE;
16085 default:
16086 return SYMBOL_SMALL_TLSIE;
16087 }
16088
16089 case TLS_MODEL_LOCAL_EXEC:
16090 if (aarch64_tls_size == 12)
16091 return SYMBOL_TLSLE12;
16092 else if (aarch64_tls_size == 24)
16093 return SYMBOL_TLSLE24;
16094 else if (aarch64_tls_size == 32)
16095 return SYMBOL_TLSLE32;
16096 else if (aarch64_tls_size == 48)
16097 return SYMBOL_TLSLE48;
16098 else
16099 gcc_unreachable ();
16100
16101 case TLS_MODEL_EMULATED:
16102 case TLS_MODEL_NONE:
16103 return SYMBOL_FORCE_TO_MEM;
16104
16105 default:
16106 gcc_unreachable ();
16107 }
16108 }
16109
16110 /* Return the correct method for accessing X + OFFSET, where X is either
16111 a SYMBOL_REF or LABEL_REF. */
16112
16113 enum aarch64_symbol_type
16114 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
16115 {
16116 x = strip_salt (x);
16117
16118 if (LABEL_REF_P (x))
16119 {
16120 switch (aarch64_cmodel)
16121 {
16122 case AARCH64_CMODEL_LARGE:
16123 return SYMBOL_FORCE_TO_MEM;
16124
16125 case AARCH64_CMODEL_TINY_PIC:
16126 case AARCH64_CMODEL_TINY:
16127 return SYMBOL_TINY_ABSOLUTE;
16128
16129 case AARCH64_CMODEL_SMALL_SPIC:
16130 case AARCH64_CMODEL_SMALL_PIC:
16131 case AARCH64_CMODEL_SMALL:
16132 return SYMBOL_SMALL_ABSOLUTE;
16133
16134 default:
16135 gcc_unreachable ();
16136 }
16137 }
16138
16139 if (SYMBOL_REF_P (x))
16140 {
16141 if (aarch64_tls_symbol_p (x))
16142 return aarch64_classify_tls_symbol (x);
16143
16144 switch (aarch64_cmodel)
16145 {
16146 case AARCH64_CMODEL_TINY:
16147 /* When we retrieve symbol + offset address, we have to make sure
16148 the offset does not cause overflow of the final address. But
16149      we have no way of knowing the address of the symbol at compile time,
16150      so we can't accurately say if the distance between the PC and
16151      symbol + offset is outside the addressable range of +/-1MB in the
16152 TINY code model. So we limit the maximum offset to +/-64KB and
16153 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
16154 If offset_within_block_p is true we allow larger offsets.
16155 Furthermore force to memory if the symbol is a weak reference to
16156 something that doesn't resolve to a symbol in this module. */
16157
16158 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
16159 return SYMBOL_FORCE_TO_MEM;
16160 if (!(IN_RANGE (offset, -0x10000, 0x10000)
16161 || offset_within_block_p (x, offset)))
16162 return SYMBOL_FORCE_TO_MEM;
16163
16164 return SYMBOL_TINY_ABSOLUTE;
16165
16166 case AARCH64_CMODEL_SMALL:
16167 /* Same reasoning as the tiny code model, but the offset cap here is
16168 1MB, allowing +/-3.9GB for the offset to the symbol. */
16169
16170 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
16171 return SYMBOL_FORCE_TO_MEM;
16172 if (!(IN_RANGE (offset, -0x100000, 0x100000)
16173 || offset_within_block_p (x, offset)))
16174 return SYMBOL_FORCE_TO_MEM;
16175
16176 return SYMBOL_SMALL_ABSOLUTE;
16177
16178 case AARCH64_CMODEL_TINY_PIC:
16179 if (!aarch64_symbol_binds_local_p (x))
16180 return SYMBOL_TINY_GOT;
16181 return SYMBOL_TINY_ABSOLUTE;
16182
16183 case AARCH64_CMODEL_SMALL_SPIC:
16184 case AARCH64_CMODEL_SMALL_PIC:
16185 if (!aarch64_symbol_binds_local_p (x))
16186 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16187 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
16188 return SYMBOL_SMALL_ABSOLUTE;
16189
16190 case AARCH64_CMODEL_LARGE:
16191 /* This is alright even in PIC code as the constant
16192 pool reference is always PC relative and within
16193 the same translation unit. */
16194 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
16195 return SYMBOL_SMALL_ABSOLUTE;
16196 else
16197 return SYMBOL_FORCE_TO_MEM;
16198
16199 default:
16200 gcc_unreachable ();
16201 }
16202 }
16203
16204 /* By default push everything into the constant pool. */
16205 return SYMBOL_FORCE_TO_MEM;
16206 }
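
/* For example (illustrative, assuming the symbol binds locally): under the
   tiny code model, "sym + 0x8000" stays SYMBOL_TINY_ABSOLUTE because the
   offset is within +/-64KB, whereas "sym + 0x20000" is forced to memory
   unless offset_within_block_p shows the address still lies within the
   symbol's own block.  */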
16207
16208 bool
16209 aarch64_constant_address_p (rtx x)
16210 {
16211 return (CONSTANT_P (x) && memory_address_p (DImode, x));
16212 }
16213
16214 bool
16215 aarch64_legitimate_pic_operand_p (rtx x)
16216 {
16217 poly_int64 offset;
16218 x = strip_offset_and_salt (x, &offset);
16219 if (SYMBOL_REF_P (x))
16220 return false;
16221
16222 return true;
16223 }
16224
16225 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
16226 that should be rematerialized rather than spilled. */
16227
16228 static bool
16229 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
16230 {
16231 /* Support CSE and rematerialization of common constants. */
16232 if (CONST_INT_P (x)
16233 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
16234 || GET_CODE (x) == CONST_VECTOR)
16235 return true;
16236
16237 /* Do not allow vector struct mode constants for Advanced SIMD.
16238 We could support 0 and -1 easily, but they need support in
16239 aarch64-simd.md. */
16240 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16241 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16242 return false;
16243
16244 /* Only accept variable-length vector constants if they can be
16245 handled directly.
16246
16247 ??? It would be possible to handle rematerialization of other
16248 constants via secondary reloads. */
16249 if (vec_flags & VEC_ANY_SVE)
16250 return aarch64_simd_valid_immediate (x, NULL);
16251
16252 if (GET_CODE (x) == HIGH)
16253 x = XEXP (x, 0);
16254
16255 /* Accept polynomial constants that can be calculated by using the
16256 destination of a move as the sole temporary. Constants that
16257 require a second temporary cannot be rematerialized (they can't be
16258 forced to memory and also aren't legitimate constants). */
16259 poly_int64 offset;
16260 if (poly_int_rtx_p (x, &offset))
16261 return aarch64_offset_temporaries (false, offset) <= 1;
16262
16263 /* If an offset is being added to something else, we need to allow the
16264 base to be moved into the destination register, meaning that there
16265 are no free temporaries for the offset. */
16266 x = strip_offset_and_salt (x, &offset);
16267 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16268 return false;
16269
16270 /* Do not allow const (plus (anchor_symbol, const_int)). */
16271 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16272 return false;
16273
16274 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
16275 so spilling them is better than rematerialization. */
16276 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16277 return true;
16278
16279 /* Label references are always constant. */
16280 if (LABEL_REF_P (x))
16281 return true;
16282
16283 return false;
16284 }
16285
16286 rtx
16287 aarch64_load_tp (rtx target)
16288 {
16289 if (!target
16290 || GET_MODE (target) != Pmode
16291 || !register_operand (target, Pmode))
16292 target = gen_reg_rtx (Pmode);
16293
16294 /* Can return in any reg. */
16295 emit_insn (gen_aarch64_load_tp_hard (target));
16296 return target;
16297 }
16298
16299 /* On AAPCS systems, this is the "struct __va_list". */
16300 static GTY(()) tree va_list_type;
16301
16302 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16303 Return the type to use as __builtin_va_list.
16304
16305 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16306
16307 struct __va_list
16308 {
16309 void *__stack;
16310 void *__gr_top;
16311 void *__vr_top;
16312 int __gr_offs;
16313 int __vr_offs;
16314 }; */
16315
16316 static tree
16317 aarch64_build_builtin_va_list (void)
16318 {
16319 tree va_list_name;
16320 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16321
16322 /* Create the type. */
16323 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16324 /* Give it the required name. */
16325 va_list_name = build_decl (BUILTINS_LOCATION,
16326 TYPE_DECL,
16327 get_identifier ("__va_list"),
16328 va_list_type);
16329 DECL_ARTIFICIAL (va_list_name) = 1;
16330 TYPE_NAME (va_list_type) = va_list_name;
16331 TYPE_STUB_DECL (va_list_type) = va_list_name;
16332
16333 /* Create the fields. */
16334 f_stack = build_decl (BUILTINS_LOCATION,
16335 FIELD_DECL, get_identifier ("__stack"),
16336 ptr_type_node);
16337 f_grtop = build_decl (BUILTINS_LOCATION,
16338 FIELD_DECL, get_identifier ("__gr_top"),
16339 ptr_type_node);
16340 f_vrtop = build_decl (BUILTINS_LOCATION,
16341 FIELD_DECL, get_identifier ("__vr_top"),
16342 ptr_type_node);
16343 f_groff = build_decl (BUILTINS_LOCATION,
16344 FIELD_DECL, get_identifier ("__gr_offs"),
16345 integer_type_node);
16346 f_vroff = build_decl (BUILTINS_LOCATION,
16347 FIELD_DECL, get_identifier ("__vr_offs"),
16348 integer_type_node);
16349
16350   /* Tell the tree-stdarg pass about our internal offset fields.
16351      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
16352      purposes, to identify whether the code is updating the va_list internal
16353      offset fields in an irregular way. */
16354 va_list_gpr_counter_field = f_groff;
16355 va_list_fpr_counter_field = f_vroff;
16356
16357 DECL_ARTIFICIAL (f_stack) = 1;
16358 DECL_ARTIFICIAL (f_grtop) = 1;
16359 DECL_ARTIFICIAL (f_vrtop) = 1;
16360 DECL_ARTIFICIAL (f_groff) = 1;
16361 DECL_ARTIFICIAL (f_vroff) = 1;
16362
16363 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16364 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16365 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16366 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16367 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16368
16369 TYPE_FIELDS (va_list_type) = f_stack;
16370 DECL_CHAIN (f_stack) = f_grtop;
16371 DECL_CHAIN (f_grtop) = f_vrtop;
16372 DECL_CHAIN (f_vrtop) = f_groff;
16373 DECL_CHAIN (f_groff) = f_vroff;
16374
16375 /* Compute its layout. */
16376 layout_type (va_list_type);
16377
16378 return va_list_type;
16379 }
16380
16381 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
16382 static void
16383 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16384 {
16385 const CUMULATIVE_ARGS *cum;
16386 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16387 tree stack, grtop, vrtop, groff, vroff;
16388 tree t;
16389 int gr_save_area_size = cfun->va_list_gpr_size;
16390 int vr_save_area_size = cfun->va_list_fpr_size;
16391 int vr_offset;
16392
16393 cum = &crtl->args.info;
16394 if (cfun->va_list_gpr_size)
16395 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16396 cfun->va_list_gpr_size);
16397 if (cfun->va_list_fpr_size)
16398 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16399 * UNITS_PER_VREG, cfun->va_list_fpr_size);
16400
16401 if (!TARGET_FLOAT)
16402 {
16403 gcc_assert (cum->aapcs_nvrn == 0);
16404 vr_save_area_size = 0;
16405 }
16406
16407 f_stack = TYPE_FIELDS (va_list_type_node);
16408 f_grtop = DECL_CHAIN (f_stack);
16409 f_vrtop = DECL_CHAIN (f_grtop);
16410 f_groff = DECL_CHAIN (f_vrtop);
16411 f_vroff = DECL_CHAIN (f_groff);
16412
16413 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16414 NULL_TREE);
16415 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16416 NULL_TREE);
16417 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16418 NULL_TREE);
16419 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16420 NULL_TREE);
16421 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16422 NULL_TREE);
16423
16424 /* Emit code to initialize STACK, which points to the next varargs stack
16425 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
16426 by named arguments. STACK is 8-byte aligned. */
16427 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16428 if (cum->aapcs_stack_size > 0)
16429 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16430 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16431 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16432
16433 /* Emit code to initialize GRTOP, the top of the GR save area.
16434 virtual_incoming_args_rtx should have been 16 byte aligned. */
16435 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16436 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16437 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16438
16439 /* Emit code to initialize VRTOP, the top of the VR save area.
16440 This address is gr_save_area_bytes below GRTOP, rounded
16441 down to the next 16-byte boundary. */
16442 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
16443 vr_offset = ROUND_UP (gr_save_area_size,
16444 STACK_BOUNDARY / BITS_PER_UNIT);
16445
16446 if (vr_offset)
16447 t = fold_build_pointer_plus_hwi (t, -vr_offset);
16448 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16449 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16450
16451 /* Emit code to initialize GROFF, the offset from GRTOP of the
16452 next GPR argument. */
16453 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16454 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16455 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16456
16457   /* Likewise emit code to initialize VROFF, the offset from VRTOP
16458 of the next VR argument. */
16459 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16460 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16461 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16462 }
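
/* A concrete sketch (assumed values, for illustration only): for

     int f (int n, ...);

   with the default va_list_gpr/fpr_size limits and FP registers available,
   va_start leaves __stack pointing at the first stack-passed vararg,
   __gr_top at the top of the GR save area, __vr_top 16-byte aligned below
   it, and, with seven GRs and eight VRs still unused, sets
   __gr_offs = -56 and __vr_offs = -128.  */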
16463
16464 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
16465
16466 static tree
16467 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16468 gimple_seq *post_p ATTRIBUTE_UNUSED)
16469 {
16470 tree addr;
16471 bool indirect_p;
16472 bool is_ha; /* is HFA or HVA. */
16473 bool dw_align; /* double-word align. */
16474 machine_mode ag_mode = VOIDmode;
16475 int nregs;
16476 machine_mode mode;
16477
16478 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16479 tree stack, f_top, f_off, off, arg, roundup, on_stack;
16480 HOST_WIDE_INT size, rsize, adjust, align;
16481 tree t, u, cond1, cond2;
16482
16483 indirect_p = pass_va_arg_by_reference (type);
16484 if (indirect_p)
16485 type = build_pointer_type (type);
16486
16487 mode = TYPE_MODE (type);
16488
16489 f_stack = TYPE_FIELDS (va_list_type_node);
16490 f_grtop = DECL_CHAIN (f_stack);
16491 f_vrtop = DECL_CHAIN (f_grtop);
16492 f_groff = DECL_CHAIN (f_vrtop);
16493 f_vroff = DECL_CHAIN (f_groff);
16494
16495 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16496 f_stack, NULL_TREE);
16497 size = int_size_in_bytes (type);
16498
16499 bool abi_break;
16500 align
16501 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
16502
16503 dw_align = false;
16504 adjust = 0;
16505 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16506 &is_ha, false))
16507 {
16508 /* No frontends can create types with variable-sized modes, so we
16509 shouldn't be asked to pass or return them. */
16510 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16511
16512 /* TYPE passed in fp/simd registers. */
16513 if (!TARGET_FLOAT)
16514 aarch64_err_no_fpadvsimd (mode);
16515
16516 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16517 unshare_expr (valist), f_vrtop, NULL_TREE);
16518 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16519 unshare_expr (valist), f_vroff, NULL_TREE);
16520
16521 rsize = nregs * UNITS_PER_VREG;
16522
16523 if (is_ha)
16524 {
16525 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16526 adjust = UNITS_PER_VREG - ag_size;
16527 }
16528 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16529 && size < UNITS_PER_VREG)
16530 {
16531 adjust = UNITS_PER_VREG - size;
16532 }
16533 }
16534 else
16535 {
16536 /* TYPE passed in general registers. */
16537 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16538 unshare_expr (valist), f_grtop, NULL_TREE);
16539 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16540 unshare_expr (valist), f_groff, NULL_TREE);
16541 rsize = ROUND_UP (size, UNITS_PER_WORD);
16542 nregs = rsize / UNITS_PER_WORD;
16543
16544 if (align > 8)
16545 {
16546 if (abi_break && warn_psabi)
16547 inform (input_location, "parameter passing for argument of type "
16548 "%qT changed in GCC 9.1", type);
16549 dw_align = true;
16550 }
16551
16552 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16553 && size < UNITS_PER_WORD)
16554 {
16555 adjust = UNITS_PER_WORD - size;
16556 }
16557 }
16558
16559 /* Get a local temporary for the field value. */
16560 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16561
16562 /* Emit code to branch if off >= 0. */
16563 t = build2 (GE_EXPR, boolean_type_node, off,
16564 build_int_cst (TREE_TYPE (off), 0));
16565 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16566
16567 if (dw_align)
16568 {
16569 /* Emit: offs = (offs + 15) & -16. */
16570 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16571 build_int_cst (TREE_TYPE (off), 15));
16572 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16573 build_int_cst (TREE_TYPE (off), -16));
16574 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16575 }
16576 else
16577 roundup = NULL;
16578
16579 /* Update ap.__[g|v]r_offs */
16580 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16581 build_int_cst (TREE_TYPE (off), rsize));
16582 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16583
16584 /* String up. */
16585 if (roundup)
16586 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16587
16588 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16589 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16590 build_int_cst (TREE_TYPE (f_off), 0));
16591 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16592
16593 /* String up: make sure the assignment happens before the use. */
16594 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16595 COND_EXPR_ELSE (cond1) = t;
16596
16597   /* Prepare the trees handling the argument that is passed on the stack;
16598      the top-level node will be stored in ON_STACK. */
16599 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16600 if (align > 8)
16601 {
16602 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
16603 t = fold_build_pointer_plus_hwi (arg, 15);
16604 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16605 build_int_cst (TREE_TYPE (t), -16));
16606 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16607 }
16608 else
16609 roundup = NULL;
16610 /* Advance ap.__stack */
16611 t = fold_build_pointer_plus_hwi (arg, size + 7);
16612 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16613 build_int_cst (TREE_TYPE (t), -8));
16614 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16615 /* String up roundup and advance. */
16616 if (roundup)
16617 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16618 /* String up with arg */
16619 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16620 /* Big-endianness related address adjustment. */
16621 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16622 && size < UNITS_PER_WORD)
16623 {
16624 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16625 size_int (UNITS_PER_WORD - size));
16626 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16627 }
16628
16629 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16630 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16631
16632 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16633 t = off;
16634 if (adjust)
16635 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16636 build_int_cst (TREE_TYPE (off), adjust));
16637
16638 t = fold_convert (sizetype, t);
16639 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16640
16641 if (is_ha)
16642 {
16643 /* type ha; // treat as "struct {ftype field[n];}"
16644 ... [computing offs]
16645 for (i = 0; i <nregs; ++i, offs += 16)
16646 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16647 return ha; */
16648 int i;
16649 tree tmp_ha, field_t, field_ptr_t;
16650
16651 /* Declare a local variable. */
16652 tmp_ha = create_tmp_var_raw (type, "ha");
16653 gimple_add_tmp_var (tmp_ha);
16654
16655 /* Establish the base type. */
16656 switch (ag_mode)
16657 {
16658 case E_SFmode:
16659 field_t = float_type_node;
16660 field_ptr_t = float_ptr_type_node;
16661 break;
16662 case E_DFmode:
16663 field_t = double_type_node;
16664 field_ptr_t = double_ptr_type_node;
16665 break;
16666 case E_TFmode:
16667 field_t = long_double_type_node;
16668 field_ptr_t = long_double_ptr_type_node;
16669 break;
16670 case E_HFmode:
16671 field_t = aarch64_fp16_type_node;
16672 field_ptr_t = aarch64_fp16_ptr_type_node;
16673 break;
16674 case E_BFmode:
16675 field_t = aarch64_bf16_type_node;
16676 field_ptr_t = aarch64_bf16_ptr_type_node;
16677 break;
16678 case E_V2SImode:
16679 case E_V4SImode:
16680 {
16681 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16682 field_t = build_vector_type_for_mode (innertype, ag_mode);
16683 field_ptr_t = build_pointer_type (field_t);
16684 }
16685 break;
16686 default:
16687 gcc_assert (0);
16688 }
16689
16690       /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area);  */
16691 TREE_ADDRESSABLE (tmp_ha) = 1;
16692 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16693 addr = t;
16694 t = fold_convert (field_ptr_t, addr);
16695 t = build2 (MODIFY_EXPR, field_t,
16696 build1 (INDIRECT_REF, field_t, tmp_ha),
16697 build1 (INDIRECT_REF, field_t, t));
16698
16699 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16700 for (i = 1; i < nregs; ++i)
16701 {
16702 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16703 u = fold_convert (field_ptr_t, addr);
16704 u = build2 (MODIFY_EXPR, field_t,
16705 build2 (MEM_REF, field_t, tmp_ha,
16706 build_int_cst (field_ptr_t,
16707 (i *
16708 int_size_in_bytes (field_t)))),
16709 build1 (INDIRECT_REF, field_t, u));
16710 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16711 }
16712
16713 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16714 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16715 }
16716
16717 COND_EXPR_ELSE (cond2) = t;
16718 addr = fold_convert (build_pointer_type (type), cond1);
16719 addr = build_va_arg_indirect_ref (addr);
16720
16721 if (indirect_p)
16722 addr = build_va_arg_indirect_ref (addr);
16723
16724 return addr;
16725 }
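
/* Roughly (an illustrative sketch, not the exact trees built above), for
   va_arg (ap, double) the expansion behaves like:

     if (ap.__vr_offs >= 0)
       addr = next 8-byte-aligned stack slot;
     else
       {
         ap.__vr_offs += 16;
         if (ap.__vr_offs > 0)
           addr = next 8-byte-aligned stack slot;
         else
           addr = ap.__vr_top + old __vr_offs (+ big-endian adjustment);
       }
     result = *(double *) addr;  */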
16726
16727 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
16728
16729 static void
16730 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16731 const function_arg_info &arg,
16732 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16733 {
16734 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16735 CUMULATIVE_ARGS local_cum;
16736 int gr_saved = cfun->va_list_gpr_size;
16737 int vr_saved = cfun->va_list_fpr_size;
16738
16739 /* The caller has advanced CUM up to, but not beyond, the last named
16740 argument. Advance a local copy of CUM past the last "real" named
16741 argument, to find out how many registers are left over. */
16742 local_cum = *cum;
16743   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
16744
16745   /* Find out how many registers we need to save.
16746      Honor the tree-stdarg analysis results. */
16747 if (cfun->va_list_gpr_size)
16748 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16749 cfun->va_list_gpr_size / UNITS_PER_WORD);
16750 if (cfun->va_list_fpr_size)
16751 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16752 cfun->va_list_fpr_size / UNITS_PER_VREG);
16753
16754 if (!TARGET_FLOAT)
16755 {
16756 gcc_assert (local_cum.aapcs_nvrn == 0);
16757 vr_saved = 0;
16758 }
16759
16760 if (!no_rtl)
16761 {
16762 if (gr_saved > 0)
16763 {
16764 rtx ptr, mem;
16765
16766 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16767 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16768 - gr_saved * UNITS_PER_WORD);
16769 mem = gen_frame_mem (BLKmode, ptr);
16770 set_mem_alias_set (mem, get_varargs_alias_set ());
16771
16772 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16773 mem, gr_saved);
16774 }
16775 if (vr_saved > 0)
16776 {
16777 /* We can't use move_block_from_reg, because it will use
16778 the wrong mode, storing D regs only. */
16779 machine_mode mode = TImode;
16780 int off, i, vr_start;
16781
16782 /* Set OFF to the offset from virtual_incoming_args_rtx of
16783 the first vector register. The VR save area lies below
16784 the GR one, and is aligned to 16 bytes. */
16785 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16786 STACK_BOUNDARY / BITS_PER_UNIT);
16787 off -= vr_saved * UNITS_PER_VREG;
16788
16789 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16790 for (i = 0; i < vr_saved; ++i)
16791 {
16792 rtx ptr, mem;
16793
16794 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16795 mem = gen_frame_mem (mode, ptr);
16796 set_mem_alias_set (mem, get_varargs_alias_set ());
16797 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16798 off += UNITS_PER_VREG;
16799 }
16800 }
16801 }
16802
16803 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16804 any complication of having crtl->args.pretend_args_size changed. */
16805 cfun->machine->frame.saved_varargs_size
16806 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16807 STACK_BOUNDARY / BITS_PER_UNIT)
16808 + vr_saved * UNITS_PER_VREG);
16809 }
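
/* Layout sketch (illustrative, assuming the default va_list_gpr/fpr_size
   limits and FP registers available): for

     void g (int a, double b, ...);

   one GR (x0) and one VR (v0) are consumed by the named arguments, so the
   remaining seven x-registers (56 bytes) are dumped immediately below the
   incoming-argument pointer and the remaining seven q-registers (112 bytes)
   below that, after rounding to a 16-byte boundary; saved_varargs_size is
   therefore ROUND_UP (56, 16) + 112 = 176 bytes.  */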
16810
16811 static void
16812 aarch64_conditional_register_usage (void)
16813 {
16814 int i;
16815 if (!TARGET_FLOAT)
16816 {
16817 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16818 {
16819 fixed_regs[i] = 1;
16820 call_used_regs[i] = 1;
16821 }
16822 }
16823 if (!TARGET_SVE)
16824 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16825 {
16826 fixed_regs[i] = 1;
16827 call_used_regs[i] = 1;
16828 }
16829
16830 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16831 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16832 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16833
16834 /* When tracking speculation, we need a couple of call-clobbered registers
16835 to track the speculation state. It would be nice to just use
16836 IP0 and IP1, but currently there are numerous places that just
16837      assume these registers are free for other uses (e.g. pointer
16838 authentication). */
16839 if (aarch64_track_speculation)
16840 {
16841 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16842 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16843 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16844 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16845 }
16846 }
16847
16848 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16849
16850 bool
16851 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16852 {
16853 /* For records we're passed a FIELD_DECL, for arrays we're passed
16854 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16855 const_tree type = TREE_TYPE (field_or_array);
16856
16857 /* Assign BLKmode to anything that contains multiple SVE predicates.
16858 For structures, the "multiple" case is indicated by MODE being
16859 VOIDmode. */
16860 unsigned int num_zr, num_pr;
16861 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16862 {
16863 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16864 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16865 TYPE_SIZE (type));
16866 return mode == VOIDmode;
16867 }
16868
16869 return default_member_type_forces_blk (field_or_array, mode);
16870 }
16871
16872 /* Bitmasks that indicate whether earlier versions of GCC would have
16873 taken a different path through the ABI logic. This should result in
16874 a -Wpsabi warning if the earlier path led to a different ABI decision.
16875
16876 WARN_PSABI_EMPTY_CXX17_BASE
16877 Indicates that the type includes an artificial empty C++17 base field
16878 that, prior to GCC 10.1, would prevent the type from being treated as
16879 a HFA or HVA. See PR94383 for details.
16880
16881 WARN_PSABI_NO_UNIQUE_ADDRESS
16882 Indicates that the type includes an empty [[no_unique_address]] field
16883 that, prior to GCC 10.1, would prevent the type from being treated as
16884 a HFA or HVA. */
16885 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16886 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
16887
16888 /* Walk down the type tree of TYPE counting consecutive base elements.
16889 If *MODEP is VOIDmode, then set it to the first valid floating point
16890 type. If a non-floating point type is found, or if a floating point
16891 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16892 otherwise return the count in the sub-tree.
16893
16894 The WARN_PSABI_FLAGS argument allows the caller to check whether this
16895 function has changed its behavior relative to earlier versions of GCC.
16896 Normally the argument should be nonnull and point to a zero-initialized
16897 variable. The function then records whether the ABI decision might
16898 be affected by a known fix to the ABI logic, setting the associated
16899 WARN_PSABI_* bits if so.
16900
16901 When the argument is instead a null pointer, the function tries to
16902 simulate the behavior of GCC before all such ABI fixes were made.
16903 This is useful to check whether the function returns something
16904 different after the ABI fixes. */
16905 static int
16906 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
16907 unsigned int *warn_psabi_flags)
16908 {
16909 machine_mode mode;
16910 HOST_WIDE_INT size;
16911
16912 if (aarch64_sve::builtin_type_p (type))
16913 return -1;
16914
16915 switch (TREE_CODE (type))
16916 {
16917 case REAL_TYPE:
16918 mode = TYPE_MODE (type);
16919 if (mode != DFmode && mode != SFmode
16920 && mode != TFmode && mode != HFmode)
16921 return -1;
16922
16923 if (*modep == VOIDmode)
16924 *modep = mode;
16925
16926 if (*modep == mode)
16927 return 1;
16928
16929 break;
16930
16931 case COMPLEX_TYPE:
16932 mode = TYPE_MODE (TREE_TYPE (type));
16933 if (mode != DFmode && mode != SFmode
16934 && mode != TFmode && mode != HFmode)
16935 return -1;
16936
16937 if (*modep == VOIDmode)
16938 *modep = mode;
16939
16940 if (*modep == mode)
16941 return 2;
16942
16943 break;
16944
16945 case VECTOR_TYPE:
16946 /* Use V2SImode and V4SImode as representatives of all 64-bit
16947 and 128-bit vector types. */
16948 size = int_size_in_bytes (type);
16949 switch (size)
16950 {
16951 case 8:
16952 mode = V2SImode;
16953 break;
16954 case 16:
16955 mode = V4SImode;
16956 break;
16957 default:
16958 return -1;
16959 }
16960
16961 if (*modep == VOIDmode)
16962 *modep = mode;
16963
16964 /* Vector modes are considered to be opaque: two vectors are
16965 equivalent for the purposes of being homogeneous aggregates
16966 if they are the same size. */
16967 if (*modep == mode)
16968 return 1;
16969
16970 break;
16971
16972 case ARRAY_TYPE:
16973 {
16974 int count;
16975 tree index = TYPE_DOMAIN (type);
16976
16977 /* Can't handle incomplete types nor sizes that are not
16978 fixed. */
16979 if (!COMPLETE_TYPE_P (type)
16980 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16981 return -1;
16982
16983 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
16984 warn_psabi_flags);
16985 if (count == -1
16986 || !index
16987 || !TYPE_MAX_VALUE (index)
16988 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16989 || !TYPE_MIN_VALUE (index)
16990 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16991 || count < 0)
16992 return -1;
16993
16994 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16995 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16996
16997 /* There must be no padding. */
16998 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16999 count * GET_MODE_BITSIZE (*modep)))
17000 return -1;
17001
17002 return count;
17003 }
17004
17005 case RECORD_TYPE:
17006 {
17007 int count = 0;
17008 int sub_count;
17009 tree field;
17010
17011 /* Can't handle incomplete types nor sizes that are not
17012 fixed. */
17013 if (!COMPLETE_TYPE_P (type)
17014 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17015 return -1;
17016
17017 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17018 {
17019 if (TREE_CODE (field) != FIELD_DECL)
17020 continue;
17021
17022 if (DECL_FIELD_ABI_IGNORED (field))
17023 {
17024 /* See whether this is something that earlier versions of
17025 GCC failed to ignore. */
17026 unsigned int flag;
17027 if (lookup_attribute ("no_unique_address",
17028 DECL_ATTRIBUTES (field)))
17029 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
17030 else if (cxx17_empty_base_field_p (field))
17031 flag = WARN_PSABI_EMPTY_CXX17_BASE;
17032 else
17033 /* No compatibility problem. */
17034 continue;
17035
17036 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
17037 if (warn_psabi_flags)
17038 {
17039 *warn_psabi_flags |= flag;
17040 continue;
17041 }
17042 }
17043
17044 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
17045 warn_psabi_flags);
17046 if (sub_count < 0)
17047 return -1;
17048 count += sub_count;
17049 }
17050
17051 /* There must be no padding. */
17052 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17053 count * GET_MODE_BITSIZE (*modep)))
17054 return -1;
17055
17056 return count;
17057 }
17058
17059 case UNION_TYPE:
17060 case QUAL_UNION_TYPE:
17061 {
17062 /* These aren't very interesting except in a degenerate case. */
17063 int count = 0;
17064 int sub_count;
17065 tree field;
17066
17067 /* Can't handle incomplete types nor sizes that are not
17068 fixed. */
17069 if (!COMPLETE_TYPE_P (type)
17070 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17071 return -1;
17072
17073 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17074 {
17075 if (TREE_CODE (field) != FIELD_DECL)
17076 continue;
17077
17078 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
17079 warn_psabi_flags);
17080 if (sub_count < 0)
17081 return -1;
17082 count = count > sub_count ? count : sub_count;
17083 }
17084
17085 /* There must be no padding. */
17086 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17087 count * GET_MODE_BITSIZE (*modep)))
17088 return -1;
17089
17090 return count;
17091 }
17092
17093 default:
17094 break;
17095 }
17096
17097 return -1;
17098 }
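
/* Worked examples (illustrative): for

     struct hfa { float x, y, z; };

   the walk above finds three consecutive SFmode elements and returns 3, so
   the struct qualifies as an HFA; adding an "int w" member makes the walk
   return -1.  Similarly,

     struct hva { int32x4_t a; int32x4_t b; };

   counts two V4SImode elements and qualifies as an HVA.  */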
17099
17100 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
17101 type as described in AAPCS64 \S 4.1.2.
17102
17103 See the comment above aarch64_composite_type_p for the notes on MODE. */
17104
17105 static bool
17106 aarch64_short_vector_p (const_tree type,
17107 machine_mode mode)
17108 {
17109 poly_int64 size = -1;
17110
17111 if (type && TREE_CODE (type) == VECTOR_TYPE)
17112 {
17113 if (aarch64_sve::builtin_type_p (type))
17114 return false;
17115 size = int_size_in_bytes (type);
17116 }
17117 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
17118 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
17119 {
17120 /* Rely only on the type, not the mode, when processing SVE types. */
17121 if (type && aarch64_some_values_include_pst_objects_p (type))
17122 /* Leave later code to report an error if SVE is disabled. */
17123 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
17124 else
17125 size = GET_MODE_SIZE (mode);
17126 }
17127 if (known_eq (size, 8) || known_eq (size, 16))
17128 {
17129 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
17130 they are being treated as scalable AAPCS64 types. */
17131 gcc_assert (!aarch64_sve_mode_p (mode));
17132 return true;
17133 }
17134 return false;
17135 }
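
/* For example (illustrative): an 8-byte uint8x8_t or a 16-byte float32x4_t
   is a short vector in the AAPCS64 sense, whereas an SVE svfloat32_t is
   rejected above regardless of its size.  */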
17136
17137 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
17138 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
17139 array types. The C99 floating-point complex types are also considered
17140 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
17141 types, which are GCC extensions and out of the scope of AAPCS64, are
17142 treated as composite types here as well.
17143
17144 Note that MODE itself is not sufficient in determining whether a type
17145 is such a composite type or not. This is because
17146 stor-layout.c:compute_record_mode may have already changed the MODE
17147 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
17148 structure with only one field may have its MODE set to the mode of the
17149 field. Also an integer mode whose size matches the size of the
17150 RECORD_TYPE type may be used to substitute the original mode
17151 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
17152 solely relied on. */
17153
17154 static bool
17155 aarch64_composite_type_p (const_tree type,
17156 machine_mode mode)
17157 {
17158 if (aarch64_short_vector_p (type, mode))
17159 return false;
17160
17161 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
17162 return true;
17163
17164 if (mode == BLKmode
17165 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
17166 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
17167 return true;
17168
17169 return false;
17170 }
17171
17172 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
17173 shall be passed or returned in simd/fp register(s) (providing these
17174 parameter passing registers are available).
17175
17176 Upon successful return, *COUNT returns the number of needed registers,
17177 *BASE_MODE returns the mode of the individual register and when IS_HAF
17178 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
17179 floating-point aggregate or a homogeneous short-vector aggregate.
17180
17181 SILENT_P is true if the function should refrain from reporting any
17182 diagnostics. This should only be used if the caller is certain that
17183 any ABI decisions would eventually come through this function with
17184 SILENT_P set to false. */
17185
17186 static bool
17187 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
17188 const_tree type,
17189 machine_mode *base_mode,
17190 int *count,
17191 bool *is_ha,
17192 bool silent_p)
17193 {
17194 if (is_ha != NULL) *is_ha = false;
17195
17196 machine_mode new_mode = VOIDmode;
17197 bool composite_p = aarch64_composite_type_p (type, mode);
17198
17199 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17200 || aarch64_short_vector_p (type, mode))
17201 {
17202 *count = 1;
17203 new_mode = mode;
17204 }
17205 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17206 {
17207 if (is_ha != NULL) *is_ha = true;
17208 *count = 2;
17209 new_mode = GET_MODE_INNER (mode);
17210 }
17211 else if (type && composite_p)
17212 {
17213 unsigned int warn_psabi_flags = 0;
17214 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17215 &warn_psabi_flags);
17216 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17217 {
17218 static unsigned last_reported_type_uid;
17219 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17220 int alt;
17221 if (!silent_p
17222 && warn_psabi
17223 && warn_psabi_flags
17224 && uid != last_reported_type_uid
17225 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17226 != ag_count))
17227 {
17228 const char *url
17229 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
17230 gcc_assert (alt == -1);
17231 last_reported_type_uid = uid;
17232 /* Use TYPE_MAIN_VARIANT to strip any redundant const
17233 qualification. */
17234 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17235 inform (input_location, "parameter passing for argument of "
17236 "type %qT with %<[[no_unique_address]]%> members "
17237 "changed %{in GCC 10.1%}",
17238 TYPE_MAIN_VARIANT (type), url);
17239 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17240 inform (input_location, "parameter passing for argument of "
17241 "type %qT when C++17 is enabled changed to match "
17242 "C++14 %{in GCC 10.1%}",
17243 TYPE_MAIN_VARIANT (type), url);
17244 }
17245
17246 if (is_ha != NULL) *is_ha = true;
17247 *count = ag_count;
17248 }
17249 else
17250 return false;
17251 }
17252 else
17253 return false;
17254
17255 gcc_assert (!aarch64_sve_mode_p (new_mode));
17256 *base_mode = new_mode;
17257 return true;
17258 }
17259
17260 /* Implement TARGET_STRUCT_VALUE_RTX. */
17261
17262 static rtx
17263 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17264 int incoming ATTRIBUTE_UNUSED)
17265 {
17266 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17267 }
17268
17269 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
17270 static bool
17271 aarch64_vector_mode_supported_p (machine_mode mode)
17272 {
17273 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17274 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
17275 }
17276
17277 /* Return the full-width SVE vector mode for element mode MODE, if one
17278 exists. */
17279 opt_machine_mode
17280 aarch64_full_sve_mode (scalar_mode mode)
17281 {
17282 switch (mode)
17283 {
17284 case E_DFmode:
17285 return VNx2DFmode;
17286 case E_SFmode:
17287 return VNx4SFmode;
17288 case E_HFmode:
17289 return VNx8HFmode;
17290 case E_BFmode:
17291 return VNx8BFmode;
17292 case E_DImode:
17293 return VNx2DImode;
17294 case E_SImode:
17295 return VNx4SImode;
17296 case E_HImode:
17297 return VNx8HImode;
17298 case E_QImode:
17299 return VNx16QImode;
17300 default:
17301 return opt_machine_mode ();
17302 }
17303 }
17304
17305 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17306 if it exists. */
17307 opt_machine_mode
17308 aarch64_vq_mode (scalar_mode mode)
17309 {
17310 switch (mode)
17311 {
17312 case E_DFmode:
17313 return V2DFmode;
17314 case E_SFmode:
17315 return V4SFmode;
17316 case E_HFmode:
17317 return V8HFmode;
17318 case E_BFmode:
17319 return V8BFmode;
17320 case E_SImode:
17321 return V4SImode;
17322 case E_HImode:
17323 return V8HImode;
17324 case E_QImode:
17325 return V16QImode;
17326 case E_DImode:
17327 return V2DImode;
17328 default:
17329 return opt_machine_mode ();
17330 }
17331 }
17332
17333 /* Return the appropriate SIMD container mode
17334    for MODE within a vector of WIDTH bits. */
17335 static machine_mode
17336 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
17337 {
17338 if (TARGET_SVE
17339 && maybe_ne (width, 128)
17340 && known_eq (width, BITS_PER_SVE_VECTOR))
17341 return aarch64_full_sve_mode (mode).else_mode (word_mode);
17342
17343 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
17344 if (TARGET_SIMD)
17345 {
17346 if (known_eq (width, 128))
17347 return aarch64_vq_mode (mode).else_mode (word_mode);
17348 else
17349 switch (mode)
17350 {
17351 case E_SFmode:
17352 return V2SFmode;
17353 case E_HFmode:
17354 return V4HFmode;
17355 case E_BFmode:
17356 return V4BFmode;
17357 case E_SImode:
17358 return V2SImode;
17359 case E_HImode:
17360 return V4HImode;
17361 case E_QImode:
17362 return V8QImode;
17363 default:
17364 break;
17365 }
17366 }
17367 return word_mode;
17368 }
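
/* Examples (illustrative): with TARGET_SIMD enabled, SFmode in a 128-bit
   container gives V4SFmode and in a 64-bit container gives V2SFmode; with
   TARGET_SVE and WIDTH equal to BITS_PER_SVE_VECTOR (and not known to be
   128), SFmode gives VNx4SFmode instead.  */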
17369
17370 static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
17371
17372 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
17373 and return whether the SVE mode should be preferred over the
17374 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
17375 static bool
17376 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
17377 {
17378 /* Take into account the aarch64-autovec-preference param if non-zero. */
17379 bool only_asimd_p = aarch64_autovec_preference == 1;
17380 bool only_sve_p = aarch64_autovec_preference == 2;
17381
17382 if (only_asimd_p)
17383 return false;
17384 if (only_sve_p)
17385 return true;
17386
17387 /* The preference in case of a tie in costs. */
17388 bool prefer_asimd = aarch64_autovec_preference == 3;
17389 bool prefer_sve = aarch64_autovec_preference == 4;
17390
17391 aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width;
17392
17393 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
17394 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
17395 /* If the CPU information does not have an SVE width registered, use the
17396 generic poly_int comparison that prefers SVE. If a preference is
17397 explicitly requested, avoid this path. */
17398 if (tune_width == SVE_SCALABLE
17399 && !prefer_asimd
17400 && !prefer_sve)
17401 return maybe_gt (nunits_sve, nunits_asimd);
17402
17403 /* Otherwise estimate the runtime width of the modes involved. */
17404 HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve);
17405 HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd);
17406
17407 /* Preferring SVE means picking it first unless the Advanced SIMD mode
17408 is clearly wider. */
17409 if (prefer_sve)
17410 return est_sve >= est_asimd;
17411 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
17412 is clearly wider. */
17413 if (prefer_asimd)
17414 return est_sve > est_asimd;
17415
17416 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
17417 return est_sve > est_asimd;
17418 }
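/* For example, when comparing VNx16QImode against V16QImode with no
   explicit preference (aarch64_autovec_preference == 0):

     - with an unknown tuning width (SVE_SCALABLE) we use maybe_gt on the
       poly_int element counts, which prefers the SVE mode;
     - with a tuned SVE width of 128 bits both estimates are 16 elements,
       so the tie goes to Advanced SIMD;
     - with a tuned SVE width of 256 bits the SVE estimate (32 elements)
       exceeds the Advanced SIMD one (16), so SVE is preferred.  */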
17419
17420 /* Return 128-bit container as the preferred SIMD mode for MODE. */
17421 static machine_mode
17422 aarch64_preferred_simd_mode (scalar_mode mode)
17423 {
17424 /* Take into account explicit auto-vectorization ISA preferences through
17425 aarch64_cmp_autovec_modes. */
17426 poly_int64 bits
17427 = (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
17428 ? BITS_PER_SVE_VECTOR : 128;
17429 return aarch64_simd_container_mode (mode, bits);
17430 }
17431
17432 /* Return a list of possible vector sizes for the vectorizer
17433 to iterate over. */
17434 static unsigned int
17435 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
17436 {
17437 static const machine_mode sve_modes[] = {
17438 /* Try using full vectors for all element types. */
17439 VNx16QImode,
17440
17441 /* Try using 16-bit containers for 8-bit elements and full vectors
17442 for wider elements. */
17443 VNx8QImode,
17444
17445 /* Try using 32-bit containers for 8-bit and 16-bit elements and
17446 full vectors for wider elements. */
17447 VNx4QImode,
17448
17449 /* Try using 64-bit containers for all element types. */
17450 VNx2QImode
17451 };
17452
17453 static const machine_mode advsimd_modes[] = {
17454 /* Try using 128-bit vectors for all element types. */
17455 V16QImode,
17456
17457 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17458 for wider elements. */
17459 V8QImode,
17460
17461 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17462 for wider elements.
17463
17464 TODO: We could support a limited form of V4QImode too, so that
17465 we use 32-bit vectors for 8-bit elements. */
17466 V4HImode,
17467
17468 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17469 for 64-bit elements.
17470
17471 TODO: We could similarly support limited forms of V2QImode and V2HImode
17472 for this case. */
17473 V2SImode
17474 };
17475
17476 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17477 This is because:
17478
17479 - If we can't use N-byte Advanced SIMD vectors then the placement
17480 doesn't matter; we'll just continue as though the Advanced SIMD
17481 entry didn't exist.
17482
17483 - If an SVE main loop with N bytes ends up being cheaper than an
17484 Advanced SIMD main loop with N bytes then by default we'll replace
17485 the Advanced SIMD version with the SVE one.
17486
17487 - If an Advanced SIMD main loop with N bytes ends up being cheaper
17488 than an SVE main loop with N bytes then by default we'll try to
17489 use the SVE loop to vectorize the epilogue instead. */
17490
17491 bool only_asimd_p = aarch64_autovec_preference == 1;
17492 bool only_sve_p = aarch64_autovec_preference == 2;
17493
17494 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
17495 unsigned int advsimd_i = 0;
17496
17497 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
17498 {
17499 if (sve_i < ARRAY_SIZE (sve_modes)
17500 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
17501 advsimd_modes[advsimd_i]))
17502 modes->safe_push (sve_modes[sve_i++]);
17503 else
17504 modes->safe_push (advsimd_modes[advsimd_i++]);
17505 }
17506 while (sve_i < ARRAY_SIZE (sve_modes))
17507 modes->safe_push (sve_modes[sve_i++]);
17508
17509 unsigned int flags = 0;
17510 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17511 can compare SVE against Advanced SIMD and so that we can compare
17512 multiple SVE vectorization approaches against each other. There's
17513 not really any point doing this for Advanced SIMD only, since the
17514 first mode that works should always be the best. */
17515 if (TARGET_SVE && aarch64_sve_compare_costs)
17516 flags |= VECT_COMPARE_COSTS;
17517 return flags;
17518 }
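/* For example, with generic tuning (sve_width == SVE_SCALABLE), no explicit
   aarch64-autovec-preference setting and TARGET_SVE enabled, the comparison
   above prefers every SVE entry, so the list comes out as:

     VNx16QI, VNx8QI, VNx4QI, VNx2QI, V16QI, V8QI, V4HI, V2SI

   whereas with tuning that fixes the SVE width at 128 bits each Advanced
   SIMD entry is pushed ahead of the equal-sized SVE one.  */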
17519
17520 /* Implement TARGET_MANGLE_TYPE. */
17521
17522 static const char *
17523 aarch64_mangle_type (const_tree type)
17524 {
17525 /* The AArch64 ABI documents say that "__va_list" has to be
17526 mangled as if it is in the "std" namespace. */
17527 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17528 return "St9__va_list";
17529
17530 /* Half-precision floating point types. */
17531 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
17532 {
17533 if (TYPE_MODE (type) == BFmode)
17534 return "u6__bf16";
17535 else
17536 return "Dh";
17537 }
17538
17539 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
17540 builtin types. */
17541 if (TYPE_NAME (type) != NULL)
17542 {
17543 const char *res;
17544 if ((res = aarch64_general_mangle_builtin_type (type))
17545 || (res = aarch64_sve::mangle_builtin_type (type)))
17546 return res;
17547 }
17548
17549 /* Use the default mangling. */
17550 return NULL;
17551 }
17552
17553 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
17554
17555 static bool
17556 aarch64_verify_type_context (location_t loc, type_context_kind context,
17557 const_tree type, bool silent_p)
17558 {
17559 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17560 }
17561
17562 /* Find the first rtx_insn before insn that will generate an assembly
17563 instruction. */
17564
17565 static rtx_insn *
17566 aarch64_prev_real_insn (rtx_insn *insn)
17567 {
17568 if (!insn)
17569 return NULL;
17570
17571 do
17572 {
17573 insn = prev_real_insn (insn);
17574 }
17575 while (insn && recog_memoized (insn) < 0);
17576
17577 return insn;
17578 }
17579
17580 static bool
17581 is_madd_op (enum attr_type t1)
17582 {
17583 unsigned int i;
17584 /* A number of these may be AArch32 only. */
17585 enum attr_type mlatypes[] = {
17586 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17587 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17588 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17589 };
17590
17591 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17592 {
17593 if (t1 == mlatypes[i])
17594 return true;
17595 }
17596
17597 return false;
17598 }
17599
17600 /* Check if there is a register dependency between a load and the insn
17601 for which we hold recog_data. */
17602
17603 static bool
17604 dep_between_memop_and_curr (rtx memop)
17605 {
17606 rtx load_reg;
17607 int opno;
17608
17609 gcc_assert (GET_CODE (memop) == SET);
17610
17611 if (!REG_P (SET_DEST (memop)))
17612 return false;
17613
17614 load_reg = SET_DEST (memop);
17615 for (opno = 1; opno < recog_data.n_operands; opno++)
17616 {
17617 rtx operand = recog_data.operand[opno];
17618 if (REG_P (operand)
17619 && reg_overlap_mentioned_p (load_reg, operand))
17620 return true;
17621
17622 }
17623 return false;
17624 }
17625
17626
17627 /* When working around the Cortex-A53 erratum 835769,
17628 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17629 instruction and has a preceding memory instruction such that a NOP
17630 should be inserted between them. */
17631
17632 bool
17633 aarch64_madd_needs_nop (rtx_insn* insn)
17634 {
17635 enum attr_type attr_type;
17636 rtx_insn *prev;
17637 rtx body;
17638
17639 if (!TARGET_FIX_ERR_A53_835769)
17640 return false;
17641
17642 if (!INSN_P (insn) || recog_memoized (insn) < 0)
17643 return false;
17644
17645 attr_type = get_attr_type (insn);
17646 if (!is_madd_op (attr_type))
17647 return false;
17648
17649 prev = aarch64_prev_real_insn (insn);
17650 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17651 Restore recog state to INSN to avoid state corruption. */
17652 extract_constrain_insn_cached (insn);
17653
17654 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17655 return false;
17656
17657 body = single_set (prev);
17658
17659 /* If the previous insn is a memory op and there is no dependency between
17660 it and the DImode madd, emit a NOP between them. If body is NULL then we
17661 have a complex memory operation, probably a load/store pair.
17662 Be conservative for now and emit a NOP. */
17663 if (GET_MODE (recog_data.operand[0]) == DImode
17664 && (!body || !dep_between_memop_and_curr (body)))
17665 return true;
17666
17667 return false;
17668
17669 }
17670
17671
17672 /* Implement FINAL_PRESCAN_INSN. */
17673
17674 void
17675 aarch64_final_prescan_insn (rtx_insn *insn)
17676 {
17677 if (aarch64_madd_needs_nop (insn))
17678 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17679 }
17680
17681
17682 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17683 instruction. */
17684
17685 bool
17686 aarch64_sve_index_immediate_p (rtx base_or_step)
17687 {
17688 return (CONST_INT_P (base_or_step)
17689 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17690 }
17691
17692 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17693 when applied to mode MODE. Negate X first if NEGATE_P is true. */
17694
17695 bool
17696 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17697 {
17698 rtx elt = unwrap_const_vec_duplicate (x);
17699 if (!CONST_INT_P (elt))
17700 return false;
17701
17702 HOST_WIDE_INT val = INTVAL (elt);
17703 if (negate_p)
17704 val = -val;
17705 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17706
17707 if (val & 0xff)
17708 return IN_RANGE (val, 0, 0xff);
17709 return IN_RANGE (val, 0, 0xff00);
17710 }
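/* The test above corresponds to the "#imm" and "#imm, LSL #8" forms of the
   SVE ADD/SUB immediate.  E.g., after any requested negation and masking to
   the element width:

     0x23    -> valid   (unshifted byte)
     0x2300  -> valid   (#0x23, LSL #8)
     0x123   -> invalid (needs bits in both bytes)
     0x12300 -> invalid (shifted value wider than a byte)  */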
17711
17712 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17713 instructions when applied to mode MODE. Negate X first if NEGATE_P
17714 is true. */
17715
17716 bool
17717 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17718 {
17719 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17720 return false;
17721
17722 /* After the optional negation, the immediate must be nonnegative.
17723 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17724 instead of SQADD Zn.B, Zn.B, #129. */
17725 rtx elt = unwrap_const_vec_duplicate (x);
17726 return negate_p == (INTVAL (elt) < 0);
17727 }
17728
17729 /* Return true if X is a valid immediate operand for an SVE logical
17730 instruction such as AND. */
17731
17732 bool
17733 aarch64_sve_bitmask_immediate_p (rtx x)
17734 {
17735 rtx elt;
17736
17737 return (const_vec_duplicate_p (x, &elt)
17738 && CONST_INT_P (elt)
17739 && aarch64_bitmask_imm (INTVAL (elt),
17740 GET_MODE_INNER (GET_MODE (x))));
17741 }
17742
17743 /* Return true if X is a valid immediate for the SVE DUP and CPY
17744 instructions. */
17745
17746 bool
17747 aarch64_sve_dup_immediate_p (rtx x)
17748 {
17749 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17750 if (!CONST_INT_P (x))
17751 return false;
17752
17753 HOST_WIDE_INT val = INTVAL (x);
17754 if (val & 0xff)
17755 return IN_RANGE (val, -0x80, 0x7f);
17756 return IN_RANGE (val, -0x8000, 0x7f00);
17757 }
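/* I.e. the immediate is either a signed byte (DUP ..., #imm) or a signed
   byte shifted left by 8 (DUP ..., #imm, LSL #8).  E.g.:

     0x7f   -> valid
     -0x80  -> valid
     0x7f00 -> valid   (#0x7f, LSL #8)
     0x7f01 -> invalid (both bytes populated)
     0x8000 -> invalid (out of range even with LSL #8)  */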
17758
17759 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17760 SIGNED_P says whether the operand is signed rather than unsigned. */
17761
17762 bool
17763 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17764 {
17765 x = unwrap_const_vec_duplicate (x);
17766 return (CONST_INT_P (x)
17767 && (signed_p
17768 ? IN_RANGE (INTVAL (x), -16, 15)
17769 : IN_RANGE (INTVAL (x), 0, 127)));
17770 }
17771
17772 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17773 instruction. Negate X first if NEGATE_P is true. */
17774
17775 bool
17776 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17777 {
17778 rtx elt;
17779 REAL_VALUE_TYPE r;
17780
17781 if (!const_vec_duplicate_p (x, &elt)
17782 || !CONST_DOUBLE_P (elt))
17783 return false;
17784
17785 r = *CONST_DOUBLE_REAL_VALUE (elt);
17786
17787 if (negate_p)
17788 r = real_value_negate (&r);
17789
17790 if (real_equal (&r, &dconst1))
17791 return true;
17792 if (real_equal (&r, &dconsthalf))
17793 return true;
17794 return false;
17795 }
17796
17797 /* Return true if X is a valid immediate operand for an SVE FMUL
17798 instruction. */
17799
17800 bool
17801 aarch64_sve_float_mul_immediate_p (rtx x)
17802 {
17803 rtx elt;
17804
17805 return (const_vec_duplicate_p (x, &elt)
17806 && CONST_DOUBLE_P (elt)
17807 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17808 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17809 }
17810
17811 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17812 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17813 is nonnull, use it to describe valid immediates. */
17814 static bool
17815 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17816 simd_immediate_info *info,
17817 enum simd_immediate_check which,
17818 simd_immediate_info::insn_type insn)
17819 {
17820 /* Try a 4-byte immediate with LSL. */
17821 for (unsigned int shift = 0; shift < 32; shift += 8)
17822 if ((val32 & (0xff << shift)) == val32)
17823 {
17824 if (info)
17825 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17826 simd_immediate_info::LSL, shift);
17827 return true;
17828 }
17829
17830 /* Try a 2-byte immediate with LSL. */
17831 unsigned int imm16 = val32 & 0xffff;
17832 if (imm16 == (val32 >> 16))
17833 for (unsigned int shift = 0; shift < 16; shift += 8)
17834 if ((imm16 & (0xff << shift)) == imm16)
17835 {
17836 if (info)
17837 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17838 simd_immediate_info::LSL, shift);
17839 return true;
17840 }
17841
17842 /* Try a 4-byte immediate with MSL, except for cases that MVN
17843 can handle. */
17844 if (which == AARCH64_CHECK_MOV)
17845 for (unsigned int shift = 8; shift < 24; shift += 8)
17846 {
17847 unsigned int low = (1 << shift) - 1;
17848 if (((val32 & (0xff << shift)) | low) == val32)
17849 {
17850 if (info)
17851 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17852 simd_immediate_info::MSL, shift);
17853 return true;
17854 }
17855 }
17856
17857 return false;
17858 }
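/* Worked examples (illustrative) of the immediates recorded in INFO:

     val32 == 0x00ab0000 -> (SImode, 0xab, LSL, 16)
     val32 == 0xab00ab00 -> (HImode, 0xab, LSL, 8)
     val32 == 0x0000abff -> (SImode, 0xab, MSL, 8), tried only for
                            AARCH64_CHECK_MOV, since the trailing ones
                            come from the MSL shift  */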
17859
17860 /* Return true if replicating VAL64 is a valid immediate for the
17861 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17862 use it to describe valid immediates. */
17863 static bool
17864 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17865 simd_immediate_info *info,
17866 enum simd_immediate_check which)
17867 {
17868 unsigned int val32 = val64 & 0xffffffff;
17869 unsigned int val16 = val64 & 0xffff;
17870 unsigned int val8 = val64 & 0xff;
17871
17872 if (val32 == (val64 >> 32))
17873 {
17874 if ((which & AARCH64_CHECK_ORR) != 0
17875 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17876 simd_immediate_info::MOV))
17877 return true;
17878
17879 if ((which & AARCH64_CHECK_BIC) != 0
17880 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17881 simd_immediate_info::MVN))
17882 return true;
17883
17884 /* Try using a replicated byte. */
17885 if (which == AARCH64_CHECK_MOV
17886 && val16 == (val32 >> 16)
17887 && val8 == (val16 >> 8))
17888 {
17889 if (info)
17890 *info = simd_immediate_info (QImode, val8);
17891 return true;
17892 }
17893 }
17894
17895 /* Try using a bit-to-bytemask. */
17896 if (which == AARCH64_CHECK_MOV)
17897 {
17898 unsigned int i;
17899 for (i = 0; i < 64; i += 8)
17900 {
17901 unsigned char byte = (val64 >> i) & 0xff;
17902 if (byte != 0 && byte != 0xff)
17903 break;
17904 }
17905 if (i == 64)
17906 {
17907 if (info)
17908 *info = simd_immediate_info (DImode, val64);
17909 return true;
17910 }
17911 }
17912 return false;
17913 }
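/* E.g. for the bit-to-bytemask form: val64 == 0xff00ffff00ff0000 has every
   byte equal to 0x00 or 0xff, so it is accepted as a 64-bit MOV immediate
   even though no narrower replicated form matches.  */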
17914
17915 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17916 instruction. If INFO is nonnull, use it to describe valid immediates. */
17917
17918 static bool
17919 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17920 simd_immediate_info *info)
17921 {
17922 scalar_int_mode mode = DImode;
17923 unsigned int val32 = val64 & 0xffffffff;
17924 if (val32 == (val64 >> 32))
17925 {
17926 mode = SImode;
17927 unsigned int val16 = val32 & 0xffff;
17928 if (val16 == (val32 >> 16))
17929 {
17930 mode = HImode;
17931 unsigned int val8 = val16 & 0xff;
17932 if (val8 == (val16 >> 8))
17933 mode = QImode;
17934 }
17935 }
17936 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17937 if (IN_RANGE (val, -0x80, 0x7f))
17938 {
17939 /* DUP with no shift. */
17940 if (info)
17941 *info = simd_immediate_info (mode, val);
17942 return true;
17943 }
17944 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17945 {
17946 /* DUP with LSL #8. */
17947 if (info)
17948 *info = simd_immediate_info (mode, val);
17949 return true;
17950 }
17951 if (aarch64_bitmask_imm (val64, mode))
17952 {
17953 /* DUPM. */
17954 if (info)
17955 *info = simd_immediate_info (mode, val);
17956 return true;
17957 }
17958 return false;
17959 }
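/* E.g.:

     0x0101010101010101 -> narrows to QImode 1      -> DUP .B, #1
     0x2000200020002000 -> narrows to HImode 0x2000 -> DUP .H, #32, LSL #8
     0x00ff00ff00ff00ff -> narrows to HImode 0xff, which is out of DUP
                           range but is a valid bitmask immediate -> DUPM  */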
17960
17961 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17962
17963 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17964
17965 where PATTERN is the svpattern as a CONST_INT and where ZERO
17966 is a zero constant of the required PTRUE mode (which can have
17967 fewer elements than X's mode, if zero bits are significant).
17968
17969 If so, and if INFO is nonnull, describe the immediate in INFO. */
17970 bool
17971 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17972 {
17973 if (GET_CODE (x) != CONST)
17974 return false;
17975
17976 x = XEXP (x, 0);
17977 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17978 return false;
17979
17980 if (info)
17981 {
17982 aarch64_svpattern pattern
17983 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17984 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17985 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17986 *info = simd_immediate_info (int_mode, pattern);
17987 }
17988 return true;
17989 }
17990
17991 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17992 it to describe valid immediates. */
17993
17994 static bool
17995 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17996 {
17997 if (aarch64_sve_ptrue_svpattern_p (x, info))
17998 return true;
17999
18000 if (x == CONST0_RTX (GET_MODE (x)))
18001 {
18002 if (info)
18003 *info = simd_immediate_info (DImode, 0);
18004 return true;
18005 }
18006
18007 /* Analyze the value as a VNx16BImode. This should be relatively
18008 efficient, since rtx_vector_builder has enough built-in capacity
18009 to store all VLA predicate constants without needing the heap. */
18010 rtx_vector_builder builder;
18011 if (!aarch64_get_sve_pred_bits (builder, x))
18012 return false;
18013
18014 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
18015 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
18016 {
18017 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
18018 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
18019 if (pattern != AARCH64_NUM_SVPATTERNS)
18020 {
18021 if (info)
18022 {
18023 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
18024 *info = simd_immediate_info (int_mode, pattern);
18025 }
18026 return true;
18027 }
18028 }
18029 return false;
18030 }
18031
18032 /* Return true if OP is a valid SIMD immediate for the operation
18033 described by WHICH. If INFO is nonnull, use it to describe valid
18034 immediates. */
18035 bool
18036 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
18037 enum simd_immediate_check which)
18038 {
18039 machine_mode mode = GET_MODE (op);
18040 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18041 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
18042 return false;
18043
18044 if (vec_flags & VEC_SVE_PRED)
18045 return aarch64_sve_pred_valid_immediate (op, info);
18046
18047 scalar_mode elt_mode = GET_MODE_INNER (mode);
18048 rtx base, step;
18049 unsigned int n_elts;
18050 if (GET_CODE (op) == CONST_VECTOR
18051 && CONST_VECTOR_DUPLICATE_P (op))
18052 n_elts = CONST_VECTOR_NPATTERNS (op);
18053 else if ((vec_flags & VEC_SVE_DATA)
18054 && const_vec_series_p (op, &base, &step))
18055 {
18056 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
18057 if (!aarch64_sve_index_immediate_p (base)
18058 || !aarch64_sve_index_immediate_p (step))
18059 return false;
18060
18061 if (info)
18062 {
18063 /* Get the corresponding container mode. E.g. an INDEX on V2SI
18064 should yield two integer values per 128-bit block, meaning
18065 that we need to treat it in the same way as V2DI and then
18066 ignore the upper 32 bits of each element. */
18067 elt_mode = aarch64_sve_container_int_mode (mode);
18068 *info = simd_immediate_info (elt_mode, base, step);
18069 }
18070 return true;
18071 }
18072 else if (GET_CODE (op) == CONST_VECTOR
18073 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
18074 /* N_ELTS set above. */;
18075 else
18076 return false;
18077
18078 scalar_float_mode elt_float_mode;
18079 if (n_elts == 1
18080 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
18081 {
18082 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
18083 if (aarch64_float_const_zero_rtx_p (elt)
18084 || aarch64_float_const_representable_p (elt))
18085 {
18086 if (info)
18087 *info = simd_immediate_info (elt_float_mode, elt);
18088 return true;
18089 }
18090 }
18091
18092 /* If all elements in an SVE vector have the same value, we have a free
18093 choice between using the element mode and using the container mode.
18094 Using the element mode means that unused parts of the vector are
18095 duplicates of the used elements, while using the container mode means
18096 that the unused parts are an extension of the used elements. Using the
18097 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
18098 for its container mode VNx4SI while 0x00000101 isn't.
18099
18100 If not all elements in an SVE vector have the same value, we need the
18101 transition from one element to the next to occur at container boundaries.
18102 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
18103 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
18104 scalar_int_mode elt_int_mode;
18105 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
18106 elt_int_mode = aarch64_sve_container_int_mode (mode);
18107 else
18108 elt_int_mode = int_mode_for_mode (elt_mode).require ();
18109
18110 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
18111 if (elt_size > 8)
18112 return false;
18113
18114 /* Expand the vector constant out into a byte vector, with the least
18115 significant byte of the register first. */
18116 auto_vec<unsigned char, 16> bytes;
18117 bytes.reserve (n_elts * elt_size);
18118 for (unsigned int i = 0; i < n_elts; i++)
18119 {
18120 /* The vector is provided in GCC's endian-neutral fashion.
18121 For aarch64_be Advanced SIMD, it must be laid out in the vector
18122 register in reverse order. */
18123 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
18124 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
18125
18126 if (elt_mode != elt_int_mode)
18127 elt = gen_lowpart (elt_int_mode, elt);
18128
18129 if (!CONST_INT_P (elt))
18130 return false;
18131
18132 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
18133 for (unsigned int byte = 0; byte < elt_size; byte++)
18134 {
18135 bytes.quick_push (elt_val & 0xff);
18136 elt_val >>= BITS_PER_UNIT;
18137 }
18138 }
18139
18140 /* The immediate must repeat every eight bytes. */
18141 unsigned int nbytes = bytes.length ();
18142 for (unsigned i = 8; i < nbytes; ++i)
18143 if (bytes[i] != bytes[i - 8])
18144 return false;
18145
18146 /* Get the repeating 8-byte value as an integer. No endian correction
18147 is needed here because bytes is already in lsb-first order. */
18148 unsigned HOST_WIDE_INT val64 = 0;
18149 for (unsigned int i = 0; i < 8; i++)
18150 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
18151 << (i * BITS_PER_UNIT));
18152
18153 if (vec_flags & VEC_SVE_DATA)
18154 return aarch64_sve_valid_immediate (val64, info);
18155 else
18156 return aarch64_advsimd_valid_immediate (val64, info, which);
18157 }
18158
18159 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
18160 has a step in the range of INDEX. Return the index expression if so,
18161 otherwise return null. */
18162 rtx
18163 aarch64_check_zero_based_sve_index_immediate (rtx x)
18164 {
18165 rtx base, step;
18166 if (const_vec_series_p (x, &base, &step)
18167 && base == const0_rtx
18168 && aarch64_sve_index_immediate_p (step))
18169 return step;
18170 return NULL_RTX;
18171 }
18172
18173 /* Check whether immediate shift constants are within range. */
18174 bool
18175 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
18176 {
18177 x = unwrap_const_vec_duplicate (x);
18178 if (!CONST_INT_P (x))
18179 return false;
18180 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
18181 if (left)
18182 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
18183 else
18184 return IN_RANGE (INTVAL (x), 1, bit_width);
18185 }
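/* E.g. for V4SImode (32-bit elements): immediate left shifts of 0-31 are
   accepted, while immediate right shifts must be in the range 1-32,
   matching the shift amounts the vector shift instructions can encode.  */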
18186
18187 /* Return the bitmask CONST_INT to select the bits required by a zero extract
18188 operation of width WIDTH at bit position POS. */
18189
18190 rtx
18191 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
18192 {
18193 gcc_assert (CONST_INT_P (width));
18194 gcc_assert (CONST_INT_P (pos));
18195
18196 unsigned HOST_WIDE_INT mask
18197 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
18198 return GEN_INT (mask << UINTVAL (pos));
18199 }
18200
18201 bool
18202 aarch64_mov_operand_p (rtx x, machine_mode mode)
18203 {
18204 if (GET_CODE (x) == HIGH
18205 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
18206 return true;
18207
18208 if (CONST_INT_P (x))
18209 return true;
18210
18211 if (VECTOR_MODE_P (GET_MODE (x)))
18212 {
18213 /* Require predicate constants to be VNx16BI before RA, so that we
18214 force everything to have a canonical form. */
18215 if (!lra_in_progress
18216 && !reload_completed
18217 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
18218 && GET_MODE (x) != VNx16BImode)
18219 return false;
18220
18221 return aarch64_simd_valid_immediate (x, NULL);
18222 }
18223
18224 x = strip_salt (x);
18225 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
18226 return true;
18227
18228 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
18229 return true;
18230
18231 return aarch64_classify_symbolic_expression (x)
18232 == SYMBOL_TINY_ABSOLUTE;
18233 }
18234
18235 /* Return a const_int vector of VAL. */
18236 rtx
18237 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
18238 {
18239 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
18240 return gen_const_vec_duplicate (mode, c);
18241 }
18242
18243 /* Check OP is a legal scalar immediate for the MOVI instruction. */
18244
18245 bool
18246 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
18247 {
18248 machine_mode vmode;
18249
18250 vmode = aarch64_simd_container_mode (mode, 64);
18251 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
18252 return aarch64_simd_valid_immediate (op_v, NULL);
18253 }
18254
18255 /* Construct and return a PARALLEL RTX vector with elements numbering the
18256 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18257 the vector - from the perspective of the architecture. This does not
18258 line up with GCC's perspective on lane numbers, so we end up with
18259 different masks depending on our target endian-ness. The diagram
18260 below may help. We must draw the distinction when building masks
18261 which select one half of the vector. An instruction selecting
18262 architectural low-lanes for a big-endian target, must be described using
18263 a mask selecting GCC high-lanes.
18264
18265 Big-Endian Little-Endian
18266
18267 GCC 0 1 2 3 3 2 1 0
18268 | x | x | x | x | | x | x | x | x |
18269 Architecture 3 2 1 0 3 2 1 0
18270
18271 Low Mask: { 2, 3 } { 0, 1 }
18272 High Mask: { 0, 1 } { 2, 3 }
18273
18274 MODE Is the mode of the vector and NUNITS is the number of units in it. */
18275
18276 rtx
18277 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
18278 {
18279 rtvec v = rtvec_alloc (nunits / 2);
18280 int high_base = nunits / 2;
18281 int low_base = 0;
18282 int base;
18283 rtx t1;
18284 int i;
18285
18286 if (BYTES_BIG_ENDIAN)
18287 base = high ? low_base : high_base;
18288 else
18289 base = high ? high_base : low_base;
18290
18291 for (i = 0; i < nunits / 2; i++)
18292 RTVEC_ELT (v, i) = GEN_INT (base + i);
18293
18294 t1 = gen_rtx_PARALLEL (mode, v);
18295 return t1;
18296 }
18297
18298 /* Check OP for validity as a PARALLEL RTX vector with elements
18299 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
18300 from the perspective of the architecture. See the diagram above
18301 aarch64_simd_vect_par_cnst_half for more details. */
18302
18303 bool
18304 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
18305 bool high)
18306 {
18307 int nelts;
18308 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
18309 return false;
18310
18311 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
18312 HOST_WIDE_INT count_op = XVECLEN (op, 0);
18313 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18314 int i = 0;
18315
18316 if (count_op != count_ideal)
18317 return false;
18318
18319 for (i = 0; i < count_ideal; i++)
18320 {
18321 rtx elt_op = XVECEXP (op, 0, i);
18322 rtx elt_ideal = XVECEXP (ideal, 0, i);
18323
18324 if (!CONST_INT_P (elt_op)
18325 || INTVAL (elt_ideal) != INTVAL (elt_op))
18326 return false;
18327 }
18328 return true;
18329 }
18330
18331 /* Return a PARALLEL containing NELTS elements, with element I equal
18332 to BASE + I * STEP. */
18333
18334 rtx
18335 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18336 {
18337 rtvec vec = rtvec_alloc (nelts);
18338 for (unsigned int i = 0; i < nelts; ++i)
18339 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18340 return gen_rtx_PARALLEL (VOIDmode, vec);
18341 }
18342
18343 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18344 series with step STEP. */
18345
18346 bool
18347 aarch64_stepped_int_parallel_p (rtx op, int step)
18348 {
18349 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18350 return false;
18351
18352 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18353 for (int i = 1; i < XVECLEN (op, 0); ++i)
18354 if (!CONST_INT_P (XVECEXP (op, 0, i))
18355 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18356 return false;
18357
18358 return true;
18359 }
18360
18361 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
18362 HIGH (exclusive). */
18363 void
18364 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18365 const_tree exp)
18366 {
18367 HOST_WIDE_INT lane;
18368 gcc_assert (CONST_INT_P (operand));
18369 lane = INTVAL (operand);
18370
18371 if (lane < low || lane >= high)
18372 {
18373 if (exp)
18374 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
18375 else
18376 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
18377 }
18378 }
18379
18380 /* Peform endian correction on lane number N, which indexes a vector
18381 of mode MODE, and return the result as an SImode rtx. */
18382
18383 rtx
18384 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18385 {
18386 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18387 }
18388
18389 /* Return TRUE if OP is a valid vector addressing mode. */
18390
18391 bool
18392 aarch64_simd_mem_operand_p (rtx op)
18393 {
18394 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
18395 || REG_P (XEXP (op, 0)));
18396 }
18397
18398 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
18399
18400 bool
18401 aarch64_sve_ld1r_operand_p (rtx op)
18402 {
18403 struct aarch64_address_info addr;
18404 scalar_mode mode;
18405
18406 return (MEM_P (op)
18407 && is_a <scalar_mode> (GET_MODE (op), &mode)
18408 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18409 && addr.type == ADDRESS_REG_IMM
18410 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18411 }
18412
18413 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18414 where the size of the read data is specified by `mode` and the size of the
18415 vector elements is specified by `elem_mode`. */
18416 bool
18417 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18418 scalar_mode elem_mode)
18419 {
18420 struct aarch64_address_info addr;
18421 if (!MEM_P (op)
18422 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18423 return false;
18424
18425 if (addr.type == ADDRESS_REG_IMM)
18426 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
18427
18428 if (addr.type == ADDRESS_REG_REG)
18429 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18430
18431 return false;
18432 }
18433
18434 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
18435 bool
18436 aarch64_sve_ld1rq_operand_p (rtx op)
18437 {
18438 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18439 GET_MODE_INNER (GET_MODE (op)));
18440 }
18441
18442 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18443 accessing a vector where the element size is specified by `elem_mode`. */
18444 bool
18445 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18446 {
18447 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18448 }
18449
18450 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
18451 bool
18452 aarch64_sve_ldff1_operand_p (rtx op)
18453 {
18454 if (!MEM_P (op))
18455 return false;
18456
18457 struct aarch64_address_info addr;
18458 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18459 return false;
18460
18461 if (addr.type == ADDRESS_REG_IMM)
18462 return known_eq (addr.const_offset, 0);
18463
18464 return addr.type == ADDRESS_REG_REG;
18465 }
18466
18467 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
18468 bool
18469 aarch64_sve_ldnf1_operand_p (rtx op)
18470 {
18471 struct aarch64_address_info addr;
18472
18473 return (MEM_P (op)
18474 && aarch64_classify_address (&addr, XEXP (op, 0),
18475 GET_MODE (op), false)
18476 && addr.type == ADDRESS_REG_IMM);
18477 }
18478
18479 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18480 The conditions for STR are the same. */
18481 bool
18482 aarch64_sve_ldr_operand_p (rtx op)
18483 {
18484 struct aarch64_address_info addr;
18485
18486 return (MEM_P (op)
18487 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18488 false, ADDR_QUERY_ANY)
18489 && addr.type == ADDRESS_REG_IMM);
18490 }
18491
18492 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18493 addressing memory of mode MODE. */
18494 bool
18495 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18496 {
18497 struct aarch64_address_info addr;
18498 if (!aarch64_classify_address (&addr, op, mode, false))
18499 return false;
18500
18501 if (addr.type == ADDRESS_REG_IMM)
18502 return known_eq (addr.const_offset, 0);
18503
18504 return addr.type == ADDRESS_REG_REG;
18505 }
18506
18507 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18508 We need to be able to access the individual pieces, so the range
18509 is different from LD[234] and ST[234]. */
18510 bool
18511 aarch64_sve_struct_memory_operand_p (rtx op)
18512 {
18513 if (!MEM_P (op))
18514 return false;
18515
18516 machine_mode mode = GET_MODE (op);
18517 struct aarch64_address_info addr;
18518 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18519 ADDR_QUERY_ANY)
18520 || addr.type != ADDRESS_REG_IMM)
18521 return false;
18522
18523 poly_int64 first = addr.const_offset;
18524 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18525 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18526 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18527 }
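/* E.g. for a three-vector tuple mode, an offset of 5 vectors is accepted
   because both the first piece (+5 vectors) and the last piece (+7 vectors)
   lie in the supported [-8, 7] range, whereas an offset of 6 vectors is
   rejected because the last piece would be at +8.  */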
18528
18529 /* Emit a register copy from operand to operand, taking care not to
18530 early-clobber source registers in the process.
18531
18532 COUNT is the number of components into which the copy needs to be
18533 decomposed. */
18534 void
18535 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
18536 unsigned int count)
18537 {
18538 unsigned int i;
18539 int rdest = REGNO (operands[0]);
18540 int rsrc = REGNO (operands[1]);
18541
18542 if (!reg_overlap_mentioned_p (operands[0], operands[1])
18543 || rdest < rsrc)
18544 for (i = 0; i < count; i++)
18545 emit_move_insn (gen_rtx_REG (mode, rdest + i),
18546 gen_rtx_REG (mode, rsrc + i));
18547 else
18548 for (i = 0; i < count; i++)
18549 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18550 gen_rtx_REG (mode, rsrc + count - i - 1));
18551 }
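/* E.g. copying a two-register value from V9:V10 to V10:V11 overlaps with
   RDEST > RSRC, so the loop above copies the highest-numbered component
   first (V11 <- V10, then V10 <- V9) to avoid clobbering V10 before it has
   been read.  */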
18552
18553 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
18554 one of VSTRUCT modes: OI, CI, or XI. */
18555 int
18556 aarch64_simd_attr_length_rglist (machine_mode mode)
18557 {
18558 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
18559 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
18560 }
18561
18562 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
18563 alignment of a vector to 128 bits. SVE predicates have an alignment of
18564 16 bits. */
18565 static HOST_WIDE_INT
18566 aarch64_simd_vector_alignment (const_tree type)
18567 {
18568 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18569 be set for non-predicate vectors of booleans. Modes are the most
18570 direct way we have of identifying real SVE predicate types. */
18571 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18572 return 16;
18573 widest_int min_size
18574 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18575 return wi::umin (min_size, 128).to_uhwi ();
18576 }
18577
18578 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
18579 static poly_uint64
18580 aarch64_vectorize_preferred_vector_alignment (const_tree type)
18581 {
18582 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18583 {
18584 /* If the length of the vector is fixed, try to align to that length,
18585 otherwise don't try to align at all. */
18586 HOST_WIDE_INT result;
18587 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18588 result = TYPE_ALIGN (TREE_TYPE (type));
18589 return result;
18590 }
18591 return TYPE_ALIGN (type);
18592 }
18593
18594 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
18595 static bool
18596 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18597 {
18598 if (is_packed)
18599 return false;
18600
18601 /* For fixed-length vectors, check that the vectorizer will aim for
18602 full-vector alignment. This isn't true for generic GCC vectors
18603 that are wider than the ABI maximum of 128 bits. */
18604 poly_uint64 preferred_alignment =
18605 aarch64_vectorize_preferred_vector_alignment (type);
18606 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
18607 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18608 preferred_alignment))
18609 return false;
18610
18611 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
18612 return true;
18613 }
18614
18615 /* Return true if the vector misalignment factor is supported by the
18616 target. */
18617 static bool
18618 aarch64_builtin_support_vector_misalignment (machine_mode mode,
18619 const_tree type, int misalignment,
18620 bool is_packed)
18621 {
18622 if (TARGET_SIMD && STRICT_ALIGNMENT)
18623 {
18624 /* Return if movmisalign pattern is not supported for this mode. */
18625 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18626 return false;
18627
18628 /* Misalignment factor is unknown at compile time. */
18629 if (misalignment == -1)
18630 return false;
18631 }
18632 return default_builtin_support_vector_misalignment (mode, type, misalignment,
18633 is_packed);
18634 }
18635
18636 /* If VALS is a vector constant that can be loaded into a register
18637 using DUP, generate instructions to do so and return an RTX to
18638 assign to the register. Otherwise return NULL_RTX. */
18639 static rtx
18640 aarch64_simd_dup_constant (rtx vals)
18641 {
18642 machine_mode mode = GET_MODE (vals);
18643 machine_mode inner_mode = GET_MODE_INNER (mode);
18644 rtx x;
18645
18646 if (!const_vec_duplicate_p (vals, &x))
18647 return NULL_RTX;
18648
18649 /* We can load this constant by using DUP and a constant in a
18650 single ARM register. This will be cheaper than a vector
18651 load. */
18652 x = copy_to_mode_reg (inner_mode, x);
18653 return gen_vec_duplicate (mode, x);
18654 }
18655
18656
18657 /* Generate code to load VALS, which is a PARALLEL containing only
18658 constants (for vec_init) or CONST_VECTOR, efficiently into a
18659 register. Returns an RTX to copy into the register, or NULL_RTX
18660 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
18661 static rtx
18662 aarch64_simd_make_constant (rtx vals)
18663 {
18664 machine_mode mode = GET_MODE (vals);
18665 rtx const_dup;
18666 rtx const_vec = NULL_RTX;
18667 int n_const = 0;
18668 int i;
18669
18670 if (GET_CODE (vals) == CONST_VECTOR)
18671 const_vec = vals;
18672 else if (GET_CODE (vals) == PARALLEL)
18673 {
18674 /* A CONST_VECTOR must contain only CONST_INTs and
18675 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18676 Only store valid constants in a CONST_VECTOR. */
18677 int n_elts = XVECLEN (vals, 0);
18678 for (i = 0; i < n_elts; ++i)
18679 {
18680 rtx x = XVECEXP (vals, 0, i);
18681 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18682 n_const++;
18683 }
18684 if (n_const == n_elts)
18685 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18686 }
18687 else
18688 gcc_unreachable ();
18689
18690 if (const_vec != NULL_RTX
18691 && aarch64_simd_valid_immediate (const_vec, NULL))
18692 /* Load using MOVI/MVNI. */
18693 return const_vec;
18694 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18695 /* Loaded using DUP. */
18696 return const_dup;
18697 else if (const_vec != NULL_RTX)
18698 /* Load from constant pool. We cannot take advantage of single-cycle
18699 LD1 because we need a PC-relative addressing mode. */
18700 return const_vec;
18701 else
18702 /* A PARALLEL containing something not valid inside CONST_VECTOR.
18703 We cannot construct an initializer. */
18704 return NULL_RTX;
18705 }
18706
18707 /* Expand a vector initialisation sequence, such that TARGET is
18708 initialised to contain VALS. */
18709
18710 void
18711 aarch64_expand_vector_init (rtx target, rtx vals)
18712 {
18713 machine_mode mode = GET_MODE (target);
18714 scalar_mode inner_mode = GET_MODE_INNER (mode);
18715 /* The number of vector elements. */
18716 int n_elts = XVECLEN (vals, 0);
18717 /* The number of vector elements which are not constant. */
18718 int n_var = 0;
18719 rtx any_const = NULL_RTX;
18720 /* The first element of vals. */
18721 rtx v0 = XVECEXP (vals, 0, 0);
18722 bool all_same = true;
18723
18724 /* This is a special vec_init<M><N> where N is not an element mode but a
18725 vector mode with half the elements of M. We expect to find two entries
18726 of mode N in VALS and we must put their concatentation into TARGET. */
18727 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18728 {
18729 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18730 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18731 rtx lo = XVECEXP (vals, 0, 0);
18732 rtx hi = XVECEXP (vals, 0, 1);
18733 machine_mode narrow_mode = GET_MODE (lo);
18734 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18735 gcc_assert (narrow_mode == GET_MODE (hi));
18736
18737 /* When we want to concatenate a half-width vector with zeroes we can
18738 use the aarch64_combinez[_be] patterns. Just make sure that the
18739 zeroes are in the right half. */
18740 if (BYTES_BIG_ENDIAN
18741 && aarch64_simd_imm_zero (lo, narrow_mode)
18742 && general_operand (hi, narrow_mode))
18743 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18744 else if (!BYTES_BIG_ENDIAN
18745 && aarch64_simd_imm_zero (hi, narrow_mode)
18746 && general_operand (lo, narrow_mode))
18747 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18748 else
18749 {
18750 /* Else create the two half-width registers and combine them. */
18751 if (!REG_P (lo))
18752 lo = force_reg (GET_MODE (lo), lo);
18753 if (!REG_P (hi))
18754 hi = force_reg (GET_MODE (hi), hi);
18755
18756 if (BYTES_BIG_ENDIAN)
18757 std::swap (lo, hi);
18758 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18759 }
18760 return;
18761 }
18762
18763 /* Count the number of variable elements to initialise. */
18764 for (int i = 0; i < n_elts; ++i)
18765 {
18766 rtx x = XVECEXP (vals, 0, i);
18767 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18768 ++n_var;
18769 else
18770 any_const = x;
18771
18772 all_same &= rtx_equal_p (x, v0);
18773 }
18774
18775 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18776 how best to handle this. */
18777 if (n_var == 0)
18778 {
18779 rtx constant = aarch64_simd_make_constant (vals);
18780 if (constant != NULL_RTX)
18781 {
18782 emit_move_insn (target, constant);
18783 return;
18784 }
18785 }
18786
18787 /* Splat a single non-constant element if we can. */
18788 if (all_same)
18789 {
18790 rtx x = copy_to_mode_reg (inner_mode, v0);
18791 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18792 return;
18793 }
18794
18795 enum insn_code icode = optab_handler (vec_set_optab, mode);
18796 gcc_assert (icode != CODE_FOR_nothing);
18797
18798 /* If there are only variable elements, try to optimize
18799 the insertion using dup for the most common element
18800 followed by insertions. */
18801
18802 /* The algorithm will fill matches[*][0] with the earliest matching element,
18803 and matches[X][1] with the count of duplicate elements (if X is the
18804 earliest element which has duplicates). */
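/* E.g. for VALS == {a, b, a, a} the loops below produce
   matches[0] == {0, 3}, matches[1] == {1, 1}, matches[2] == {0, 0} and
   matches[3] == {0, 0}, so MAXELEMENT == 0 with MAXV == 3: we DUP 'a'
   into TARGET and then only insert 'b' into lane 1.  */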
18805
18806 if (n_var == n_elts && n_elts <= 16)
18807 {
18808 int matches[16][2] = {0};
18809 for (int i = 0; i < n_elts; i++)
18810 {
18811 for (int j = 0; j <= i; j++)
18812 {
18813 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18814 {
18815 matches[i][0] = j;
18816 matches[j][1]++;
18817 break;
18818 }
18819 }
18820 }
18821 int maxelement = 0;
18822 int maxv = 0;
18823 for (int i = 0; i < n_elts; i++)
18824 if (matches[i][1] > maxv)
18825 {
18826 maxelement = i;
18827 maxv = matches[i][1];
18828 }
18829
18830 /* Create a duplicate of the most common element, unless all elements
18831 are equally useless to us, in which case just immediately set the
18832 vector register using the first element. */
18833
18834 if (maxv == 1)
18835 {
18836 /* For vectors of two 64-bit elements, we can do even better. */
18837 if (n_elts == 2
18838 && (inner_mode == E_DImode
18839 || inner_mode == E_DFmode))
18840
18841 {
18842 rtx x0 = XVECEXP (vals, 0, 0);
18843 rtx x1 = XVECEXP (vals, 0, 1);
18844 /* Combine can pick up this case, but handling it directly
18845 here leaves clearer RTL.
18846
18847 This is load_pair_lanes<mode>, and also gives us a clean-up
18848 for store_pair_lanes<mode>. */
18849 if (memory_operand (x0, inner_mode)
18850 && memory_operand (x1, inner_mode)
18851 && !STRICT_ALIGNMENT
18852 && rtx_equal_p (XEXP (x1, 0),
18853 plus_constant (Pmode,
18854 XEXP (x0, 0),
18855 GET_MODE_SIZE (inner_mode))))
18856 {
18857 rtx t;
18858 if (inner_mode == DFmode)
18859 t = gen_load_pair_lanesdf (target, x0, x1);
18860 else
18861 t = gen_load_pair_lanesdi (target, x0, x1);
18862 emit_insn (t);
18863 return;
18864 }
18865 }
18866 /* The subreg-move sequence below will move into lane zero of the
18867 vector register. For big-endian we want that position to hold
18868 the last element of VALS. */
18869 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18870 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18871 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18872 }
18873 else
18874 {
18875 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18876 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18877 }
18878
18879 /* Insert the rest. */
18880 for (int i = 0; i < n_elts; i++)
18881 {
18882 rtx x = XVECEXP (vals, 0, i);
18883 if (matches[i][0] == maxelement)
18884 continue;
18885 x = copy_to_mode_reg (inner_mode, x);
18886 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18887 }
18888 return;
18889 }
18890
18891 /* Initialise a vector which is part-variable. We want to first try
18892 to build those lanes which are constant in the most efficient way we
18893 can. */
18894 if (n_var != n_elts)
18895 {
18896 rtx copy = copy_rtx (vals);
18897
18898 /* Load constant part of vector. We really don't care what goes into the
18899 parts we will overwrite, but we're more likely to be able to load the
18900 constant efficiently if it has fewer, larger, repeating parts
18901 (see aarch64_simd_valid_immediate). */
18902 for (int i = 0; i < n_elts; i++)
18903 {
18904 rtx x = XVECEXP (vals, 0, i);
18905 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18906 continue;
18907 rtx subst = any_const;
18908 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18909 {
18910 /* Look in the copied vector, as more elements are const. */
18911 rtx test = XVECEXP (copy, 0, i ^ bit);
18912 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18913 {
18914 subst = test;
18915 break;
18916 }
18917 }
18918 XVECEXP (copy, 0, i) = subst;
18919 }
18920 aarch64_expand_vector_init (target, copy);
18921 }
18922
18923 /* Insert the variable lanes directly. */
18924 for (int i = 0; i < n_elts; i++)
18925 {
18926 rtx x = XVECEXP (vals, 0, i);
18927 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18928 continue;
18929 x = copy_to_mode_reg (inner_mode, x);
18930 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18931 }
18932 }
18933
18934 /* Emit RTL corresponding to:
18935 insr TARGET, ELEM. */
18936
18937 static void
18938 emit_insr (rtx target, rtx elem)
18939 {
18940 machine_mode mode = GET_MODE (target);
18941 scalar_mode elem_mode = GET_MODE_INNER (mode);
18942 elem = force_reg (elem_mode, elem);
18943
18944 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18945 gcc_assert (icode != CODE_FOR_nothing);
18946 emit_insn (GEN_FCN (icode) (target, target, elem));
18947 }
18948
18949 /* Subroutine of aarch64_sve_expand_vector_init for handling
18950 trailing constants.
18951 This function works as follows:
18952 (a) Create a new vector consisting of trailing constants.
18953 (b) Initialize TARGET with the constant vector using emit_move_insn.
18954 (c) Insert remaining elements in TARGET using insr.
18955 NELTS is the total number of elements in the original vector, while
18956 NELTS_REQD is the number of elements that are actually
18957 significant.
18958
18959 ??? The heuristic used is to do the above only if the number of constants
18960 is at least half the total number of elements. May need fine-tuning. */
18961
18962 static bool
18963 aarch64_sve_expand_vector_init_handle_trailing_constants
18964 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18965 {
18966 machine_mode mode = GET_MODE (target);
18967 scalar_mode elem_mode = GET_MODE_INNER (mode);
18968 int n_trailing_constants = 0;
18969
18970 for (int i = nelts_reqd - 1;
18971 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
18972 i--)
18973 n_trailing_constants++;
18974
18975 if (n_trailing_constants >= nelts_reqd / 2)
18976 {
18977 /* Try to use the natural pattern of BUILDER to extend the trailing
18978 constant elements to a full vector. Replace any variables in the
18979 extra elements with zeros.
18980
18981 ??? It would be better if the builders supported "don't care"
18982 elements, with the builder filling in whichever elements
18983 give the most compact encoding. */
18984 rtx_vector_builder v (mode, nelts, 1);
18985 for (int i = 0; i < nelts; i++)
18986 {
18987 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18988 if (!valid_for_const_vector_p (elem_mode, x))
18989 x = const0_rtx;
18990 v.quick_push (x);
18991 }
18992 rtx const_vec = v.build ();
18993 emit_move_insn (target, const_vec);
18994
18995 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18996 emit_insr (target, builder.elt (i));
18997
18998 return true;
18999 }
19000
19001 return false;
19002 }
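/* E.g. for NELTS_REQD == 4 and elements {a, b, 1, 2}: the two trailing
   constants meet the threshold, so TARGET is first loaded with a constant
   vector whose leading elements are {1, 2, ...} (variable elements padded
   with zeros), after which 'b' and then 'a' are inserted with INSR, leaving
   {a, b, 1, 2} in the significant lanes.  */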
19003
19004 /* Subroutine of aarch64_sve_expand_vector_init.
19005 Works as follows:
19006 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
19007 (b) Skip trailing elements from BUILDER, which are the same as
19008 element NELTS_REQD - 1.
19009 (c) Insert earlier elements in reverse order in TARGET using insr. */
19010
19011 static void
19012 aarch64_sve_expand_vector_init_insert_elems (rtx target,
19013 const rtx_vector_builder &builder,
19014 int nelts_reqd)
19015 {
19016 machine_mode mode = GET_MODE (target);
19017 scalar_mode elem_mode = GET_MODE_INNER (mode);
19018
19019 struct expand_operand ops[2];
19020 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
19021 gcc_assert (icode != CODE_FOR_nothing);
19022
19023 create_output_operand (&ops[0], target, mode);
19024 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
19025 expand_insn (icode, 2, ops);
19026
19027 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19028 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
19029 emit_insr (target, builder.elt (i));
19030 }
19031
19032 /* Subroutine of aarch64_sve_expand_vector_init to handle case
19033 when all trailing elements of builder are same.
19034 This works as follows:
19035 (a) Use expand_insn interface to broadcast last vector element in TARGET.
19036 (b) Insert remaining elements in TARGET using insr.
19037
19038 ??? The heuristic used is to do the above if the number of identical trailing
19039 elements is at least 3/4 of the total number of elements, loosely based on
19040 the heuristic from mostly_zeros_p. May need fine-tuning. */
19041
19042 static bool
19043 aarch64_sve_expand_vector_init_handle_trailing_same_elem
19044 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
19045 {
19046 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19047 if (ndups >= (3 * nelts_reqd) / 4)
19048 {
19049 aarch64_sve_expand_vector_init_insert_elems (target, builder,
19050 nelts_reqd - ndups + 1);
19051 return true;
19052 }
19053
19054 return false;
19055 }
19056
19057 /* Initialize register TARGET from BUILDER. NELTS is the constant number
19058 of elements in BUILDER.
19059
19060 The function tries to initialize TARGET from BUILDER if it fits one
19061 of the special cases outlined below.
19062
19063 Failing that, the function divides BUILDER into two sub-vectors:
19064 v_even = even elements of BUILDER;
19065 v_odd = odd elements of BUILDER;
19066
19067 and recursively calls itself with v_even and v_odd.
19068
19069 if (recursive call succeeded for v_even or v_odd)
19070 TARGET = zip (v_even, v_odd)
19071
19072 The function returns true if it managed to build TARGET from BUILDER
19073 with one of the special cases, false otherwise.
19074
19075 Example: {a, 1, b, 2, c, 3, d, 4}
19076
19077 The vector gets divided into:
19078 v_even = {a, b, c, d}
19079 v_odd = {1, 2, 3, 4}
19080
19081 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
19082 initializes tmp2 from the constant vector v_odd using emit_move_insn.
19083
19084 aarch64_sve_expand_vector_init(v_even) fails since v_even contains only
19085 4 elements, so we construct tmp1 from v_even using insr:
19086 tmp1 = dup(d)
19087 insr tmp1, c
19088 insr tmp1, b
19089 insr tmp1, a
19090
19091 And finally:
19092 TARGET = zip (tmp1, tmp2)
19093 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
19094
19095 static bool
19096 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
19097 int nelts, int nelts_reqd)
19098 {
19099 machine_mode mode = GET_MODE (target);
19100
19101 /* Case 1: Vector contains trailing constants. */
19102
19103 if (aarch64_sve_expand_vector_init_handle_trailing_constants
19104 (target, builder, nelts, nelts_reqd))
19105 return true;
19106
19107 /* Case 2: Vector contains leading constants. */
19108
19109 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
19110 for (int i = 0; i < nelts_reqd; i++)
19111 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
19112 rev_builder.finalize ();
19113
19114 if (aarch64_sve_expand_vector_init_handle_trailing_constants
19115 (target, rev_builder, nelts, nelts_reqd))
19116 {
19117 emit_insn (gen_aarch64_sve_rev (mode, target, target));
19118 return true;
19119 }
19120
19121 /* Case 3: Vector contains trailing same element. */
19122
19123 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19124 (target, builder, nelts_reqd))
19125 return true;
19126
19127 /* Case 4: Vector contains leading same element. */
19128
19129 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19130 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
19131 {
19132 emit_insn (gen_aarch64_sve_rev (mode, target, target));
19133 return true;
19134 }
19135
19136 /* Avoid recursing below 4 elements.
19137 ??? The threshold of 4 may need fine-tuning. */
19138
19139 if (nelts_reqd <= 4)
19140 return false;
19141
19142 rtx_vector_builder v_even (mode, nelts, 1);
19143 rtx_vector_builder v_odd (mode, nelts, 1);
19144
19145 for (int i = 0; i < nelts * 2; i += 2)
19146 {
19147 v_even.quick_push (builder.elt (i));
19148 v_odd.quick_push (builder.elt (i + 1));
19149 }
19150
19151 v_even.finalize ();
19152 v_odd.finalize ();
19153
19154 rtx tmp1 = gen_reg_rtx (mode);
19155 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
19156 nelts, nelts_reqd / 2);
19157
19158 rtx tmp2 = gen_reg_rtx (mode);
19159 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
19160 nelts, nelts_reqd / 2);
19161
19162 if (!did_even_p && !did_odd_p)
19163 return false;
19164
19165 /* Initialize whichever of v_even and v_odd didn't match one of the
19166 special cases using INSR, then zip them together into TARGET. */
19167
19168 if (!did_even_p)
19169 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
19170
19171 if (!did_odd_p)
19172 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
19173
19174 rtvec v = gen_rtvec (2, tmp1, tmp2);
19175 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
19176 return true;
19177 }
19178
19179 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
19180
19181 void
19182 aarch64_sve_expand_vector_init (rtx target, rtx vals)
19183 {
19184 machine_mode mode = GET_MODE (target);
19185 int nelts = XVECLEN (vals, 0);
19186
19187 rtx_vector_builder v (mode, nelts, 1);
19188 for (int i = 0; i < nelts; i++)
19189 v.quick_push (XVECEXP (vals, 0, i));
19190 v.finalize ();
19191
19192 /* If v could not be initialized using one of the special cases, then
19193 fall back to using INSR to insert all elements of v into TARGET.
19194 ??? This might not be optimal for vectors with large
19195 initializers of 16 elements or more.
19196 For nelts < 4, it probably isn't worth handling specially. */
19197
19198 if (nelts < 4
19199 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
19200 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
19201 }
19202
19203 /* Check whether VALUE is a vector constant in which every element
19204 is either a power of 2 or a negated power of 2. If so, return
19205 a constant vector of log2s, and flip CODE between PLUS and MINUS
19206 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
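/* For example, { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with CODE left
   unchanged, while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } with CODE
   flipped between PLUS and MINUS.  */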
19207
19208 static rtx
19209 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
19210 {
19211 if (GET_CODE (value) != CONST_VECTOR)
19212 return NULL_RTX;
19213
19214 rtx_vector_builder builder;
19215 if (!builder.new_unary_operation (GET_MODE (value), value, false))
19216 return NULL_RTX;
19217
19218 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
19219 /* 1 if the result of the multiplication must be negated,
19220 0 if it mustn't, or -1 if we don't yet care. */
19221 int negate = -1;
19222 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
19223 for (unsigned int i = 0; i < encoded_nelts; ++i)
19224 {
19225 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
19226 if (!CONST_SCALAR_INT_P (elt))
19227 return NULL_RTX;
19228 rtx_mode_t val (elt, int_mode);
19229 wide_int pow2 = wi::neg (val);
19230 if (val != pow2)
19231 {
19232 /* It matters whether we negate or not. Make that choice,
19233 and make sure that it's consistent with previous elements. */
19234 if (negate == !wi::neg_p (val))
19235 return NULL_RTX;
19236 negate = wi::neg_p (val);
19237 if (!negate)
19238 pow2 = val;
19239 }
19240 /* POW2 is now the value that we want to be a power of 2. */
19241 int shift = wi::exact_log2 (pow2);
19242 if (shift < 0)
19243 return NULL_RTX;
19244 builder.quick_push (gen_int_mode (shift, int_mode));
19245 }
19246 if (negate == -1)
19247 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
19248 code = PLUS;
19249 else if (negate == 1)
19250 code = code == PLUS ? MINUS : PLUS;
19251 return builder.build ();
19252 }
19253
19254 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19255 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
19256 operands array, in the same order as for fma_optab. Return true if
19257 the function emitted all the necessary instructions, false if the caller
19258 should generate the pattern normally with the new OPERANDS array. */
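/* For example, with CODE == PLUS and OPERANDS[2] equal to { 4, ..., 4 },
   the multiply-add OPERANDS[1] * OPERANDS[2] + OPERANDS[3] is emitted here
   as a shift followed by an add:

     tmp = OPERANDS[1] << 2
     OPERANDS[0] = OPERANDS[3] + tmp

   instead of using a multiply.  */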
19259
19260 bool
19261 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19262 {
19263 machine_mode mode = GET_MODE (operands[0]);
19264 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19265 {
19266 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19267 NULL_RTX, true, OPTAB_DIRECT);
19268 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19269 operands[3], product, operands[0], true,
19270 OPTAB_DIRECT);
19271 return true;
19272 }
19273 operands[2] = force_reg (mode, operands[2]);
19274 return false;
19275 }
19276
19277 /* Likewise, but for a conditional pattern. */
19278
19279 bool
19280 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19281 {
19282 machine_mode mode = GET_MODE (operands[0]);
19283 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19284 {
19285 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19286 NULL_RTX, true, OPTAB_DIRECT);
19287 emit_insn (gen_cond (code, mode, operands[0], operands[1],
19288 operands[4], product, operands[5]));
19289 return true;
19290 }
19291 operands[3] = force_reg (mode, operands[3]);
19292 return false;
19293 }
19294
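/* Implement TARGET_SHIFT_TRUNCATION_MASK (assumed from the function name).
   Return 0 -- i.e. promise nothing about out-of-range shift amounts -- when
   SHIFT_COUNT_TRUNCATED is false or MODE is a vector data mode; otherwise
   return the element size minus one.  */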
19295 static unsigned HOST_WIDE_INT
19296 aarch64_shift_truncation_mask (machine_mode mode)
19297 {
19298 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19299 return 0;
19300 return GET_MODE_UNIT_BITSIZE (mode) - 1;
19301 }
19302
19303 /* Select a format to encode pointers in exception handling data. */
19304 int
19305 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19306 {
19307 int type;
19308 switch (aarch64_cmodel)
19309 {
19310 case AARCH64_CMODEL_TINY:
19311 case AARCH64_CMODEL_TINY_PIC:
19312 case AARCH64_CMODEL_SMALL:
19313 case AARCH64_CMODEL_SMALL_PIC:
19314 case AARCH64_CMODEL_SMALL_SPIC:
19315 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
19316 for everything. */
19317 type = DW_EH_PE_sdata4;
19318 break;
19319 default:
19320 /* No assumptions here. 8-byte relocs required. */
19321 type = DW_EH_PE_sdata8;
19322 break;
19323 }
19324 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19325 }
19326
19327 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
19328
19329 static void
19330 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19331 {
19332 if (TREE_CODE (decl) == FUNCTION_DECL)
19333 {
19334 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19335 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19336 {
19337 fprintf (stream, "\t.variant_pcs\t");
19338 assemble_name (stream, name);
19339 fprintf (stream, "\n");
19340 }
19341 }
19342 }
19343
19344 /* The last .arch and .tune assembly strings that we printed. */
19345 static std::string aarch64_last_printed_arch_string;
19346 static std::string aarch64_last_printed_tune_string;
19347
19348 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
19349 by the function fndecl. */
19350
19351 void
19352 aarch64_declare_function_name (FILE *stream, const char* name,
19353 tree fndecl)
19354 {
19355 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19356
19357 struct cl_target_option *targ_options;
19358 if (target_parts)
19359 targ_options = TREE_TARGET_OPTION (target_parts);
19360 else
19361 targ_options = TREE_TARGET_OPTION (target_option_current_node);
19362 gcc_assert (targ_options);
19363
19364 const struct processor *this_arch
19365 = aarch64_get_arch (targ_options->x_explicit_arch);
19366
19367 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
19368 std::string extension
19369 = aarch64_get_extension_string_for_isa_flags (isa_flags,
19370 this_arch->flags);
19371 /* Only update the assembler .arch string if it is distinct from the last
19372 such string we printed. */
19373 std::string to_print = this_arch->name + extension;
19374 if (to_print != aarch64_last_printed_arch_string)
19375 {
19376 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19377 aarch64_last_printed_arch_string = to_print;
19378 }
19379
19380 /* Print the CPU name we're tuning for in the comments; it might be
19381 useful to readers of the generated asm. Do it only when it changes
19382 from function to function and verbose assembly is requested. */
19383 const struct processor *this_tune
19384 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19385
19386 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19387 {
19388 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19389 this_tune->name);
19390 aarch64_last_printed_tune_string = this_tune->name;
19391 }
19392
19393 aarch64_asm_output_variant_pcs (stream, fndecl, name);
19394
19395 /* Don't forget the type directive for ELF. */
19396 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19397 ASM_OUTPUT_LABEL (stream, name);
19398
19399 cfun->machine->label_is_assembled = true;
19400 }
19401
19402 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
19403 the function label and emit a BTI if necessary. */
19404
19405 void
19406 aarch64_print_patchable_function_entry (FILE *file,
19407 unsigned HOST_WIDE_INT patch_area_size,
19408 bool record_p)
19409 {
19410 if (cfun->machine->label_is_assembled
19411 && aarch64_bti_enabled ()
19412 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19413 {
19414 /* Remove the BTI that follows the patch area and insert a new BTI
19415 before the patch area right after the function label. */
19416 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19417 if (insn
19418 && INSN_P (insn)
19419 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19420 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19421 delete_insn (insn);
19422 asm_fprintf (file, "\thint\t34 // bti c\n");
19423 }
19424
19425 default_print_patchable_function_entry (file, patch_area_size, record_p);
19426 }
19427
19428 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
19429
19430 void
19431 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19432 {
19433 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19434 const char *value = IDENTIFIER_POINTER (target);
19435 aarch64_asm_output_variant_pcs (stream, decl, name);
19436 ASM_OUTPUT_DEF (stream, name, value);
19437 }
19438
19439 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
19440 function symbol references. */
19441
19442 void
19443 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
19444 {
19445 default_elf_asm_output_external (stream, decl, name);
19446 aarch64_asm_output_variant_pcs (stream, decl, name);
19447 }
19448
19449 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19450 Used to output the .cfi_b_key_frame directive when signing the current
19451 function with the B key. */
19452
19453 void
19454 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19455 {
19456 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
19457 && aarch64_ra_sign_key == AARCH64_KEY_B)
19458 asm_fprintf (f, "\t.cfi_b_key_frame\n");
19459 }
19460
19461 /* Implement TARGET_ASM_FILE_START. Output the assembly header. */
19462
19463 static void
19464 aarch64_start_file (void)
19465 {
19466 struct cl_target_option *default_options
19467 = TREE_TARGET_OPTION (target_option_default_node);
19468
19469 const struct processor *default_arch
19470 = aarch64_get_arch (default_options->x_explicit_arch);
19471 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
19472 std::string extension
19473 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19474 default_arch->flags);
19475
19476 aarch64_last_printed_arch_string = default_arch->name + extension;
19477 aarch64_last_printed_tune_string = "";
19478 asm_fprintf (asm_out_file, "\t.arch %s\n",
19479 aarch64_last_printed_arch_string.c_str ());
19480
19481 default_file_start ();
19482 }
19483
19484 /* Emit load exclusive. */
19485
19486 static void
19487 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
19488 rtx mem, rtx model_rtx)
19489 {
19490 if (mode == TImode)
19491 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19492 gen_highpart (DImode, rval),
19493 mem, model_rtx));
19494 else
19495 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
19496 }
19497
19498 /* Emit store exclusive. */
19499
19500 static void
19501 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
19502 rtx mem, rtx rval, rtx model_rtx)
19503 {
19504 if (mode == TImode)
19505 emit_insn (gen_aarch64_store_exclusive_pair
19506 (bval, mem, operand_subword (rval, 0, 0, TImode),
19507 operand_subword (rval, 1, 0, TImode), model_rtx));
19508 else
19509 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
19510 }
19511
19512 /* Emit jump instruction INSN and mark it as very unlikely to be taken. */
19513
19514 static void
19515 aarch64_emit_unlikely_jump (rtx insn)
19516 {
19517 rtx_insn *jump = emit_jump_insn (insn);
19518 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
19519 }
19520
19521 /* We store the names of the various atomic helpers in a 5x4 array.
19522 Return the libcall function given MODE, MODEL and NAMES. */
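/* For example, an SImode operation with MEMMODEL_ACQUIRE maps to
   mode_idx 2 and model_idx 1, so for the CAS helpers below this returns
   the libfunc for "__aarch64_cas4_acq".  */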
19523
19524 rtx
19525 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
19526 const atomic_ool_names *names)
19527 {
19528 memmodel model = memmodel_base (INTVAL (model_rtx));
19529 int mode_idx, model_idx;
19530
19531 switch (mode)
19532 {
19533 case E_QImode:
19534 mode_idx = 0;
19535 break;
19536 case E_HImode:
19537 mode_idx = 1;
19538 break;
19539 case E_SImode:
19540 mode_idx = 2;
19541 break;
19542 case E_DImode:
19543 mode_idx = 3;
19544 break;
19545 case E_TImode:
19546 mode_idx = 4;
19547 break;
19548 default:
19549 gcc_unreachable ();
19550 }
19551
19552 switch (model)
19553 {
19554 case MEMMODEL_RELAXED:
19555 model_idx = 0;
19556 break;
19557 case MEMMODEL_CONSUME:
19558 case MEMMODEL_ACQUIRE:
19559 model_idx = 1;
19560 break;
19561 case MEMMODEL_RELEASE:
19562 model_idx = 2;
19563 break;
19564 case MEMMODEL_ACQ_REL:
19565 case MEMMODEL_SEQ_CST:
19566 model_idx = 3;
19567 break;
19568 default:
19569 gcc_unreachable ();
19570 }
19571
19572 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19573 VISIBILITY_HIDDEN);
19574 }
19575
19576 #define DEF0(B, N) \
19577 { "__aarch64_" #B #N "_relax", \
19578 "__aarch64_" #B #N "_acq", \
19579 "__aarch64_" #B #N "_rel", \
19580 "__aarch64_" #B #N "_acq_rel" }
19581
19582 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19583 { NULL, NULL, NULL, NULL }
19584 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19585
19586 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19587 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19588 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19589 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19590 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19591 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19592
19593 #undef DEF0
19594 #undef DEF4
19595 #undef DEF5
19596
19597 /* Expand a compare and swap pattern. */
19598
19599 void
19600 aarch64_expand_compare_and_swap (rtx operands[])
19601 {
19602 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19603 machine_mode mode, r_mode;
19604
19605 bval = operands[0];
19606 rval = operands[1];
19607 mem = operands[2];
19608 oldval = operands[3];
19609 newval = operands[4];
19610 is_weak = operands[5];
19611 mod_s = operands[6];
19612 mod_f = operands[7];
19613 mode = GET_MODE (mem);
19614
19615 /* Normally the succ memory model must be stronger than fail, but in the
19616 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19617 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
19618 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19619 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19620 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19621
19622 r_mode = mode;
19623 if (mode == QImode || mode == HImode)
19624 {
19625 r_mode = SImode;
19626 rval = gen_reg_rtx (r_mode);
19627 }
19628
19629 if (TARGET_LSE)
19630 {
19631 /* The CAS insn requires that oldval and rval overlap, but we need to
19632 have a copy of oldval saved across the operation to tell if
19633 the operation is successful. */
19634 if (reg_overlap_mentioned_p (rval, oldval))
19635 rval = copy_to_mode_reg (r_mode, oldval);
19636 else
19637 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19638
19639 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19640 newval, mod_s));
19641 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19642 }
19643 else if (TARGET_OUTLINE_ATOMICS)
19644 {
19645 /* OLDVAL must satisfy the comparison done afterwards. */
19646 if (!aarch64_plus_operand (oldval, mode))
19647 oldval = force_reg (mode, oldval);
19648 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19649 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19650 oldval, mode, newval, mode,
19651 XEXP (mem, 0), Pmode);
19652 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19653 }
19654 else
19655 {
19656 /* The oldval predicate varies by mode. Test it and force to reg. */
19657 insn_code code = code_for_aarch64_compare_and_swap (mode);
19658 if (!insn_data[code].operand[2].predicate (oldval, mode))
19659 oldval = force_reg (mode, oldval);
19660
19661 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19662 is_weak, mod_s, mod_f));
19663 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19664 }
19665
19666 if (r_mode != mode)
19667 rval = gen_lowpart (mode, rval);
19668 emit_move_insn (operands[1], rval);
19669
19670 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19671 emit_insn (gen_rtx_SET (bval, x));
19672 }
19673
19674 /* Emit a barrier appropriate for memory model MODEL at the end of a
19675 sequence implementing an atomic operation. */
19676
19677 static void
19678 aarch64_emit_post_barrier (enum memmodel model)
19679 {
19680 const enum memmodel base_model = memmodel_base (model);
19681
19682 if (is_mm_sync (model)
19683 && (base_model == MEMMODEL_ACQUIRE
19684 || base_model == MEMMODEL_ACQ_REL
19685 || base_model == MEMMODEL_SEQ_CST))
19686 {
19687 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19688 }
19689 }
19690
19691 /* Split a compare and swap pattern. */
19692
19693 void
19694 aarch64_split_compare_and_swap (rtx operands[])
19695 {
19696 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19697 gcc_assert (epilogue_completed);
19698
19699 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19700 machine_mode mode;
19701 bool is_weak;
19702 rtx_code_label *label1, *label2;
19703 enum memmodel model;
19704
19705 rval = operands[0];
19706 mem = operands[1];
19707 oldval = operands[2];
19708 newval = operands[3];
19709 is_weak = (operands[4] != const0_rtx);
19710 model_rtx = operands[5];
19711 scratch = operands[7];
19712 mode = GET_MODE (mem);
19713 model = memmodel_from_int (INTVAL (model_rtx));
19714
19715 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19716 loop:
19717 .label1:
19718 LD[A]XR rval, [mem]
19719 CBNZ rval, .label2
19720 ST[L]XR scratch, newval, [mem]
19721 CBNZ scratch, .label1
19722 .label2:
19723 CMP rval, 0. */
19724 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
19725 && oldval == const0_rtx && mode != TImode);
19726
19727 label1 = NULL;
19728 if (!is_weak)
19729 {
19730 label1 = gen_label_rtx ();
19731 emit_label (label1);
19732 }
19733 label2 = gen_label_rtx ();
19734
19735 /* The initial load can be relaxed for a __sync operation since a final
19736 barrier will be emitted to stop code hoisting. */
19737 if (is_mm_sync (model))
19738 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19739 else
19740 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19741
19742 if (strong_zero_p)
19743 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19744 else
19745 {
19746 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19747 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19748 }
19749 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19750 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19751 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19752
19753 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19754
19755 if (!is_weak)
19756 {
19757 if (aarch64_track_speculation)
19758 {
19759 /* Emit an explicit compare instruction, so that we can correctly
19760 track the condition codes. */
19761 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19762 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19763 }
19764 else
19765 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19766
19767 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19768 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19769 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19770 }
19771 else
19772 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19773
19774 emit_label (label2);
19775
19776 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
19777 to set the condition flags. If this is not used, it will be removed by
19778 later passes. */
19779 if (strong_zero_p)
19780 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19781
19782 /* Emit any final barrier needed for a __sync operation. */
19783 if (is_mm_sync (model))
19784 aarch64_emit_post_barrier (model);
19785 }
19786
19787 /* Split an atomic operation. */
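/* The emitted sequence is roughly (for a fetch-and-<op>):

     .label:
	LD[A]XR	old, [mem]
	<op>	new, old, value
	ST[L]XR	cond, new, [mem]
	CBNZ	cond, .label

   followed, for __sync operations, by a final barrier.  */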
19788
19789 void
19790 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19791 rtx value, rtx model_rtx, rtx cond)
19792 {
19793 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19794 gcc_assert (epilogue_completed);
19795
19796 machine_mode mode = GET_MODE (mem);
19797 machine_mode wmode = (mode == DImode ? DImode : SImode);
19798 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19799 const bool is_sync = is_mm_sync (model);
19800 rtx_code_label *label;
19801 rtx x;
19802
19803 /* Split the atomic operation into a sequence. */
19804 label = gen_label_rtx ();
19805 emit_label (label);
19806
19807 if (new_out)
19808 new_out = gen_lowpart (wmode, new_out);
19809 if (old_out)
19810 old_out = gen_lowpart (wmode, old_out);
19811 else
19812 old_out = new_out;
19813 value = simplify_gen_subreg (wmode, value, mode, 0);
19814
19815 /* The initial load can be relaxed for a __sync operation since a final
19816 barrier will be emitted to stop code hoisting. */
19817 if (is_sync)
19818 aarch64_emit_load_exclusive (mode, old_out, mem,
19819 GEN_INT (MEMMODEL_RELAXED));
19820 else
19821 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19822
19823 switch (code)
19824 {
19825 case SET:
19826 new_out = value;
19827 break;
19828
19829 case NOT:
19830 x = gen_rtx_AND (wmode, old_out, value);
19831 emit_insn (gen_rtx_SET (new_out, x));
19832 x = gen_rtx_NOT (wmode, new_out);
19833 emit_insn (gen_rtx_SET (new_out, x));
19834 break;
19835
19836 case MINUS:
19837 if (CONST_INT_P (value))
19838 {
19839 value = GEN_INT (-INTVAL (value));
19840 code = PLUS;
19841 }
19842 /* Fall through. */
19843
19844 default:
19845 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19846 emit_insn (gen_rtx_SET (new_out, x));
19847 break;
19848 }
19849
19850 aarch64_emit_store_exclusive (mode, cond, mem,
19851 gen_lowpart (mode, new_out), model_rtx);
19852
19853 if (aarch64_track_speculation)
19854 {
19855 /* Emit an explicit compare instruction, so that we can correctly
19856 track the condition codes. */
19857 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19858 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19859 }
19860 else
19861 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19862
19863 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19864 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19865 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19866
19867 /* Emit any final barrier needed for a __sync operation. */
19868 if (is_sync)
19869 aarch64_emit_post_barrier (model);
19870 }
19871
19872 static void
19873 aarch64_init_libfuncs (void)
19874 {
19875 /* Half-precision float operations. The compiler handles all operations
19876 with NULL libfuncs by converting to SFmode. */
19877
19878 /* Conversions. */
19879 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19880 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19881
19882 /* Arithmetic. */
19883 set_optab_libfunc (add_optab, HFmode, NULL);
19884 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19885 set_optab_libfunc (smul_optab, HFmode, NULL);
19886 set_optab_libfunc (neg_optab, HFmode, NULL);
19887 set_optab_libfunc (sub_optab, HFmode, NULL);
19888
19889 /* Comparisons. */
19890 set_optab_libfunc (eq_optab, HFmode, NULL);
19891 set_optab_libfunc (ne_optab, HFmode, NULL);
19892 set_optab_libfunc (lt_optab, HFmode, NULL);
19893 set_optab_libfunc (le_optab, HFmode, NULL);
19894 set_optab_libfunc (ge_optab, HFmode, NULL);
19895 set_optab_libfunc (gt_optab, HFmode, NULL);
19896 set_optab_libfunc (unord_optab, HFmode, NULL);
19897 }
19898
19899 /* Target hook for c_mode_for_suffix. */
19900 static machine_mode
19901 aarch64_c_mode_for_suffix (char suffix)
19902 {
19903 if (suffix == 'q')
19904 return TFmode;
19905
19906 return VOIDmode;
19907 }
19908
19909 /* We can only represent floating point constants which will fit in
19910 "quarter-precision" values. These values are characterised by
19911 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
19912 by:
19913
19914 (-1)^s * (n/16) * 2^r
19915
19916 Where:
19917 's' is the sign bit.
19918 'n' is an integer in the range 16 <= n <= 31.
19919 'r' is an integer in the range -3 <= r <= 4. */
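/* For example, 0.25 = (16/16) * 2^-2 and 29.0 = (29/16) * 2^4 are
   representable, whereas 0.1 is not.  */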
19920
19921 /* Return true iff X can be represented as a quarter-precision
19922 floating point immediate operand. Note that we cannot represent 0.0. */
19923 bool
19924 aarch64_float_const_representable_p (rtx x)
19925 {
19926 /* This represents our current view of how many bits
19927 make up the mantissa. */
19928 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19929 int exponent;
19930 unsigned HOST_WIDE_INT mantissa, mask;
19931 REAL_VALUE_TYPE r, m;
19932 bool fail;
19933
19934 x = unwrap_const_vec_duplicate (x);
19935 if (!CONST_DOUBLE_P (x))
19936 return false;
19937
19938 if (GET_MODE (x) == VOIDmode
19939 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19940 return false;
19941
19942 r = *CONST_DOUBLE_REAL_VALUE (x);
19943
19944 /* We cannot represent infinities, NaNs or +/-zero. We won't
19945 know if we have +zero until we analyse the mantissa, but we
19946 can reject the other invalid values. */
19947 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19948 || REAL_VALUE_MINUS_ZERO (r))
19949 return false;
19950
19951 /* Extract exponent. */
19952 r = real_value_abs (&r);
19953 exponent = REAL_EXP (&r);
19954
19955 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
19956 highest (sign) bit, with a fixed binary point at bit point_pos.
19957 The low half of W holds the low part of the mantissa and the high half
19958 the high part. WARNING: If we ever have a representation using more than
19959 2 * H_W_I - 1 bits for the mantissa, this can fail (low bits will be lost). */
19960 real_ldexp (&m, &r, point_pos - exponent);
19961 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19962
19963 /* If the low part of the mantissa has bits set we cannot represent
19964 the value. */
19965 if (w.ulow () != 0)
19966 return false;
19967 /* We have rejected the lower HOST_WIDE_INT, so update our
19968 understanding of how many bits lie in the mantissa and
19969 look only at the high HOST_WIDE_INT. */
19970 mantissa = w.elt (1);
19971 point_pos -= HOST_BITS_PER_WIDE_INT;
19972
19973 /* We can only represent values with a mantissa of the form 1.xxxx. */
19974 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19975 if ((mantissa & mask) != 0)
19976 return false;
19977
19978 /* Having filtered unrepresentable values, we may now remove all
19979 but the highest 5 bits. */
19980 mantissa >>= point_pos - 5;
19981
19982 /* We cannot represent the value 0.0, so reject it. This is handled
19983 elsewhere. */
19984 if (mantissa == 0)
19985 return false;
19986
19987 /* Then, as bit 4 is always set, we can mask it off, leaving
19988 the mantissa in the range [0, 15]. */
19989 mantissa &= ~(1 << 4);
19990 gcc_assert (mantissa <= 15);
19991
19992 /* GCC internally does not use IEEE754-like encoding (where normalized
19993 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
19994 Our mantissa values are shifted 4 places to the left relative to
19995 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19996 by 5 places to correct for GCC's representation. */
19997 exponent = 5 - exponent;
19998
19999 return (exponent >= 0 && exponent <= 7);
20000 }
20001
20002 /* Return the output template for moving CONST_VECTOR, an AdvSIMD vector of
20003 WIDTH bits, into a register using a MOVI, MVNI, ORR or BIC immediate.
20004 WHICH selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
20005 char*
20006 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
20007 enum simd_immediate_check which)
20008 {
20009 bool is_valid;
20010 static char templ[40];
20011 const char *mnemonic;
20012 const char *shift_op;
20013 unsigned int lane_count = 0;
20014 char element_char;
20015
20016 struct simd_immediate_info info;
20017
20018 /* This will return true to show const_vector is legal for use as either
20019 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
20020 It will also update INFO to show how the immediate should be generated.
20021 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
20022 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
20023 gcc_assert (is_valid);
20024
20025 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20026 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
20027
20028 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20029 {
20030 gcc_assert (info.insn == simd_immediate_info::MOV
20031 && info.u.mov.shift == 0);
20032 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
20033 move immediate path. */
20034 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20035 info.u.mov.value = GEN_INT (0);
20036 else
20037 {
20038 const unsigned int buf_size = 20;
20039 char float_buf[buf_size] = {'\0'};
20040 real_to_decimal_for_mode (float_buf,
20041 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
20042 buf_size, buf_size, 1, info.elt_mode);
20043
20044 if (lane_count == 1)
20045 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
20046 else
20047 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
20048 lane_count, element_char, float_buf);
20049 return templ;
20050 }
20051 }
20052
20053 gcc_assert (CONST_INT_P (info.u.mov.value));
20054
20055 if (which == AARCH64_CHECK_MOV)
20056 {
20057 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
20058 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
20059 ? "msl" : "lsl");
20060 if (lane_count == 1)
20061 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
20062 mnemonic, UINTVAL (info.u.mov.value));
20063 else if (info.u.mov.shift)
20064 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20065 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
20066 element_char, UINTVAL (info.u.mov.value), shift_op,
20067 info.u.mov.shift);
20068 else
20069 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20070 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
20071 element_char, UINTVAL (info.u.mov.value));
20072 }
20073 else
20074 {
20075 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
20076 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
20077 if (info.u.mov.shift)
20078 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20079 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
20080 element_char, UINTVAL (info.u.mov.value), "lsl",
20081 info.u.mov.shift);
20082 else
20083 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20084 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
20085 element_char, UINTVAL (info.u.mov.value));
20086 }
20087 return templ;
20088 }
20089
20090 char*
20091 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
20092 {
20093
20094 /* If a floating point number was passed and we want to use it in an
20095 integer mode, do the conversion to integer. */
20096 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
20097 {
20098 unsigned HOST_WIDE_INT ival;
20099 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
20100 gcc_unreachable ();
20101 immediate = gen_int_mode (ival, mode);
20102 }
20103
20104 machine_mode vmode;
20105 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
20106 a 128-bit vector mode. */
20107 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
20108
20109 vmode = aarch64_simd_container_mode (mode, width);
20110 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
20111 return aarch64_output_simd_mov_immediate (v_op, width);
20112 }
20113
20114 /* Return the output string to use for moving immediate CONST_VECTOR
20115 into an SVE register. */
20116
20117 char *
20118 aarch64_output_sve_mov_immediate (rtx const_vector)
20119 {
20120 static char templ[40];
20121 struct simd_immediate_info info;
20122 char element_char;
20123
20124 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
20125 gcc_assert (is_valid);
20126
20127 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20128
20129 machine_mode vec_mode = GET_MODE (const_vector);
20130 if (aarch64_sve_pred_mode_p (vec_mode))
20131 {
20132 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
20133 if (info.insn == simd_immediate_info::MOV)
20134 {
20135 gcc_assert (info.u.mov.value == const0_rtx);
20136 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
20137 }
20138 else
20139 {
20140 gcc_assert (info.insn == simd_immediate_info::PTRUE);
20141 unsigned int total_bytes;
20142 if (info.u.pattern == AARCH64_SV_ALL
20143 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
20144 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
20145 total_bytes / GET_MODE_SIZE (info.elt_mode));
20146 else
20147 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
20148 svpattern_token (info.u.pattern));
20149 }
20150 return buf;
20151 }
20152
20153 if (info.insn == simd_immediate_info::INDEX)
20154 {
20155 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
20156 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
20157 element_char, INTVAL (info.u.index.base),
20158 INTVAL (info.u.index.step));
20159 return templ;
20160 }
20161
20162 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20163 {
20164 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20165 info.u.mov.value = GEN_INT (0);
20166 else
20167 {
20168 const int buf_size = 20;
20169 char float_buf[buf_size] = {};
20170 real_to_decimal_for_mode (float_buf,
20171 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
20172 buf_size, buf_size, 1, info.elt_mode);
20173
20174 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
20175 element_char, float_buf);
20176 return templ;
20177 }
20178 }
20179
20180 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
20181 element_char, INTVAL (info.u.mov.value));
20182 return templ;
20183 }
20184
20185 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
20186 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
20187 pattern. */
20188
20189 char *
20190 aarch64_output_sve_ptrues (rtx const_unspec)
20191 {
20192 static char templ[40];
20193
20194 struct simd_immediate_info info;
20195 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
20196 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
20197
20198 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20199 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
20200 svpattern_token (info.u.pattern));
20201 return templ;
20202 }
20203
20204 /* Split the assignment of OP[1] + OP[2] to OP[0] into individual register moves. */
20205
20206 void
20207 aarch64_split_combinev16qi (rtx operands[3])
20208 {
20209 unsigned int dest = REGNO (operands[0]);
20210 unsigned int src1 = REGNO (operands[1]);
20211 unsigned int src2 = REGNO (operands[2]);
20212 machine_mode halfmode = GET_MODE (operands[1]);
20213 unsigned int halfregs = REG_NREGS (operands[1]);
20214 rtx destlo, desthi;
20215
20216 gcc_assert (halfmode == V16QImode);
20217
20218 if (src1 == dest && src2 == dest + halfregs)
20219 {
20220 /* No-op move. Can't split to nothing; emit something. */
20221 emit_note (NOTE_INSN_DELETED);
20222 return;
20223 }
20224
20225 /* Preserve register attributes for variable tracking. */
20226 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
20227 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
20228 GET_MODE_SIZE (halfmode));
20229
20230 /* Special case of reversed high/low parts; swap the two halves in place with three XORs. */
20231 if (reg_overlap_mentioned_p (operands[2], destlo)
20232 && reg_overlap_mentioned_p (operands[1], desthi))
20233 {
20234 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20235 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
20236 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20237 }
20238 else if (!reg_overlap_mentioned_p (operands[2], destlo))
20239 {
20240 /* Try to avoid unnecessary moves if part of the result
20241 is in the right place already. */
20242 if (src1 != dest)
20243 emit_move_insn (destlo, operands[1]);
20244 if (src2 != dest + halfregs)
20245 emit_move_insn (desthi, operands[2]);
20246 }
20247 else
20248 {
20249 if (src2 != dest + halfregs)
20250 emit_move_insn (desthi, operands[2]);
20251 if (src1 != dest)
20252 emit_move_insn (destlo, operands[1]);
20253 }
20254 }
20255
20256 /* vec_perm support. */
20257
20258 struct expand_vec_perm_d
20259 {
20260 rtx target, op0, op1;
20261 vec_perm_indices perm;
20262 machine_mode vmode;
20263 unsigned int vec_flags;
20264 bool one_vector_p;
20265 bool testing_p;
20266 };
20267
20268 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
20269
20270 /* Generate a variable permutation. */
20271
20272 static void
20273 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20274 {
20275 machine_mode vmode = GET_MODE (target);
20276 bool one_vector_p = rtx_equal_p (op0, op1);
20277
20278 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20279 gcc_checking_assert (GET_MODE (op0) == vmode);
20280 gcc_checking_assert (GET_MODE (op1) == vmode);
20281 gcc_checking_assert (GET_MODE (sel) == vmode);
20282 gcc_checking_assert (TARGET_SIMD);
20283
20284 if (one_vector_p)
20285 {
20286 if (vmode == V8QImode)
20287 {
20288 /* Expand the argument to a V16QI mode by duplicating it. */
20289 rtx pair = gen_reg_rtx (V16QImode);
20290 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20291 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20292 }
20293 else
20294 {
20295 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20296 }
20297 }
20298 else
20299 {
20300 rtx pair;
20301
20302 if (vmode == V8QImode)
20303 {
20304 pair = gen_reg_rtx (V16QImode);
20305 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20306 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20307 }
20308 else
20309 {
20310 pair = gen_reg_rtx (OImode);
20311 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20312 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20313 }
20314 }
20315 }
20316
20317 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20318 NELT is the number of elements in the vector. */
20319
20320 void
20321 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20322 unsigned int nelt)
20323 {
20324 machine_mode vmode = GET_MODE (target);
20325 bool one_vector_p = rtx_equal_p (op0, op1);
20326 rtx mask;
20327
20328 /* The TBL instruction does not use a modulo index, so we must take care
20329 of that ourselves. */
20330 mask = aarch64_simd_gen_const_vector_dup (vmode,
20331 one_vector_p ? nelt - 1 : 2 * nelt - 1);
20332 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20333
20334 /* For big-endian, we also need to reverse the index within the vector
20335 (but not which vector). */
20336 if (BYTES_BIG_ENDIAN)
20337 {
20338 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
20339 if (!one_vector_p)
20340 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20341 sel = expand_simple_binop (vmode, XOR, sel, mask,
20342 NULL, 0, OPTAB_LIB_WIDEN);
20343 }
20344 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
20345 }
20346
20347 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
20348
20349 static void
20350 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20351 {
20352 emit_insn (gen_rtx_SET (target,
20353 gen_rtx_UNSPEC (GET_MODE (target),
20354 gen_rtvec (2, op0, op1), code)));
20355 }
20356
20357 /* Expand an SVE vec_perm with the given operands. */
20358
20359 void
20360 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20361 {
20362 machine_mode data_mode = GET_MODE (target);
20363 machine_mode sel_mode = GET_MODE (sel);
20364 /* Enforced by the pattern condition. */
20365 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20366
20367 /* Note: vec_perm indices are supposed to wrap when they go beyond the
20368 size of the two value vectors, i.e. the upper bits of the indices
20369 are effectively ignored. SVE TBL instead produces 0 for any
20370 out-of-range indices, so we need to modulo all the vec_perm indices
20371 to ensure they are all in range. */
20372 rtx sel_reg = force_reg (sel_mode, sel);
20373
20374 /* Check if SEL only references the first value vector. */
20375 if (GET_CODE (sel) == CONST_VECTOR
20376 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20377 {
20378 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20379 return;
20380 }
20381
20382 /* Check if the two value vectors are the same. */
20383 if (rtx_equal_p (op0, op1))
20384 {
20385 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20386 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20387 NULL, 0, OPTAB_DIRECT);
20388 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20389 return;
20390 }
20391
20392 /* Run TBL for each value vector and combine the results. */
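/* For example, with four-element vectors and SEL = { 0, 5, 2, 7 }:

	res0 = TBL (op0, { 0, 5, 2, 7 })    -> { op0[0], 0, op0[2], 0 }
	res1 = TBL (op1, { -4, 1, -2, 3 })  -> { 0, op1[1], 0, op1[3] }
	target = res0 | res1

   which gives { op0[0], op1[1], op0[2], op1[3] }.  */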
20393
20394 rtx res0 = gen_reg_rtx (data_mode);
20395 rtx res1 = gen_reg_rtx (data_mode);
20396 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20397 if (GET_CODE (sel) != CONST_VECTOR
20398 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20399 {
20400 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20401 2 * nunits - 1);
20402 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20403 NULL, 0, OPTAB_DIRECT);
20404 }
20405 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20406 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20407 NULL, 0, OPTAB_DIRECT);
20408 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20409 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20410 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20411 else
20412 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20413 }
20414
20415 /* Recognize patterns suitable for the TRN instructions. */
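/* For example, with two 4-element vectors a TRN1 corresponds to the
   permutation { 0, 4, 2, 6 } and a TRN2 to { 1, 5, 3, 7 }.  */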
20416 static bool
20417 aarch64_evpc_trn (struct expand_vec_perm_d *d)
20418 {
20419 HOST_WIDE_INT odd;
20420 poly_uint64 nelt = d->perm.length ();
20421 rtx out, in0, in1, x;
20422 machine_mode vmode = d->vmode;
20423
20424 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20425 return false;
20426
20427 /* Note that these are little-endian tests.
20428 We correct for big-endian later. */
20429 if (!d->perm[0].is_constant (&odd)
20430 || (odd != 0 && odd != 1)
20431 || !d->perm.series_p (0, 2, odd, 2)
20432 || !d->perm.series_p (1, 2, nelt + odd, 2))
20433 return false;
20434
20435 /* Success! */
20436 if (d->testing_p)
20437 return true;
20438
20439 in0 = d->op0;
20440 in1 = d->op1;
20441 /* We don't need a big-endian lane correction for SVE; see the comment
20442 at the head of aarch64-sve.md for details. */
20443 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20444 {
20445 x = in0, in0 = in1, in1 = x;
20446 odd = !odd;
20447 }
20448 out = d->target;
20449
20450 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20451 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
20452 return true;
20453 }
20454
20455 /* Try to re-encode the PERM constant so that it combines adjacent even/odd
20456 element pairs. This rewrites constants such as {0, 1, 4, 5}/V4SF to
20457 {0, 2}/V2DI. We then retry the full suite of patterns with the new constant. */
20458 static bool
20459 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
20460 {
20461 expand_vec_perm_d newd;
20462 unsigned HOST_WIDE_INT nelt;
20463
20464 if (d->vec_flags != VEC_ADVSIMD)
20465 return false;
20466
20467 /* Get the new mode. Always twice the size of the inner
20468 and half the elements. */
20469 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
20470 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
20471 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
20472 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
20473
20474 if (new_mode == word_mode)
20475 return false;
20476
20477 /* to_constant is safe since this routine is specific to Advanced SIMD
20478 vectors. */
20479 nelt = d->perm.length ().to_constant ();
20480
20481 vec_perm_builder newpermconst;
20482 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
20483
20484 /* Convert the perm constant if we can. Require even, odd as the pairs. */
20485 for (unsigned int i = 0; i < nelt; i += 2)
20486 {
20487 poly_int64 elt0 = d->perm[i];
20488 poly_int64 elt1 = d->perm[i + 1];
20489 poly_int64 newelt;
20490 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
20491 return false;
20492 newpermconst.quick_push (newelt.to_constant ());
20493 }
20494 newpermconst.finalize ();
20495
20496 newd.vmode = new_mode;
20497 newd.vec_flags = VEC_ADVSIMD;
20498 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
20499 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
20500 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
20501 newd.testing_p = d->testing_p;
20502 newd.one_vector_p = d->one_vector_p;
20503
20504 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
20505 return aarch64_expand_vec_perm_const_1 (&newd);
20506 }
20507
20508 /* Recognize patterns suitable for the UZP instructions. */
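/* For example, with two 4-element vectors a UZP1 corresponds to the
   permutation { 0, 2, 4, 6 } and a UZP2 to { 1, 3, 5, 7 }.  */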
20509 static bool
20510 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20511 {
20512 HOST_WIDE_INT odd;
20513 rtx out, in0, in1, x;
20514 machine_mode vmode = d->vmode;
20515
20516 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20517 return false;
20518
20519 /* Note that these are little-endian tests.
20520 We correct for big-endian later. */
20521 if (!d->perm[0].is_constant (&odd)
20522 || (odd != 0 && odd != 1)
20523 || !d->perm.series_p (0, 1, odd, 2))
20524 return false;
20525
20526 /* Success! */
20527 if (d->testing_p)
20528 return true;
20529
20530 in0 = d->op0;
20531 in1 = d->op1;
20532 /* We don't need a big-endian lane correction for SVE; see the comment
20533 at the head of aarch64-sve.md for details. */
20534 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20535 {
20536 x = in0, in0 = in1, in1 = x;
20537 odd = !odd;
20538 }
20539 out = d->target;
20540
20541 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20542 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
20543 return true;
20544 }
20545
20546 /* Recognize patterns suitable for the ZIP instructions. */
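/* For example, with two 4-element vectors a ZIP1 corresponds to the
   permutation { 0, 4, 1, 5 } and a ZIP2 to { 2, 6, 3, 7 }.  */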
20547 static bool
20548 aarch64_evpc_zip (struct expand_vec_perm_d *d)
20549 {
20550 unsigned int high;
20551 poly_uint64 nelt = d->perm.length ();
20552 rtx out, in0, in1, x;
20553 machine_mode vmode = d->vmode;
20554
20555 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20556 return false;
20557
20558 /* Note that these are little-endian tests.
20559 We correct for big-endian later. */
20560 poly_uint64 first = d->perm[0];
20561 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20562 || !d->perm.series_p (0, 2, first, 1)
20563 || !d->perm.series_p (1, 2, first + nelt, 1))
20564 return false;
20565 high = maybe_ne (first, 0U);
20566
20567 /* Success! */
20568 if (d->testing_p)
20569 return true;
20570
20571 in0 = d->op0;
20572 in1 = d->op1;
20573 /* We don't need a big-endian lane correction for SVE; see the comment
20574 at the head of aarch64-sve.md for details. */
20575 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20576 {
20577 x = in0, in0 = in1, in1 = x;
20578 high = !high;
20579 }
20580 out = d->target;
20581
20582 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20583 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
20584 return true;
20585 }
20586
20587 /* Recognize patterns for the EXT insn. */
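/* For example, with two 4-element vectors the permutation { 1, 2, 3, 4 }
   corresponds to an EXT with an offset of one element.  */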
20588
20589 static bool
20590 aarch64_evpc_ext (struct expand_vec_perm_d *d)
20591 {
20592 HOST_WIDE_INT location;
20593 rtx offset;
20594
20595 /* The first element always refers to the first vector.
20596 Check if the extracted indices are increasing by one. */
20597 if (d->vec_flags == VEC_SVE_PRED
20598 || !d->perm[0].is_constant (&location)
20599 || !d->perm.series_p (0, 1, location, 1))
20600 return false;
20601
20602 /* Success! */
20603 if (d->testing_p)
20604 return true;
20605
20606 /* The case where (location == 0) is a no-op for both big- and little-endian,
20607 and is removed by the mid-end at optimization levels -O1 and higher.
20608
20609 We don't need a big-endian lane correction for SVE; see the comment
20610 at the head of aarch64-sve.md for details. */
20611 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
20612 {
20613 /* After setup, we want the high elements of the first vector (stored
20614 at the LSB end of the register), and the low elements of the second
20615 vector (stored at the MSB end of the register). So swap. */
20616 std::swap (d->op0, d->op1);
20617 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20618 to_constant () is safe since this is restricted to Advanced SIMD
20619 vectors. */
20620 location = d->perm.length ().to_constant () - location;
20621 }
20622
20623 offset = GEN_INT (location);
20624 emit_set_insn (d->target,
20625 gen_rtx_UNSPEC (d->vmode,
20626 gen_rtvec (3, d->op0, d->op1, offset),
20627 UNSPEC_EXT));
20628 return true;
20629 }
20630
20631 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20632 within each 64-bit, 32-bit or 16-bit granule. */
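/* For example, for V8HI the permutation { 1, 0, 3, 2, 5, 4, 7, 6 } reverses
   the two 16-bit elements within each 32-bit granule and so maps to REV32.  */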
20633
20634 static bool
20635 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
20636 {
20637 HOST_WIDE_INT diff;
20638 unsigned int i, size, unspec;
20639 machine_mode pred_mode;
20640
20641 if (d->vec_flags == VEC_SVE_PRED
20642 || !d->one_vector_p
20643 || !d->perm[0].is_constant (&diff)
20644 || !diff)
20645 return false;
20646
20647 if (d->vec_flags & VEC_SVE_DATA)
20648 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
20649 else
20650 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
20651 if (size == 64)
20652 {
20653 unspec = UNSPEC_REV64;
20654 pred_mode = VNx2BImode;
20655 }
20656 else if (size == 32)
20657 {
20658 unspec = UNSPEC_REV32;
20659 pred_mode = VNx4BImode;
20660 }
20661 else if (size == 16)
20662 {
20663 unspec = UNSPEC_REV16;
20664 pred_mode = VNx8BImode;
20665 }
20666 else
20667 return false;
20668
20669 unsigned int step = diff + 1;
20670 for (i = 0; i < step; ++i)
20671 if (!d->perm.series_p (i, step, diff - i, step))
20672 return false;
20673
20674 /* Success! */
20675 if (d->testing_p)
20676 return true;
20677
20678 if (d->vec_flags & VEC_SVE_DATA)
20679 {
20680 rtx pred = aarch64_ptrue_reg (pred_mode);
20681 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
20682 d->target, pred, d->op0));
20683 return true;
20684 }
20685 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
20686 emit_set_insn (d->target, src);
20687 return true;
20688 }
20689
20690 /* Recognize patterns for the REV insn, which reverses elements within
20691 a full vector. */
20692
20693 static bool
20694 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20695 {
20696 poly_uint64 nelt = d->perm.length ();
20697
20698 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
20699 return false;
20700
20701 if (!d->perm.series_p (0, 1, nelt - 1, -1))
20702 return false;
20703
20704 /* Success! */
20705 if (d->testing_p)
20706 return true;
20707
20708 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20709 emit_set_insn (d->target, src);
20710 return true;
20711 }
20712
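/* Recognize a permutation that duplicates a single element across the
   whole vector, which can be implemented with a DUP of that lane.  */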
20713 static bool
20714 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20715 {
20716 rtx out = d->target;
20717 rtx in0;
20718 HOST_WIDE_INT elt;
20719 machine_mode vmode = d->vmode;
20720 rtx lane;
20721
20722 if (d->vec_flags == VEC_SVE_PRED
20723 || d->perm.encoding ().encoded_nelts () != 1
20724 || !d->perm[0].is_constant (&elt))
20725 return false;
20726
20727 if ((d->vec_flags & VEC_SVE_DATA)
20728 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
20729 return false;
20730
20731 /* Success! */
20732 if (d->testing_p)
20733 return true;
20734
20735 /* The generic preparation in aarch64_expand_vec_perm_const_1
20736 swaps the operand order and the permute indices if it finds
20737 d->perm[0] to be in the second operand. Thus, we can always
20738 use d->op0 and need not do any extra arithmetic to get the
20739 correct lane number. */
20740 in0 = d->op0;
20741 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
20742
20743 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20744 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20745 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20746 return true;
20747 }
20748
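/* Try to implement D using an Advanced SIMD TBL instruction. */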
20749 static bool
20750 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20751 {
20752 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20753 machine_mode vmode = d->vmode;
20754
20755 /* Make sure that the indices are constant. */
20756 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20757 for (unsigned int i = 0; i < encoded_nelts; ++i)
20758 if (!d->perm[i].is_constant ())
20759 return false;
20760
20761 if (d->testing_p)
20762 return true;
20763
20764 /* Generic code will try constant permutation twice: once with the
20765 original mode and again with the elements lowered to QImode.
20766 So wait and don't do the selector expansion ourselves. */
20767 if (vmode != V8QImode && vmode != V16QImode)
20768 return false;
20769
20770 /* to_constant is safe since this routine is specific to Advanced SIMD
20771 vectors. */
20772 unsigned int nelt = d->perm.length ().to_constant ();
20773 for (unsigned int i = 0; i < nelt; ++i)
20774 /* If big-endian and two vectors we end up with a weird mixed-endian
20775 mode on NEON. Reverse the index within each word but not the word
20776 itself. to_constant is safe because we checked is_constant above. */
20777 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20778 ? d->perm[i].to_constant () ^ (nelt - 1)
20779 : d->perm[i].to_constant ());
20780
20781 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20782 sel = force_reg (vmode, sel);
20783
20784 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20785 return true;
20786 }
20787
20788 /* Try to implement D using an SVE TBL instruction. */
20789
20790 static bool
20791 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20792 {
20793 unsigned HOST_WIDE_INT nelt;
20794
20795 /* Permuting two variable-length vectors could overflow the
20796 index range. */
20797 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20798 return false;
20799
20800 if (d->testing_p)
20801 return true;
20802
20803 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20804 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20805 if (d->one_vector_p)
20806 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20807 else
20808 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20809 return true;
20810 }
20811
20812 /* Try to implement D using the SVE SEL instruction. */
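/* For example, with two VNx4SImode inputs and N elements per vector,
   the selector { 0, N + 1, 2, N + 3, ... } takes even-numbered elements
   from OP0 and odd-numbered elements from OP1, and so can be implemented
   as a SEL with an alternating predicate. */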
20813
20814 static bool
20815 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20816 {
20817 machine_mode vmode = d->vmode;
20818 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20819
20820 if (d->vec_flags != VEC_SVE_DATA
20821 || unit_size > 8)
20822 return false;
20823
20824 int n_patterns = d->perm.encoding ().npatterns ();
20825 poly_int64 vec_len = d->perm.length ();
20826
20827 for (int i = 0; i < n_patterns; ++i)
20828 if (!known_eq (d->perm[i], i)
20829 && !known_eq (d->perm[i], vec_len + i))
20830 return false;
20831
20832 for (int i = n_patterns; i < n_patterns * 2; i++)
20833 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20834 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20835 return false;
20836
20837 if (d->testing_p)
20838 return true;
20839
20840 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20841
20842 /* Build a predicate that is true when op0 elements should be used. */
20843 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20844 for (int i = 0; i < n_patterns * 2; i++)
20845 {
20846 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20847 : CONST0_RTX (BImode);
20848 builder.quick_push (elem);
20849 }
20850
20851 rtx const_vec = builder.build ();
20852 rtx pred = force_reg (pred_mode, const_vec);
20853 /* TARGET = PRED ? OP0 : OP1. */
20854 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20855 return true;
20856 }
20857
20858 /* Recognize patterns suitable for the INS instructions. */
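/* For example, the V4SImode selector { 0, 1, 6, 3 } copies element 2 of
   OP1 into lane 2 of OP0 and so maps to an INS of that element
   (preceded by a copy of OP0 into the target). */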
20859 static bool
20860 aarch64_evpc_ins (struct expand_vec_perm_d *d)
20861 {
20862 machine_mode mode = d->vmode;
20863 unsigned HOST_WIDE_INT nelt;
20864
20865 if (d->vec_flags != VEC_ADVSIMD)
20866 return false;
20867
20868 /* to_constant is safe since this routine is specific to Advanced SIMD
20869 vectors. */
20870 nelt = d->perm.length ().to_constant ();
20871 rtx insv = d->op0;
20872
20873 HOST_WIDE_INT idx = -1;
20874
20875 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20876 {
20877 HOST_WIDE_INT elt;
20878 if (!d->perm[i].is_constant (&elt))
20879 return false;
20880 if (elt == (HOST_WIDE_INT) i)
20881 continue;
20882 if (idx != -1)
20883 {
20884 idx = -1;
20885 break;
20886 }
20887 idx = i;
20888 }
20889
20890 if (idx == -1)
20891 {
20892 insv = d->op1;
20893 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20894 {
20895 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
20896 continue;
20897 if (idx != -1)
20898 return false;
20899 idx = i;
20900 }
20901
20902 if (idx == -1)
20903 return false;
20904 }
20905
20906 if (d->testing_p)
20907 return true;
20908
20909 gcc_assert (idx != -1);
20910
20911 unsigned extractindex = d->perm[idx].to_constant ();
20912 rtx extractv = d->op0;
20913 if (extractindex >= nelt)
20914 {
20915 extractv = d->op1;
20916 extractindex -= nelt;
20917 }
20918 gcc_assert (extractindex < nelt);
20919
20920 emit_move_insn (d->target, insv);
20921 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
20922 expand_operand ops[5];
20923 create_output_operand (&ops[0], d->target, mode);
20924 create_input_operand (&ops[1], d->target, mode);
20925 create_integer_operand (&ops[2], 1 << idx);
20926 create_input_operand (&ops[3], extractv, mode);
20927 create_integer_operand (&ops[4], extractindex);
20928 expand_insn (icode, 5, ops);
20929
20930 return true;
20931 }
20932
20933 static bool
20934 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20935 {
20936 /* The pattern matching functions above are written to look for a small
20937 number to begin the sequence (0, 1, N/2). If we begin with an index
20938 from the second operand, we can swap the operands. */
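/* For example, with two V4SImode inputs, the selector { 5, 1, 6, 2 }
   is rewritten as { 1, 5, 2, 6 } with OP0 and OP1 swapped, leaving the
   overall selection unchanged. */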
20939 poly_int64 nelt = d->perm.length ();
20940 if (known_ge (d->perm[0], nelt))
20941 {
20942 d->perm.rotate_inputs (1);
20943 std::swap (d->op0, d->op1);
20944 }
20945
20946 if ((d->vec_flags == VEC_ADVSIMD
20947 || d->vec_flags == VEC_SVE_DATA
20948 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
20949 || d->vec_flags == VEC_SVE_PRED)
20950 && known_gt (nelt, 1))
20951 {
20952 if (aarch64_evpc_rev_local (d))
20953 return true;
20954 else if (aarch64_evpc_rev_global (d))
20955 return true;
20956 else if (aarch64_evpc_ext (d))
20957 return true;
20958 else if (aarch64_evpc_dup (d))
20959 return true;
20960 else if (aarch64_evpc_zip (d))
20961 return true;
20962 else if (aarch64_evpc_uzp (d))
20963 return true;
20964 else if (aarch64_evpc_trn (d))
20965 return true;
20966 else if (aarch64_evpc_sel (d))
20967 return true;
20968 else if (aarch64_evpc_ins (d))
20969 return true;
20970 else if (aarch64_evpc_reencode (d))
20971 return true;
20972 if (d->vec_flags == VEC_SVE_DATA)
20973 return aarch64_evpc_sve_tbl (d);
20974 else if (d->vec_flags == VEC_ADVSIMD)
20975 return aarch64_evpc_tbl (d);
20976 }
20977 return false;
20978 }
20979
20980 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
20981
20982 static bool
20983 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20984 rtx op1, const vec_perm_indices &sel)
20985 {
20986 struct expand_vec_perm_d d;
20987
20988 /* Check whether the mask can be applied to a single vector. */
20989 if (sel.ninputs () == 1
20990 || (op0 && rtx_equal_p (op0, op1)))
20991 d.one_vector_p = true;
20992 else if (sel.all_from_input_p (0))
20993 {
20994 d.one_vector_p = true;
20995 op1 = op0;
20996 }
20997 else if (sel.all_from_input_p (1))
20998 {
20999 d.one_vector_p = true;
21000 op0 = op1;
21001 }
21002 else
21003 d.one_vector_p = false;
21004
21005 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
21006 sel.nelts_per_input ());
21007 d.vmode = vmode;
21008 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
21009 d.target = target;
21010 d.op0 = op0;
21011 d.op1 = op1;
21012 d.testing_p = !target;
21013
21014 if (!d.testing_p)
21015 return aarch64_expand_vec_perm_const_1 (&d);
21016
21017 rtx_insn *last = get_last_insn ();
21018 bool ret = aarch64_expand_vec_perm_const_1 (&d);
21019 gcc_assert (last == get_last_insn ());
21020
21021 return ret;
21022 }
21023
21024 /* Generate a byte permute mask for a register of mode MODE,
21025 which has NUNITS units. */
21026
21027 rtx
21028 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
21029 {
21030 /* We have to reverse each vector because we don't have
21031 a permuted load that can reverse-load according to ABI rules. */
21032 rtx mask;
21033 rtvec v = rtvec_alloc (16);
21034 unsigned int i, j;
21035 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
21036
21037 gcc_assert (BYTES_BIG_ENDIAN);
21038 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
21039
21040 for (i = 0; i < nunits; i++)
21041 for (j = 0; j < usize; j++)
21042 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
21043 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
21044 return force_reg (V16QImode, mask);
21045 }
21046
21047 /* Expand an SVE integer comparison using the SVE equivalent of:
21048
21049 (set TARGET (CODE OP0 OP1)). */
21050
21051 void
21052 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
21053 {
21054 machine_mode pred_mode = GET_MODE (target);
21055 machine_mode data_mode = GET_MODE (op0);
21056 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
21057 op0, op1);
21058 if (!rtx_equal_p (target, res))
21059 emit_move_insn (target, res);
21060 }
21061
21062 /* Return the UNSPEC_COND_* code for comparison CODE. */
21063
21064 static unsigned int
21065 aarch64_unspec_cond_code (rtx_code code)
21066 {
21067 switch (code)
21068 {
21069 case NE:
21070 return UNSPEC_COND_FCMNE;
21071 case EQ:
21072 return UNSPEC_COND_FCMEQ;
21073 case LT:
21074 return UNSPEC_COND_FCMLT;
21075 case GT:
21076 return UNSPEC_COND_FCMGT;
21077 case LE:
21078 return UNSPEC_COND_FCMLE;
21079 case GE:
21080 return UNSPEC_COND_FCMGE;
21081 case UNORDERED:
21082 return UNSPEC_COND_FCMUO;
21083 default:
21084 gcc_unreachable ();
21085 }
21086 }
21087
21088 /* Emit:
21089
21090 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
21091
21092 where <X> is the operation associated with comparison CODE.
21093 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
21094
21095 static void
21096 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
21097 bool known_ptrue_p, rtx op0, rtx op1)
21098 {
21099 rtx flag = gen_int_mode (known_ptrue_p, SImode);
21100 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
21101 gen_rtvec (4, pred, flag, op0, op1),
21102 aarch64_unspec_cond_code (code));
21103 emit_set_insn (target, unspec);
21104 }
21105
21106 /* Emit the SVE equivalent of:
21107
21108 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
21109 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
21110 (set TARGET (ior:PRED_MODE TMP1 TMP2))
21111
21112 where <Xi> is the operation associated with comparison CODEi.
21113 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
21114
21115 static void
21116 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
21117 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
21118 {
21119 machine_mode pred_mode = GET_MODE (pred);
21120 rtx tmp1 = gen_reg_rtx (pred_mode);
21121 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
21122 rtx tmp2 = gen_reg_rtx (pred_mode);
21123 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
21124 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
21125 }
21126
21127 /* Emit the SVE equivalent of:
21128
21129 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
21130 (set TARGET (not TMP))
21131
21132 where <X> is the operation associated with comparison CODE.
21133 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
21134
21135 static void
21136 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
21137 bool known_ptrue_p, rtx op0, rtx op1)
21138 {
21139 machine_mode pred_mode = GET_MODE (pred);
21140 rtx tmp = gen_reg_rtx (pred_mode);
21141 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
21142 aarch64_emit_unop (target, one_cmpl_optab, tmp);
21143 }
21144
21145 /* Expand an SVE floating-point comparison using the SVE equivalent of:
21146
21147 (set TARGET (CODE OP0 OP1))
21148
21149 If CAN_INVERT_P is true, the caller can also handle inverted results;
21150 return true if the result is in fact inverted. */
21151
21152 bool
21153 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
21154 rtx op0, rtx op1, bool can_invert_p)
21155 {
21156 machine_mode pred_mode = GET_MODE (target);
21157 machine_mode data_mode = GET_MODE (op0);
21158
21159 rtx ptrue = aarch64_ptrue_reg (pred_mode);
21160 switch (code)
21161 {
21162 case UNORDERED:
21163 /* UNORDERED has no immediate form. */
21164 op1 = force_reg (data_mode, op1);
21165 /* fall through */
21166 case LT:
21167 case LE:
21168 case GT:
21169 case GE:
21170 case EQ:
21171 case NE:
21172 {
21173 /* There is native support for the comparison. */
21174 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21175 return false;
21176 }
21177
21178 case LTGT:
21179 /* This is a trapping operation (LT or GT). */
21180 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
21181 return false;
21182
21183 case UNEQ:
21184 if (!flag_trapping_math)
21185 {
21186 /* This would trap for signaling NaNs. */
21187 op1 = force_reg (data_mode, op1);
21188 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
21189 ptrue, true, op0, op1);
21190 return false;
21191 }
21192 /* fall through */
21193 case UNLT:
21194 case UNLE:
21195 case UNGT:
21196 case UNGE:
21197 if (flag_trapping_math)
21198 {
21199 /* Work out which elements are ordered. */
21200 rtx ordered = gen_reg_rtx (pred_mode);
21201 op1 = force_reg (data_mode, op1);
21202 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
21203 ptrue, true, op0, op1);
21204
21205 /* Test the opposite condition for the ordered elements,
21206 then invert the result. */
21207 if (code == UNEQ)
21208 code = NE;
21209 else
21210 code = reverse_condition_maybe_unordered (code);
21211 if (can_invert_p)
21212 {
21213 aarch64_emit_sve_fp_cond (target, code,
21214 ordered, false, op0, op1);
21215 return true;
21216 }
21217 aarch64_emit_sve_invert_fp_cond (target, code,
21218 ordered, false, op0, op1);
21219 return false;
21220 }
21221 break;
21222
21223 case ORDERED:
21224 /* ORDERED has no immediate form. */
21225 op1 = force_reg (data_mode, op1);
21226 break;
21227
21228 default:
21229 gcc_unreachable ();
21230 }
21231
21232 /* There is native support for the inverse comparison. */
21233 code = reverse_condition_maybe_unordered (code);
21234 if (can_invert_p)
21235 {
21236 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21237 return true;
21238 }
21239 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
21240 return false;
21241 }
21242
21243 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
21244 of the data being selected and CMP_MODE is the mode of the values being
21245 compared. */
21246
21247 void
21248 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
21249 rtx *ops)
21250 {
21251 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
21252 rtx pred = gen_reg_rtx (pred_mode);
21253 if (FLOAT_MODE_P (cmp_mode))
21254 {
21255 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
21256 ops[4], ops[5], true))
21257 std::swap (ops[1], ops[2]);
21258 }
21259 else
21260 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
21261
21262 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
21263 ops[1] = force_reg (data_mode, ops[1]);
21264 /* The "false" value can only be zero if the "true" value is a constant. */
21265 if (register_operand (ops[1], data_mode)
21266 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
21267 ops[2] = force_reg (data_mode, ops[2]);
21268
21269 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
21270 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
21271 }
21272
21273 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
21274 true. However, due to issues with register allocation it is preferable
21275 to avoid tying integer scalar and FP scalar modes. Executing integer
21276 operations in general registers is better than treating them as scalar
21277 vector operations. This reduces latency and avoids redundant int<->FP
21278 moves. So tie modes if they are either the same class, or vector modes
21279 with other vector modes, vector structs or any scalar mode. */
21280
21281 static bool
21282 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
21283 {
21284 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
21285 return true;
21286
21287 /* We specifically want to allow elements of "structure" modes to
21288 be tieable to the structure. This more general condition allows
21289 other rarer situations too. The reason we don't extend this to
21290 predicate modes is that there are no predicate structure modes
21291 nor any specific instructions for extracting part of a predicate
21292 register. */
21293 if (aarch64_vector_data_mode_p (mode1)
21294 && aarch64_vector_data_mode_p (mode2))
21295 return true;
21296
21297 /* Also allow any scalar modes with vectors. */
21298 if (aarch64_vector_mode_supported_p (mode1)
21299 || aarch64_vector_mode_supported_p (mode2))
21300 return true;
21301
21302 return false;
21303 }
21304
21305 /* Return a new RTX holding the result of moving POINTER forward by
21306 AMOUNT bytes. */
21307
21308 static rtx
21309 aarch64_move_pointer (rtx pointer, poly_int64 amount)
21310 {
21311 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21312
21313 return adjust_automodify_address (pointer, GET_MODE (pointer),
21314 next, amount);
21315 }
21316
21317 /* Return a new RTX holding the result of moving POINTER forward by the
21318 size of the mode it points to. */
21319
21320 static rtx
21321 aarch64_progress_pointer (rtx pointer)
21322 {
21323 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
21324 }
21325
21326 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21327 MODE bytes. */
21328
21329 static void
21330 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
21331 machine_mode mode)
21332 {
21333 /* Handle a 256-bit memcpy separately. We do this as two adjacent 128-bit
21334 copies in V4SImode so that we can use Q registers. */
21335 if (known_eq (GET_MODE_BITSIZE (mode), 256))
21336 {
21337 mode = V4SImode;
21338 rtx reg1 = gen_reg_rtx (mode);
21339 rtx reg2 = gen_reg_rtx (mode);
21340 /* "Cast" the pointers to the correct mode. */
21341 *src = adjust_address (*src, mode, 0);
21342 *dst = adjust_address (*dst, mode, 0);
21343 /* Emit the memcpy. */
21344 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
21345 aarch64_progress_pointer (*src)));
21346 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
21347 aarch64_progress_pointer (*dst), reg2));
21348 /* Move the pointers forward. */
21349 *src = aarch64_move_pointer (*src, 32);
21350 *dst = aarch64_move_pointer (*dst, 32);
21351 return;
21352 }
21353
21354 rtx reg = gen_reg_rtx (mode);
21355
21356 /* "Cast" the pointers to the correct mode. */
21357 *src = adjust_address (*src, mode, 0);
21358 *dst = adjust_address (*dst, mode, 0);
21359 /* Emit the memcpy. */
21360 emit_move_insn (reg, *src);
21361 emit_move_insn (*dst, reg);
21362 /* Move the pointers forward. */
21363 *src = aarch64_progress_pointer (*src);
21364 *dst = aarch64_progress_pointer (*dst);
21365 }
21366
21367 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
21368 we succeed, otherwise return false. */
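/* For example, in the absence of tuning restrictions, a 15-byte copy is
   typically emitted as an 8-byte copy followed by an overlapping 8-byte
   copy of the final 8 bytes, rather than 8 + 4 + 2 + 1 byte copies. */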
21369
21370 bool
21371 aarch64_expand_cpymem (rtx *operands)
21372 {
21373 int mode_bits;
21374 rtx dst = operands[0];
21375 rtx src = operands[1];
21376 rtx base;
21377 machine_mode cur_mode = BLKmode;
21378
21379 /* Only expand fixed-size copies. */
21380 if (!CONST_INT_P (operands[2]))
21381 return false;
21382
21383 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
21384
21385 /* Inline up to 256 bytes when optimizing for speed. */
21386 unsigned HOST_WIDE_INT max_copy_size = 256;
21387
21388 if (optimize_function_for_size_p (cfun))
21389 max_copy_size = 128;
21390
21391 int copy_bits = 256;
21392
21393 /* Default to 256-bit LDP/STP on large copies; fall back to 128-bit chunks
21394 for small copies, when SIMD is unavailable, or when 256-bit LDP/STP is slow. */
21395 if (size <= 24
21396 || !TARGET_SIMD
21397 || (aarch64_tune_params.extra_tuning_flags
21398 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21399 {
21400 copy_bits = 128;
21401 max_copy_size = max_copy_size / 2;
21402 }
21403
21404 if (size > max_copy_size)
21405 return false;
21406
21407 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21408 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21409
21410 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21411 src = adjust_automodify_address (src, VOIDmode, base, 0);
21412
21413 /* Convert size to bits to make the rest of the code simpler. */
21414 int n = size * BITS_PER_UNIT;
21415
21416 while (n > 0)
21417 {
21418 /* Find the largest mode in which to do the copy without over-reading
21419 or over-writing. */
21420 opt_scalar_int_mode mode_iter;
21421 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21422 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
21423 cur_mode = mode_iter.require ();
21424
21425 gcc_assert (cur_mode != BLKmode);
21426
21427 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21428
21429 /* Prefer Q-register accesses for the last bytes. */
21430 if (mode_bits == 128 && copy_bits == 256)
21431 cur_mode = V4SImode;
21432
21433 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
21434
21435 n -= mode_bits;
21436
21437 /* Emit trailing copies using overlapping unaligned accesses - this is
21438 smaller and faster. */
21439 if (n > 0 && n < copy_bits / 2)
21440 {
21441 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
21442 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21443 gcc_assert (n_bits <= mode_bits);
21444 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21445 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21446 n = n_bits;
21447 }
21448 }
21449
21450 return true;
21451 }
21452
21453 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
21454 SRC is a register we have created with the duplicated value to be set. */
21455 static void
21456 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
21457 machine_mode mode)
21458 {
21459 /* If we are setting 128 bits or 256 bits, we can do that straight from
21460 the SIMD register we prepared. */
21461 if (known_eq (GET_MODE_BITSIZE (mode), 256))
21462 {
21463 mode = GET_MODE (src);
21464 /* "Cast" the *dst to the correct mode. */
21465 *dst = adjust_address (*dst, mode, 0);
21466 /* Emit the memset. */
21467 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
21468 aarch64_progress_pointer (*dst), src));
21469
21470 /* Move the pointers forward. */
21471 *dst = aarch64_move_pointer (*dst, 32);
21472 return;
21473 }
21474 if (known_eq (GET_MODE_BITSIZE (mode), 128))
21475 {
21476 /* "Cast" the *dst to the correct mode. */
21477 *dst = adjust_address (*dst, GET_MODE (src), 0);
21478 /* Emit the memset. */
21479 emit_move_insn (*dst, src);
21480 /* Move the pointers forward. */
21481 *dst = aarch64_move_pointer (*dst, 16);
21482 return;
21483 }
21484 /* For smaller sizes, we have to extract the right amount from src. */
21485 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
21486
21487 /* "Cast" the *dst to the correct mode. */
21488 *dst = adjust_address (*dst, mode, 0);
21489 /* Emit the memset. */
21490 emit_move_insn (*dst, reg);
21491 /* Move the pointer forward. */
21492 *dst = aarch64_progress_pointer (*dst);
21493 }
21494
21495 /* Expand setmem, as if from a __builtin_memset. Return true if
21496 we succeed, otherwise return false. */
21497
21498 bool
21499 aarch64_expand_setmem (rtx *operands)
21500 {
21501 int n, mode_bits;
21502 unsigned HOST_WIDE_INT len;
21503 rtx dst = operands[0];
21504 rtx val = operands[2], src;
21505 rtx base;
21506 machine_mode cur_mode = BLKmode, next_mode;
21507
21508 /* We can't do anything smart if the amount to set is not constant. */
21509 if (!CONST_INT_P (operands[1]))
21510 return false;
21511
21512 bool speed_p = !optimize_function_for_size_p (cfun);
21513
21514 /* Default the maximum to 256 bytes. */
21515 unsigned max_set_size = 256;
21516
21517 /* In case we are optimizing for size or if the core does not
21518 want to use STP Q regs, lower the max_set_size. */
21519 max_set_size = (!speed_p
21520 || (aarch64_tune_params.extra_tuning_flags
21521 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21522 ? max_set_size / 2 : max_set_size;
21523
21524 len = INTVAL (operands[1]);
21525
21526 /* Upper bound check. */
21527 if (len > max_set_size)
21528 return false;
21529
21530 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21531 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21532
21533 /* Prepare the val using a DUP/MOVI v0.16B, val. */
21534 src = expand_vector_broadcast (V16QImode, val);
21535 src = force_reg (V16QImode, src);
21536
21537 /* Convert len to bits to make the rest of the code simpler. */
21538 n = len * BITS_PER_UNIT;
21539
21540 /* Maximum amount to set in one go. We allow 256-bit chunks based on the
21541 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. The setmem expand
21542 pattern is only turned on for TARGET_SIMD. */
21543 const int copy_limit = (speed_p
21544 && (aarch64_tune_params.extra_tuning_flags
21545 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21546 ? GET_MODE_BITSIZE (TImode) : 256;
21547
21548 while (n > 0)
21549 {
21550 /* Find the largest mode in which to do the copy without
21551 overwriting. */
21552 opt_scalar_int_mode mode_iter;
21553 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21554 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
21555 cur_mode = mode_iter.require ();
21556
21557 gcc_assert (cur_mode != BLKmode);
21558
21559 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21560 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
21561
21562 n -= mode_bits;
21563
21564 /* Do certain trailing copies as overlapping if it's going to be
21565 cheaper, i.e. fewer instructions. For instance, for a 15-byte
21566 copy it's more efficient to do two overlapping 8-byte copies than
21567 8 + 4 + 2 + 1. */
21568 if (n > 0 && n < copy_limit / 2)
21569 {
21570 next_mode = smallest_mode_for_size (n, MODE_INT);
21571 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21572 gcc_assert (n_bits <= mode_bits);
21573 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21574 n = n_bits;
21575 }
21576 }
21577
21578 return true;
21579 }
21580
21581
21582 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
21583 SImode stores. Handle the case when the constant has identical
21584 bottom and top halves. This is beneficial when the two stores can be
21585 merged into an STP and we avoid synthesising potentially expensive
21586 immediates twice. Return true if such a split is possible. */
21587
21588 bool
21589 aarch64_split_dimode_const_store (rtx dst, rtx src)
21590 {
21591 rtx lo = gen_lowpart (SImode, src);
21592 rtx hi = gen_highpart_mode (SImode, DImode, src);
21593
21594 bool size_p = optimize_function_for_size_p (cfun);
21595
21596 if (!rtx_equal_p (lo, hi))
21597 return false;
21598
21599 unsigned int orig_cost
21600 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21601 unsigned int lo_cost
21602 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21603
21604 /* We want to transform:
21605 MOV x1, 49370
21606 MOVK x1, 0x140, lsl 16
21607 MOVK x1, 0xc0da, lsl 32
21608 MOVK x1, 0x140, lsl 48
21609 STR x1, [x0]
21610 into:
21611 MOV w1, 49370
21612 MOVK w1, 0x140, lsl 16
21613 STP w1, w1, [x0]
21614 So we want to perform this only when we save two instructions
21615 or more. When optimizing for size, however, accept any code size
21616 savings we can. */
21617 if (size_p && orig_cost <= lo_cost)
21618 return false;
21619
21620 if (!size_p
21621 && (orig_cost <= lo_cost + 1))
21622 return false;
21623
21624 rtx mem_lo = adjust_address (dst, SImode, 0);
21625 if (!aarch64_mem_pair_operand (mem_lo, SImode))
21626 return false;
21627
21628 rtx tmp_reg = gen_reg_rtx (SImode);
21629 aarch64_expand_mov_immediate (tmp_reg, lo);
21630 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
21631 /* Don't emit an explicit store pair as this may not always be profitable.
21632 Let the sched-fusion logic decide whether to merge them. */
21633 emit_move_insn (mem_lo, tmp_reg);
21634 emit_move_insn (mem_hi, tmp_reg);
21635
21636 return true;
21637 }
21638
21639 /* Generate RTL for a conditional branch with rtx comparison CODE in
21640 mode CC_MODE. The destination of the unlikely conditional branch
21641 is LABEL_REF. */
21642
21643 void
21644 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21645 rtx label_ref)
21646 {
21647 rtx x;
21648 x = gen_rtx_fmt_ee (code, VOIDmode,
21649 gen_rtx_REG (cc_mode, CC_REGNUM),
21650 const0_rtx);
21651
21652 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21653 gen_rtx_LABEL_REF (VOIDmode, label_ref),
21654 pc_rtx);
21655 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21656 }
21657
21658 /* Generate DImode scratch registers for 128-bit (TImode) addition.
21659
21660 OP1 represents the TImode destination operand 1
21661 OP2 represents the TImode destination operand 2
21662 LOW_DEST represents the low half (DImode) of TImode operand 0
21663 LOW_IN1 represents the low half (DImode) of TImode operand 1
21664 LOW_IN2 represents the low half (DImode) of TImode operand 2
21665 HIGH_DEST represents the high half (DImode) of TImode operand 0
21666 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21667 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21668
21669 void
21670 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21671 rtx *low_in1, rtx *low_in2,
21672 rtx *high_dest, rtx *high_in1,
21673 rtx *high_in2)
21674 {
21675 *low_dest = gen_reg_rtx (DImode);
21676 *low_in1 = gen_lowpart (DImode, op1);
21677 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21678 subreg_lowpart_offset (DImode, TImode));
21679 *high_dest = gen_reg_rtx (DImode);
21680 *high_in1 = gen_highpart (DImode, op1);
21681 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21682 subreg_highpart_offset (DImode, TImode));
21683 }
21684
21685 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21686
21687 This function differs from 'aarch64_addti_scratch_regs' in that
21688 OP1 can be an immediate constant (zero). We must call
21689 subreg_highpart_offset with DImode and TImode arguments, otherwise
21690 VOIDmode will be used for the const_int which generates an internal
21691 error from subreg_size_highpart_offset which does not expect a size of zero.
21692
21693 OP1 represents the TImode destination operand 1
21694 OP2 represents the TImode destination operand 2
21695 LOW_DEST represents the low half (DImode) of TImode operand 0
21696 LOW_IN1 represents the low half (DImode) of TImode operand 1
21697 LOW_IN2 represents the low half (DImode) of TImode operand 2
21698 HIGH_DEST represents the high half (DImode) of TImode operand 0
21699 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21700 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21701
21702
21703 void
21704 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21705 rtx *low_in1, rtx *low_in2,
21706 rtx *high_dest, rtx *high_in1,
21707 rtx *high_in2)
21708 {
21709 *low_dest = gen_reg_rtx (DImode);
21710 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21711 subreg_lowpart_offset (DImode, TImode));
21712
21713 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21714 subreg_lowpart_offset (DImode, TImode));
21715 *high_dest = gen_reg_rtx (DImode);
21716
21717 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21718 subreg_highpart_offset (DImode, TImode));
21719 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21720 subreg_highpart_offset (DImode, TImode));
21721 }
21722
21723 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
21724
21725 OP0 represents the TImode destination operand 0
21726 LOW_DEST represents the low half (DImode) of TImode operand 0
21727 LOW_IN1 represents the low half (DImode) of TImode operand 1
21728 LOW_IN2 represents the low half (DImode) of TImode operand 2
21729 HIGH_DEST represents the high half (DImode) of TImode operand 0
21730 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21731 HIGH_IN2 represents the high half (DImode) of TImode operand 2
21732 UNSIGNED_P is true if the operation is being performed on unsigned
21733 values. */
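/* For example, when LOW_IN2 is a nonzero register and UNSIGNED_P is true,
   the expansion is typically a SUBS of the low halves followed by an SBCS
   of the high halves, leaving the result of the comparison in the flags. */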
21734 void
21735 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21736 rtx low_in2, rtx high_dest, rtx high_in1,
21737 rtx high_in2, bool unsigned_p)
21738 {
21739 if (low_in2 == const0_rtx)
21740 {
21741 low_dest = low_in1;
21742 high_in2 = force_reg (DImode, high_in2);
21743 if (unsigned_p)
21744 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21745 else
21746 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
21747 }
21748 else
21749 {
21750 if (aarch64_plus_immediate (low_in2, DImode))
21751 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21752 GEN_INT (-INTVAL (low_in2))));
21753 else
21754 {
21755 low_in2 = force_reg (DImode, low_in2);
21756 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
21757 }
21758 high_in2 = force_reg (DImode, high_in2);
21759
21760 if (unsigned_p)
21761 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21762 else
21763 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
21764 }
21765
21766 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21767 emit_move_insn (gen_highpart (DImode, op0), high_dest);
21768
21769 }
21770
21771 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
21772
21773 static unsigned HOST_WIDE_INT
21774 aarch64_asan_shadow_offset (void)
21775 {
21776 if (TARGET_ILP32)
21777 return (HOST_WIDE_INT_1 << 29);
21778 else
21779 return (HOST_WIDE_INT_1 << 36);
21780 }
21781
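/* Implement TARGET_GEN_CCMP_FIRST. Generate the initial comparison of a
   conditional-compare sequence, e.g. the leading CMP or FCMP of a chain
   such as (a == 0 && b == 0). */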
21782 static rtx
21783 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
21784 int code, tree treeop0, tree treeop1)
21785 {
21786 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21787 rtx op0, op1;
21788 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21789 insn_code icode;
21790 struct expand_operand ops[4];
21791
21792 start_sequence ();
21793 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21794
21795 op_mode = GET_MODE (op0);
21796 if (op_mode == VOIDmode)
21797 op_mode = GET_MODE (op1);
21798
21799 switch (op_mode)
21800 {
21801 case E_QImode:
21802 case E_HImode:
21803 case E_SImode:
21804 cmp_mode = SImode;
21805 icode = CODE_FOR_cmpsi;
21806 break;
21807
21808 case E_DImode:
21809 cmp_mode = DImode;
21810 icode = CODE_FOR_cmpdi;
21811 break;
21812
21813 case E_SFmode:
21814 cmp_mode = SFmode;
21815 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21816 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21817 break;
21818
21819 case E_DFmode:
21820 cmp_mode = DFmode;
21821 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21822 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21823 break;
21824
21825 default:
21826 end_sequence ();
21827 return NULL_RTX;
21828 }
21829
21830 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21831 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
21832 if (!op0 || !op1)
21833 {
21834 end_sequence ();
21835 return NULL_RTX;
21836 }
21837 *prep_seq = get_insns ();
21838 end_sequence ();
21839
21840 create_fixed_operand (&ops[0], op0);
21841 create_fixed_operand (&ops[1], op1);
21842
21843 start_sequence ();
21844 if (!maybe_expand_insn (icode, 2, ops))
21845 {
21846 end_sequence ();
21847 return NULL_RTX;
21848 }
21849 *gen_seq = get_insns ();
21850 end_sequence ();
21851
21852 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21853 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
21854 }
21855
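/* Implement TARGET_GEN_CCMP_NEXT. Generate the conditional comparison
   (CCMP or FCCMP) that continues a sequence started by
   aarch64_gen_ccmp_first, performed only when the condition described by
   PREV holds. */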
21856 static rtx
21857 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21858 int cmp_code, tree treeop0, tree treeop1, int bit_code)
21859 {
21860 rtx op0, op1, target;
21861 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21862 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21863 insn_code icode;
21864 struct expand_operand ops[6];
21865 int aarch64_cond;
21866
21867 push_to_sequence (*prep_seq);
21868 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21869
21870 op_mode = GET_MODE (op0);
21871 if (op_mode == VOIDmode)
21872 op_mode = GET_MODE (op1);
21873
21874 switch (op_mode)
21875 {
21876 case E_QImode:
21877 case E_HImode:
21878 case E_SImode:
21879 cmp_mode = SImode;
21880 break;
21881
21882 case E_DImode:
21883 cmp_mode = DImode;
21884 break;
21885
21886 case E_SFmode:
21887 cmp_mode = SFmode;
21888 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21889 break;
21890
21891 case E_DFmode:
21892 cmp_mode = DFmode;
21893 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21894 break;
21895
21896 default:
21897 end_sequence ();
21898 return NULL_RTX;
21899 }
21900
21901 icode = code_for_ccmp (cc_mode, cmp_mode);
21902
21903 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21904 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21905 if (!op0 || !op1)
21906 {
21907 end_sequence ();
21908 return NULL_RTX;
21909 }
21910 *prep_seq = get_insns ();
21911 end_sequence ();
21912
21913 target = gen_rtx_REG (cc_mode, CC_REGNUM);
21914 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
21915
21916 if (bit_code != AND)
21917 {
21918 /* Treat the ccmp patterns as canonical and use them where possible,
21919 but fall back to ccmp_rev patterns if there's no other option. */
21920 rtx_code prev_code = GET_CODE (prev);
21921 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21922 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21923 && !(prev_code == EQ
21924 || prev_code == NE
21925 || prev_code == ORDERED
21926 || prev_code == UNORDERED))
21927 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21928 else
21929 {
21930 rtx_code code = reverse_condition (prev_code);
21931 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21932 }
21933 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21934 }
21935
21936 create_fixed_operand (&ops[0], XEXP (prev, 0));
21937 create_fixed_operand (&ops[1], target);
21938 create_fixed_operand (&ops[2], op0);
21939 create_fixed_operand (&ops[3], op1);
21940 create_fixed_operand (&ops[4], prev);
21941 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
21942
21943 push_to_sequence (*gen_seq);
21944 if (!maybe_expand_insn (icode, 6, ops))
21945 {
21946 end_sequence ();
21947 return NULL_RTX;
21948 }
21949
21950 *gen_seq = get_insns ();
21951 end_sequence ();
21952
21953 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21954 }
21955
21956 #undef TARGET_GEN_CCMP_FIRST
21957 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21958
21959 #undef TARGET_GEN_CCMP_NEXT
21960 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21961
21962 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21963 instruction fusion of some sort. */
21964
21965 static bool
21966 aarch64_macro_fusion_p (void)
21967 {
21968 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21969 }
21970
21971
21972 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21973 should be kept together during scheduling. */
21974
21975 static bool
21976 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21977 {
21978 rtx set_dest;
21979 rtx prev_set = single_set (prev);
21980 rtx curr_set = single_set (curr);
21981 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21982 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21983
21984 if (!aarch64_macro_fusion_p ())
21985 return false;
21986
21987 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21988 {
21989 /* We are trying to match:
21990 prev (mov) == (set (reg r0) (const_int imm16))
21991 curr (movk) == (set (zero_extract (reg r0)
21992 (const_int 16)
21993 (const_int 16))
21994 (const_int imm16_1)) */
21995
21996 set_dest = SET_DEST (curr_set);
21997
21998 if (GET_CODE (set_dest) == ZERO_EXTRACT
21999 && CONST_INT_P (SET_SRC (curr_set))
22000 && CONST_INT_P (SET_SRC (prev_set))
22001 && CONST_INT_P (XEXP (set_dest, 2))
22002 && INTVAL (XEXP (set_dest, 2)) == 16
22003 && REG_P (XEXP (set_dest, 0))
22004 && REG_P (SET_DEST (prev_set))
22005 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
22006 {
22007 return true;
22008 }
22009 }
22010
22011 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
22012 {
22013
22014 /* We're trying to match:
22015 prev (adrp) == (set (reg r1)
22016 (high (symbol_ref ("SYM"))))
22017 curr (add) == (set (reg r0)
22018 (lo_sum (reg r1)
22019 (symbol_ref ("SYM"))))
22020 Note that r0 need not necessarily be the same as r1, especially
22021 during pre-regalloc scheduling. */
22022
22023 if (satisfies_constraint_Ush (SET_SRC (prev_set))
22024 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22025 {
22026 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
22027 && REG_P (XEXP (SET_SRC (curr_set), 0))
22028 && REGNO (XEXP (SET_SRC (curr_set), 0))
22029 == REGNO (SET_DEST (prev_set))
22030 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
22031 XEXP (SET_SRC (curr_set), 1)))
22032 return true;
22033 }
22034 }
22035
22036 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
22037 {
22038
22039 /* We're trying to match:
22040 prev (movk) == (set (zero_extract (reg r0)
22041 (const_int 16)
22042 (const_int 32))
22043 (const_int imm16_1))
22044 curr (movk) == (set (zero_extract (reg r0)
22045 (const_int 16)
22046 (const_int 48))
22047 (const_int imm16_2)) */
22048
22049 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
22050 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
22051 && REG_P (XEXP (SET_DEST (prev_set), 0))
22052 && REG_P (XEXP (SET_DEST (curr_set), 0))
22053 && REGNO (XEXP (SET_DEST (prev_set), 0))
22054 == REGNO (XEXP (SET_DEST (curr_set), 0))
22055 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
22056 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
22057 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
22058 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
22059 && CONST_INT_P (SET_SRC (prev_set))
22060 && CONST_INT_P (SET_SRC (curr_set)))
22061 return true;
22062
22063 }
22064 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
22065 {
22066 /* We're trying to match:
22067 prev (adrp) == (set (reg r0)
22068 (high (symbol_ref ("SYM"))))
22069 curr (ldr) == (set (reg r1)
22070 (mem (lo_sum (reg r0)
22071 (symbol_ref ("SYM")))))
22072 or
22073 curr (ldr) == (set (reg r1)
22074 (zero_extend (mem
22075 (lo_sum (reg r0)
22076 (symbol_ref ("SYM")))))) */
22077 if (satisfies_constraint_Ush (SET_SRC (prev_set))
22078 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22079 {
22080 rtx curr_src = SET_SRC (curr_set);
22081
22082 if (GET_CODE (curr_src) == ZERO_EXTEND)
22083 curr_src = XEXP (curr_src, 0);
22084
22085 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
22086 && REG_P (XEXP (XEXP (curr_src, 0), 0))
22087 && REGNO (XEXP (XEXP (curr_src, 0), 0))
22088 == REGNO (SET_DEST (prev_set))
22089 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
22090 XEXP (SET_SRC (prev_set), 0)))
22091 return true;
22092 }
22093 }
22094
22095 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
22096 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
22097 && prev_set && curr_set && any_condjump_p (curr)
22098 && GET_CODE (SET_SRC (prev_set)) == COMPARE
22099 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
22100 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
22101 return true;
22102
22103 /* Fuse flag-setting ALU instructions and conditional branch. */
22104 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
22105 && any_condjump_p (curr))
22106 {
22107 unsigned int condreg1, condreg2;
22108 rtx cc_reg_1;
22109 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
22110 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
22111
22112 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
22113 && prev
22114 && modified_in_p (cc_reg_1, prev))
22115 {
22116 enum attr_type prev_type = get_attr_type (prev);
22117
22118 /* FIXME: this misses some instructions that ThunderX considers simple
22119 arithmetic; simple shifts are missed here. */
22120 if (prev_type == TYPE_ALUS_SREG
22121 || prev_type == TYPE_ALUS_IMM
22122 || prev_type == TYPE_LOGICS_REG
22123 || prev_type == TYPE_LOGICS_IMM)
22124 return true;
22125 }
22126 }
22127
22128 /* Fuse ALU instructions and CBZ/CBNZ. */
22129 if (prev_set
22130 && curr_set
22131 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
22132 && any_condjump_p (curr))
22133 {
22134 /* We're trying to match:
22135 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
22136 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
22137 (const_int 0))
22138 (label_ref ("SYM"))
22139 (pc)) */
22140 if (SET_DEST (curr_set) == (pc_rtx)
22141 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
22142 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
22143 && REG_P (SET_DEST (prev_set))
22144 && REGNO (SET_DEST (prev_set))
22145 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
22146 {
22147 /* Fuse ALU operations followed by a conditional branch instruction. */
22148 switch (get_attr_type (prev))
22149 {
22150 case TYPE_ALU_IMM:
22151 case TYPE_ALU_SREG:
22152 case TYPE_ADC_REG:
22153 case TYPE_ADC_IMM:
22154 case TYPE_ADCS_REG:
22155 case TYPE_ADCS_IMM:
22156 case TYPE_LOGIC_REG:
22157 case TYPE_LOGIC_IMM:
22158 case TYPE_CSEL:
22159 case TYPE_ADR:
22160 case TYPE_MOV_IMM:
22161 case TYPE_SHIFT_REG:
22162 case TYPE_SHIFT_IMM:
22163 case TYPE_BFM:
22164 case TYPE_RBIT:
22165 case TYPE_REV:
22166 case TYPE_EXTEND:
22167 return true;
22168
22169 default:;
22170 }
22171 }
22172 }
22173
22174 return false;
22175 }
22176
22177 /* Return true iff the instruction fusion described by OP is enabled. */
22178
22179 bool
22180 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
22181 {
22182 return (aarch64_tune_params.fusible_ops & op) != 0;
22183 }
22184
22185 /* If MEM is in the form [base+offset], extract the two parts of the
22186 address into BASE and OFFSET; otherwise clear BASE and OFFSET and
22187 return false. */
22188
22189 bool
22190 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
22191 {
22192 rtx addr;
22193
22194 gcc_assert (MEM_P (mem));
22195
22196 addr = XEXP (mem, 0);
22197
22198 if (REG_P (addr))
22199 {
22200 *base = addr;
22201 *offset = const0_rtx;
22202 return true;
22203 }
22204
22205 if (GET_CODE (addr) == PLUS
22206 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
22207 {
22208 *base = XEXP (addr, 0);
22209 *offset = XEXP (addr, 1);
22210 return true;
22211 }
22212
22213 *base = NULL_RTX;
22214 *offset = NULL_RTX;
22215
22216 return false;
22217 }
22218
22219 /* Types for scheduling fusion. */
22220 enum sched_fusion_type
22221 {
22222 SCHED_FUSION_NONE = 0,
22223 SCHED_FUSION_LD_SIGN_EXTEND,
22224 SCHED_FUSION_LD_ZERO_EXTEND,
22225 SCHED_FUSION_LD,
22226 SCHED_FUSION_ST,
22227 SCHED_FUSION_NUM
22228 };
22229
22230 /* If INSN is a load or store whose address is in the form [base+offset],
22231 extract the two parts into BASE and OFFSET. Return the scheduling
22232 fusion type of INSN. */
22233
22234 static enum sched_fusion_type
22235 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
22236 {
22237 rtx x, dest, src;
22238 enum sched_fusion_type fusion = SCHED_FUSION_LD;
22239
22240 gcc_assert (INSN_P (insn));
22241 x = PATTERN (insn);
22242 if (GET_CODE (x) != SET)
22243 return SCHED_FUSION_NONE;
22244
22245 src = SET_SRC (x);
22246 dest = SET_DEST (x);
22247
22248 machine_mode dest_mode = GET_MODE (dest);
22249
22250 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
22251 return SCHED_FUSION_NONE;
22252
22253 if (GET_CODE (src) == SIGN_EXTEND)
22254 {
22255 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
22256 src = XEXP (src, 0);
22257 if (!MEM_P (src) || GET_MODE (src) != SImode)
22258 return SCHED_FUSION_NONE;
22259 }
22260 else if (GET_CODE (src) == ZERO_EXTEND)
22261 {
22262 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
22263 src = XEXP (src, 0);
22264 if (!MEM_P (src) || GET_MODE (src) != SImode)
22265 return SCHED_FUSION_NONE;
22266 }
22267
22268 if (MEM_P (src) && REG_P (dest))
22269 extract_base_offset_in_addr (src, base, offset);
22270 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
22271 {
22272 fusion = SCHED_FUSION_ST;
22273 extract_base_offset_in_addr (dest, base, offset);
22274 }
22275 else
22276 return SCHED_FUSION_NONE;
22277
22278 if (*base == NULL_RTX || *offset == NULL_RTX)
22279 fusion = SCHED_FUSION_NONE;
22280
22281 return fusion;
22282 }
22283
22284 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
22285
22286 Currently we only support fusing ldr and str instructions, so FUSION_PRI
22287 and PRI are only calculated for these instructions. For other instructions,
22288 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
22289 types of instruction fusion can be added by returning different priorities.
22290
22291 It's important that irrelevant instructions get the largest FUSION_PRI. */
22292
22293 static void
22294 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
22295 int *fusion_pri, int *pri)
22296 {
22297 int tmp, off_val;
22298 rtx base, offset;
22299 enum sched_fusion_type fusion;
22300
22301 gcc_assert (INSN_P (insn));
22302
22303 tmp = max_pri - 1;
22304 fusion = fusion_load_store (insn, &base, &offset);
22305 if (fusion == SCHED_FUSION_NONE)
22306 {
22307 *pri = tmp;
22308 *fusion_pri = tmp;
22309 return;
22310 }
22311
22312 /* Set FUSION_PRI according to fusion type and base register. */
22313 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
22314
22315 /* Calculate PRI. */
22316 tmp /= 2;
22317
22318 /* The INSN with the smaller offset goes first. */
22319 off_val = (int)(INTVAL (offset));
22320 if (off_val >= 0)
22321 tmp -= (off_val & 0xfffff);
22322 else
22323 tmp += ((- off_val) & 0xfffff);
22324
22325 *pri = tmp;
22326 return;
22327 }
22328
22329 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
22330 Adjust priority of sha1h instructions so they are scheduled before
22331 other SHA1 instructions. */
22332
22333 static int
22334 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
22335 {
22336 rtx x = PATTERN (insn);
22337
22338 if (GET_CODE (x) == SET)
22339 {
22340 x = SET_SRC (x);
22341
22342 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
22343 return priority + 10;
22344 }
22345
22346 return priority;
22347 }
22348
22349 /* Given OPERANDS of consecutive load/store, check if we can merge
22350 them into ldp/stp. LOAD is true if they are load instructions.
22351 MODE is the mode of memory operands. */
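/* For example, "ldr x0, [x2]" followed by "ldr x1, [x2, 8]" can be
   merged into "ldp x0, x1, [x2]". */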
22352
22353 bool
22354 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
22355 machine_mode mode)
22356 {
22357 HOST_WIDE_INT offval_1, offval_2, msize;
22358 enum reg_class rclass_1, rclass_2;
22359 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
22360
22361 if (load)
22362 {
22363 mem_1 = operands[1];
22364 mem_2 = operands[3];
22365 reg_1 = operands[0];
22366 reg_2 = operands[2];
22367 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
22368 if (REGNO (reg_1) == REGNO (reg_2))
22369 return false;
22370 }
22371 else
22372 {
22373 mem_1 = operands[0];
22374 mem_2 = operands[2];
22375 reg_1 = operands[1];
22376 reg_2 = operands[3];
22377 }
22378
22379 /* The mems cannot be volatile. */
22380 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
22381 return false;
22382
22383 /* If we have SImode and slow unaligned ldp,
22384 check that the alignment is at least 8 bytes. */
22385 if (mode == SImode
22386 && (aarch64_tune_params.extra_tuning_flags
22387 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22388 && !optimize_size
22389 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
22390 return false;
22391
22392 /* Check if the addresses are in the form of [base+offset]. */
22393 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22394 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
22395 return false;
22396 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22397 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
22398 return false;
22399
22400 /* Check if the bases are the same. */
22401 if (!rtx_equal_p (base_1, base_2))
22402 return false;
22403
22404 /* The operands must be of the same size. */
22405 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
22406 GET_MODE_SIZE (GET_MODE (mem_2))));
22407
22408 offval_1 = INTVAL (offset_1);
22409 offval_2 = INTVAL (offset_2);
22410 /* We should only be trying this for fixed-sized modes. There is no
22411 SVE LDP/STP instruction. */
22412 msize = GET_MODE_SIZE (mode).to_constant ();
22413 /* Check if the offsets are consecutive. */
22414 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
22415 return false;
22416
22417 /* Check if the addresses are clobbered by load. */
22418 if (load)
22419 {
22420 if (reg_mentioned_p (reg_1, mem_1))
22421 return false;
22422
22423 /* In increasing order, the last load can clobber the address. */
22424 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
22425 return false;
22426 }
22427
22428 /* One of the memory accesses must be a mempair operand.
22429 If it is not the first one, they need to be swapped by the
22430 peephole. */
22431 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
22432 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
22433 return false;
22434
22435 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
22436 rclass_1 = FP_REGS;
22437 else
22438 rclass_1 = GENERAL_REGS;
22439
22440 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
22441 rclass_2 = FP_REGS;
22442 else
22443 rclass_2 = GENERAL_REGS;
22444
22445 /* Check if the registers are of the same class. */
22446 if (rclass_1 != rclass_2)
22447 return false;
22448
22449 return true;
22450 }
22451
22452 /* Given OPERANDS of consecutive load/store that can be merged,
22453 swap them if they are not in ascending order. */
22454 void
22455 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
22456 {
22457 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
22458 HOST_WIDE_INT offval_1, offval_2;
22459
22460 if (load)
22461 {
22462 mem_1 = operands[1];
22463 mem_2 = operands[3];
22464 }
22465 else
22466 {
22467 mem_1 = operands[0];
22468 mem_2 = operands[2];
22469 }
22470
22471 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22472 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22473
22474 offval_1 = INTVAL (offset_1);
22475 offval_2 = INTVAL (offset_2);
22476
22477 if (offval_1 > offval_2)
22478 {
22479 /* Irrespective of whether this is a load or a store,
22480 we do the same swap. */
22481 std::swap (operands[0], operands[2]);
22482 std::swap (operands[1], operands[3]);
22483 }
22484 }
22485
22486 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22487 comparison between the two. */
22488 int
22489 aarch64_host_wide_int_compare (const void *x, const void *y)
22490 {
22491 return wi::cmps (* ((const HOST_WIDE_INT *) x),
22492 * ((const HOST_WIDE_INT *) y));
22493 }
22494
22495 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
22496 other pointing to a REG rtx containing an offset, compare the offsets
22497 of the two pairs.
22498
22499 Return:
22500
22501 1 iff offset (X) > offset (Y)
22502 0 iff offset (X) == offset (Y)
22503 -1 iff offset (X) < offset (Y) */
22504 int
22505 aarch64_ldrstr_offset_compare (const void *x, const void *y)
22506 {
22507 const rtx * operands_1 = (const rtx *) x;
22508 const rtx * operands_2 = (const rtx *) y;
22509 rtx mem_1, mem_2, base, offset_1, offset_2;
22510
22511 if (MEM_P (operands_1[0]))
22512 mem_1 = operands_1[0];
22513 else
22514 mem_1 = operands_1[1];
22515
22516 if (MEM_P (operands_2[0]))
22517 mem_2 = operands_2[0];
22518 else
22519 mem_2 = operands_2[1];
22520
22521 /* Extract the offsets. */
22522 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22523 extract_base_offset_in_addr (mem_2, &base, &offset_2);
22524
22525 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22526
22527 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22528 }
22529
22530 /* Given OPERANDS of consecutive load/store, check if we can merge
22531 them into ldp/stp by adjusting the offset. LOAD is true if they
22532 are load instructions. MODE is the mode of memory operands.
22533
22534 Given the following consecutive stores:
22535
22536 str w1, [xb, 0x100]
22537 str w1, [xb, 0x104]
22538 str w1, [xb, 0x108]
22539 str w1, [xb, 0x10c]
22540
22541 Though the offsets are out of the range supported by stp, we can
22542 still pair them after adjusting the offset, like:
22543
22544 add scratch, xb, 0x100
22545 stp w1, w1, [scratch]
22546 stp w1, w1, [scratch, 0x8]
22547
22548 The peephole patterns detecting this opportunity should guarantee
22549 the scratch register is available. */
22550
22551 bool
22552 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
22553 machine_mode mode)
22554 {
22555 const int num_insns = 4;
22556 enum reg_class rclass;
22557 HOST_WIDE_INT offvals[num_insns], msize;
22558 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
22559
22560 if (load)
22561 {
22562 for (int i = 0; i < num_insns; i++)
22563 {
22564 reg[i] = operands[2 * i];
22565 mem[i] = operands[2 * i + 1];
22566
22567 gcc_assert (REG_P (reg[i]));
22568 }
22569
22570 /* Do not attempt to merge the loads if the loads clobber each other. */
22571 for (int i = 0; i < 8; i += 2)
22572 for (int j = i + 2; j < 8; j += 2)
22573 if (reg_overlap_mentioned_p (operands[i], operands[j]))
22574 return false;
22575 }
22576 else
22577 for (int i = 0; i < num_insns; i++)
22578 {
22579 mem[i] = operands[2 * i];
22580 reg[i] = operands[2 * i + 1];
22581 }
22582
22583 /* Skip if the memory operand is by itself valid for ldp/stp. */
22584 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
22585 return false;
22586
22587 for (int i = 0; i < num_insns; i++)
22588 {
22589 /* The mems cannot be volatile. */
22590 if (MEM_VOLATILE_P (mem[i]))
22591 return false;
22592
22593 /* Check if the addresses are in the form of [base+offset]. */
22594 extract_base_offset_in_addr (mem[i], base + i, offset + i);
22595 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22596 return false;
22597 }
22598
22599 /* Check if the registers are of the same class. */
22600 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22601 ? FP_REGS : GENERAL_REGS;
22602
22603 for (int i = 1; i < num_insns; i++)
22604 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22605 {
22606 if (rclass != FP_REGS)
22607 return false;
22608 }
22609 else
22610 {
22611 if (rclass != GENERAL_REGS)
22612 return false;
22613 }
22614
22615 /* Only the last register in the order in which they occur
22616 may be clobbered by the load. */
22617 if (rclass == GENERAL_REGS && load)
22618 for (int i = 0; i < num_insns - 1; i++)
22619 if (reg_mentioned_p (reg[i], mem[i]))
22620 return false;
22621
22622 /* Check if the bases are the same. */
22623 for (int i = 0; i < num_insns - 1; i++)
22624 if (!rtx_equal_p (base[i], base[i + 1]))
22625 return false;
22626
22627 for (int i = 0; i < num_insns; i++)
22628 offvals[i] = INTVAL (offset[i]);
22629
22630 msize = GET_MODE_SIZE (mode).to_constant ();
22631
22632 /* Check if the offsets can be put in the right order to do a ldp/stp. */
22633 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22634 aarch64_host_wide_int_compare);
22635
22636 if (!(offvals[1] == offvals[0] + msize
22637 && offvals[3] == offvals[2] + msize))
22638 return false;
22639
22640 /* Check that the offsets are within range of each other. The ldp/stp
22641 instructions have 7-bit immediate offsets, so use 0x80. */
22642 if (offvals[2] - offvals[0] >= msize * 0x80)
22643 return false;
22644
22645 /* The offsets must be aligned with respect to each other. */
22646 if (offvals[0] % msize != offvals[2] % msize)
22647 return false;
22648
22649 /* If we have SImode and slow unaligned ldp,
22650 check that the alignment is at least 8 bytes. */
22651 if (mode == SImode
22652 && (aarch64_tune_params.extra_tuning_flags
22653 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22654 && !optimize_size
22655 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
22656 return false;
22657
22658 return true;
22659 }
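/* As a worked example of the checks above (a sketch, assuming SImode, so
   msize == 4): take the four stores from the comment before this function,
   at offsets 0x100, 0x104, 0x108 and 0x10c from the same base.  After
   sorting, offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
   so each half can form a pair; the range check passes because
   0x108 - 0x100 == 8 < 4 * 0x80; and the offsets are mutually aligned
   because 0x100 % 4 == 0x108 % 4.  Provided the remaining conditions hold
   (non-volatile MEMs, equal bases, matching register classes, no clobbered
   address registers), the function returns true and the peephole may try
   the transformation.  */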
22660
22661 /* Given OPERANDS of consecutive load/store, this function pairs them
22662 into LDP/STP after adjusting the offset. It depends on the fact
22663 that the operands can be sorted so the offsets are correct for STP.
22664 MODE is the mode of memory operands. CODE is the rtl operator
22665 which should be applied to all memory operands; it is SIGN_EXTEND,
22666 ZERO_EXTEND or UNKNOWN. */
22667
22668 bool
22669 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
22670 machine_mode mode, RTX_CODE code)
22671 {
22672 rtx base, offset_1, offset_3, t1, t2;
22673 rtx mem_1, mem_2, mem_3, mem_4;
22674 rtx temp_operands[8];
22675 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22676 stp_off_upper_limit, stp_off_lower_limit, msize;
22677
22678 /* We make changes on a copy as we may still bail out. */
22679 for (int i = 0; i < 8; i ++)
22680 temp_operands[i] = operands[i];
22681
22682 /* Sort the operands. */
22683 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
22684
22685 /* Copy the memory operands so that if we have to bail for some
22686 reason the original addresses are unchanged. */
22687 if (load)
22688 {
22689 mem_1 = copy_rtx (temp_operands[1]);
22690 mem_2 = copy_rtx (temp_operands[3]);
22691 mem_3 = copy_rtx (temp_operands[5]);
22692 mem_4 = copy_rtx (temp_operands[7]);
22693 }
22694 else
22695 {
22696 mem_1 = copy_rtx (temp_operands[0]);
22697 mem_2 = copy_rtx (temp_operands[2]);
22698 mem_3 = copy_rtx (temp_operands[4]);
22699 mem_4 = copy_rtx (temp_operands[6]);
22700 gcc_assert (code == UNKNOWN);
22701 }
22702
22703 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22704 extract_base_offset_in_addr (mem_3, &base, &offset_3);
22705 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22706 && offset_3 != NULL_RTX);
22707
22708 /* Adjust offset so it can fit in LDP/STP instruction. */
22709 msize = GET_MODE_SIZE (mode).to_constant ();
22710 stp_off_upper_limit = msize * (0x40 - 1);
22711 stp_off_lower_limit = - msize * 0x40;
22712
22713 off_val_1 = INTVAL (offset_1);
22714 off_val_3 = INTVAL (offset_3);
22715
22716 /* The base offset is optimally halfway between the two STP/LDP offsets. */
22717 if (msize <= 4)
22718 base_off = (off_val_1 + off_val_3) / 2;
22719 else
22720 /* However, due to issues with negative LDP/STP offset generation for
22721 larger modes (DF, DI and vector modes), we must not use negative
22722 addresses smaller than 9 signed unadjusted bits can store. This
22723 provides the most range in this case. */
22724 base_off = off_val_1;
22725
22726 /* Adjust the base so that it is aligned with the addresses but still
22727 optimal. */
22728 if (base_off % msize != off_val_1 % msize)
22729 /* Fix the offset, bearing in mind we want to make it bigger not
22730 smaller. */
22731 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22732 else if (msize <= 4)
22733 /* The negative range of LDP/STP is one larger than the positive range. */
22734 base_off += msize;
22735
22736 /* Check if base offset is too big or too small. We can attempt to resolve
22737 this issue by setting it to the maximum value and seeing if the offsets
22738 still fit. */
22739 if (base_off >= 0x1000)
22740 {
22741 base_off = 0x1000 - 1;
22742 /* We must still make sure that the base offset is aligned with respect
22743 to the address. But it may not be made any bigger. */
22744 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22745 }
22746
22747 /* Likewise for the case where the base is too small. */
22748 if (base_off <= -0x1000)
22749 {
22750 base_off = -0x1000 + 1;
22751 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22752 }
22753
22754 /* Offset of the first STP/LDP. */
22755 new_off_1 = off_val_1 - base_off;
22756
22757 /* Offset of the second STP/LDP. */
22758 new_off_3 = off_val_3 - base_off;
22759
22760 /* The offsets must be within the range of the LDP/STP instructions. */
22761 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22762 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
22763 return false;
22764
22765 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22766 new_off_1), true);
22767 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22768 new_off_1 + msize), true);
22769 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22770 new_off_3), true);
22771 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22772 new_off_3 + msize), true);
22773
22774 if (!aarch64_mem_pair_operand (mem_1, mode)
22775 || !aarch64_mem_pair_operand (mem_3, mode))
22776 return false;
22777
22778 if (code == ZERO_EXTEND)
22779 {
22780 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22781 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22782 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22783 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22784 }
22785 else if (code == SIGN_EXTEND)
22786 {
22787 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22788 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22789 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22790 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22791 }
22792
22793 if (load)
22794 {
22795 operands[0] = temp_operands[0];
22796 operands[1] = mem_1;
22797 operands[2] = temp_operands[2];
22798 operands[3] = mem_2;
22799 operands[4] = temp_operands[4];
22800 operands[5] = mem_3;
22801 operands[6] = temp_operands[6];
22802 operands[7] = mem_4;
22803 }
22804 else
22805 {
22806 operands[0] = mem_1;
22807 operands[1] = temp_operands[1];
22808 operands[2] = mem_2;
22809 operands[3] = temp_operands[3];
22810 operands[4] = mem_3;
22811 operands[5] = temp_operands[5];
22812 operands[6] = mem_4;
22813 operands[7] = temp_operands[7];
22814 }
22815
22816 /* Emit adjusting instruction. */
22817 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
22818 /* Emit ldp/stp instructions. */
22819 t1 = gen_rtx_SET (operands[0], operands[1]);
22820 t2 = gen_rtx_SET (operands[2], operands[3]);
22821 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22822 t1 = gen_rtx_SET (operands[4], operands[5]);
22823 t2 = gen_rtx_SET (operands[6], operands[7]);
22824 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22825 return true;
22826 }
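/* Continuing the SImode sketch above: the sorted offsets 0x100 ... 0x10c
   give off_val_1 == 0x100 and off_val_3 == 0x108.  Since msize == 4,
   base_off starts as (0x100 + 0x108) / 2 == 0x104; it is already aligned
   with off_val_1, so the "negative range is one larger" adjustment bumps
   it to 0x108.  The new offsets new_off_1 == -8 and new_off_3 == 0 both
   lie within [-0x100, 0xfc], so the emitted sequence is roughly:

     add  scratch, base, #0x108
     stp  w, w, [scratch, #-8]
     stp  w, w, [scratch]  */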
22827
22828 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
22829 it isn't worth branching around empty masked ops (including masked
22830 stores). */
22831
22832 static bool
22833 aarch64_empty_mask_is_expensive (unsigned)
22834 {
22835 return false;
22836 }
22837
22838 /* Return 1 if pseudo register should be created and used to hold
22839 GOT address for PIC code. */
22840
22841 bool
22842 aarch64_use_pseudo_pic_reg (void)
22843 {
22844 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22845 }
22846
22847 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
22848
22849 static int
22850 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22851 {
22852 switch (XINT (x, 1))
22853 {
22854 case UNSPEC_GOTSMALLPIC:
22855 case UNSPEC_GOTSMALLPIC28K:
22856 case UNSPEC_GOTTINYPIC:
22857 return 0;
22858 default:
22859 break;
22860 }
22861
22862 return default_unspec_may_trap_p (x, flags);
22863 }
22864
22865
22866 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
22867 return the log2 of that value. Otherwise return -1. */
22868
22869 int
22870 aarch64_fpconst_pow_of_2 (rtx x)
22871 {
22872 const REAL_VALUE_TYPE *r;
22873
22874 if (!CONST_DOUBLE_P (x))
22875 return -1;
22876
22877 r = CONST_DOUBLE_REAL_VALUE (x);
22878
22879 if (REAL_VALUE_NEGATIVE (*r)
22880 || REAL_VALUE_ISNAN (*r)
22881 || REAL_VALUE_ISINF (*r)
22882 || !real_isinteger (r, DFmode))
22883 return -1;
22884
22885 return exact_log2 (real_to_integer (r));
22886 }
22887
22888 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
22889 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x == 1/2^n
22890 return n. Otherwise return -1. */
22891
22892 int
22893 aarch64_fpconst_pow2_recip (rtx x)
22894 {
22895 REAL_VALUE_TYPE r0;
22896
22897 if (!CONST_DOUBLE_P (x))
22898 return -1;
22899
22900 r0 = *CONST_DOUBLE_REAL_VALUE (x);
22901 if (exact_real_inverse (DFmode, &r0)
22902 && !REAL_VALUE_NEGATIVE (r0))
22903 {
22904 int ret = exact_log2 (real_to_integer (&r0));
22905 if (ret >= 1 && ret <= 32)
22906 return ret;
22907 }
22908 return -1;
22909 }
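/* For example, aarch64_fpconst_pow_of_2 returns 3 for a CONST_DOUBLE of
   8.0 but -1 for 0.5 or -4.0, while aarch64_fpconst_pow2_recip returns 3
   for 0.125 (== 1/2^3) but -1 for e.g. 0.75 or 2.0.  */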
22910
22911 /* If X is a vector of equal CONST_DOUBLE values and that value is
22912 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
22913
22914 int
22915 aarch64_vec_fpconst_pow_of_2 (rtx x)
22916 {
22917 int nelts;
22918 if (GET_CODE (x) != CONST_VECTOR
22919 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
22920 return -1;
22921
22922 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22923 return -1;
22924
22925 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22926 if (firstval <= 0)
22927 return -1;
22928
22929 for (int i = 1; i < nelts; i++)
22930 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22931 return -1;
22932
22933 return firstval;
22934 }
22935
22936 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22937 to float.
22938
22939 __fp16 always promotes through this hook.
22940 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22941 through the generic excess precision logic rather than here. */
22942
22943 static tree
22944 aarch64_promoted_type (const_tree t)
22945 {
22946 if (SCALAR_FLOAT_TYPE_P (t)
22947 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
22948 return float_type_node;
22949
22950 return NULL_TREE;
22951 }
22952
22953 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22954
22955 static bool
22956 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22957 optimization_type opt_type)
22958 {
22959 switch (op)
22960 {
22961 case rsqrt_optab:
22962 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22963
22964 default:
22965 return true;
22966 }
22967 }
22968
22969 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22970
22971 static unsigned int
22972 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22973 int *offset)
22974 {
22975 /* Polynomial invariant 1 == (VG / 2) - 1. */
22976 gcc_assert (i == 1);
22977 *factor = 2;
22978 *offset = 1;
22979 return AARCH64_DWARF_VG;
22980 }
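/* For instance, on a 256-bit SVE implementation the VG pseudo register
   reads 4 (the vector length in 64-bit granules), so the indeterminate
   evaluates to 4 / 2 - 1 == 1 and a poly_int size of 16 + 16x bytes
   unwinds to 32 bytes, as expected.  */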
22981
22982 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22983 if MODE is HFmode, and punt to the generic implementation otherwise. */
22984
22985 static bool
22986 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22987 {
22988 return (mode == HFmode
22989 ? true
22990 : default_libgcc_floating_mode_supported_p (mode));
22991 }
22992
22993 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22994 if MODE is HFmode, and punt to the generic implementation otherwise. */
22995
22996 static bool
22997 aarch64_scalar_mode_supported_p (scalar_mode mode)
22998 {
22999 return (mode == HFmode
23000 ? true
23001 : default_scalar_mode_supported_p (mode));
23002 }
23003
23004 /* Set the value of FLT_EVAL_METHOD.
23005 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
23006
23007 0: evaluate all operations and constants, whose semantic type has at
23008 most the range and precision of type float, to the range and
23009 precision of float; evaluate all other operations and constants to
23010 the range and precision of the semantic type;
23011
23012 N, where _FloatN is a supported interchange floating type
23013 evaluate all operations and constants, whose semantic type has at
23014 most the range and precision of _FloatN type, to the range and
23015 precision of the _FloatN type; evaluate all other operations and
23016 constants to the range and precision of the semantic type;
23017
23018 If we have the ARMv8.2-A extensions then we support _Float16 in native
23019 precision, so we should set this to 16. Otherwise, we support the type,
23020 but want to evaluate expressions in float precision, so set this to
23021 0. */
23022
23023 static enum flt_eval_method
23024 aarch64_excess_precision (enum excess_precision_type type)
23025 {
23026 switch (type)
23027 {
23028 case EXCESS_PRECISION_TYPE_FAST:
23029 case EXCESS_PRECISION_TYPE_STANDARD:
23030 /* We can calculate either in 16-bit range and precision or
23031 32-bit range and precision. Make that decision based on whether
23032 we have native support for the ARMv8.2-A 16-bit floating-point
23033 instructions or not. */
23034 return (TARGET_FP_F16INST
23035 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
23036 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
23037 case EXCESS_PRECISION_TYPE_IMPLICIT:
23038 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
23039 default:
23040 gcc_unreachable ();
23041 }
23042 return FLT_EVAL_METHOD_UNPREDICTABLE;
23043 }
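/* As an illustration (a sketch, not tied to a particular CPU): given

     _Float16 a, b, c;
     float r = a * b + c;

   the intermediate result is computed in _Float16 when TARGET_FP_F16INST
   is available (FLT_EVAL_METHOD == 16), but is promoted to and computed
   in float otherwise (FLT_EVAL_METHOD == 0).  */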
23044
23045 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
23046 scheduled for speculative execution. Reject the long-running division
23047 and square-root instructions. */
23048
23049 static bool
23050 aarch64_sched_can_speculate_insn (rtx_insn *insn)
23051 {
23052 switch (get_attr_type (insn))
23053 {
23054 case TYPE_SDIV:
23055 case TYPE_UDIV:
23056 case TYPE_FDIVS:
23057 case TYPE_FDIVD:
23058 case TYPE_FSQRTS:
23059 case TYPE_FSQRTD:
23060 case TYPE_NEON_FP_SQRT_S:
23061 case TYPE_NEON_FP_SQRT_D:
23062 case TYPE_NEON_FP_SQRT_S_Q:
23063 case TYPE_NEON_FP_SQRT_D_Q:
23064 case TYPE_NEON_FP_DIV_S:
23065 case TYPE_NEON_FP_DIV_D:
23066 case TYPE_NEON_FP_DIV_S_Q:
23067 case TYPE_NEON_FP_DIV_D_Q:
23068 return false;
23069 default:
23070 return true;
23071 }
23072 }
23073
23074 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
23075
23076 static int
23077 aarch64_compute_pressure_classes (reg_class *classes)
23078 {
23079 int i = 0;
23080 classes[i++] = GENERAL_REGS;
23081 classes[i++] = FP_REGS;
23082 /* PR_REGS isn't a useful pressure class because many predicate pseudo
23083 registers need to go in PR_LO_REGS at some point during their
23084 lifetime. Splitting it into two halves has the effect of making
23085 all predicates count against PR_LO_REGS, so that we try whenever
23086 possible to restrict the number of live predicates to 8. This
23087 greatly reduces the amount of spilling in certain loops. */
23088 classes[i++] = PR_LO_REGS;
23089 classes[i++] = PR_HI_REGS;
23090 return i;
23091 }
23092
23093 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
23094
23095 static bool
23096 aarch64_can_change_mode_class (machine_mode from,
23097 machine_mode to, reg_class_t)
23098 {
23099 unsigned int from_flags = aarch64_classify_vector_mode (from);
23100 unsigned int to_flags = aarch64_classify_vector_mode (to);
23101
23102 bool from_sve_p = (from_flags & VEC_ANY_SVE);
23103 bool to_sve_p = (to_flags & VEC_ANY_SVE);
23104
23105 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
23106 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
23107
23108 bool from_pred_p = (from_flags & VEC_SVE_PRED);
23109 bool to_pred_p = (to_flags & VEC_SVE_PRED);
23110
23111 /* Don't allow changes between predicate modes and other modes.
23112 Only predicate registers can hold predicate modes and only
23113 non-predicate registers can hold non-predicate modes, so any
23114 attempt to mix them would require a round trip through memory. */
23115 if (from_pred_p != to_pred_p)
23116 return false;
23117
23118 /* Don't allow changes between partial SVE modes and other modes.
23119 The contents of partial SVE modes are distributed evenly across
23120 the register, whereas GCC expects them to be clustered together. */
23121 if (from_partial_sve_p != to_partial_sve_p)
23122 return false;
23123
23124 /* Similarly reject changes between partial SVE modes that have
23125 different patterns of significant and insignificant bits. */
23126 if (from_partial_sve_p
23127 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
23128 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
23129 return false;
23130
23131 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
23132 {
23133 /* Don't allow changes between SVE modes and other modes that might
23134 be bigger than 128 bits. In particular, OImode, CImode and XImode
23135 divide into 128-bit quantities while SVE modes divide into
23136 BITS_PER_SVE_VECTOR quantities. */
23137 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
23138 return false;
23139 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
23140 return false;
23141 }
23142
23143 if (BYTES_BIG_ENDIAN)
23144 {
23145 /* Don't allow changes between SVE data modes and non-SVE modes.
23146 See the comment at the head of aarch64-sve.md for details. */
23147 if (from_sve_p != to_sve_p)
23148 return false;
23149
23150 /* Don't allow changes in element size: lane 0 of the new vector
23151 would not then be lane 0 of the old vector. See the comment
23152 above aarch64_maybe_expand_sve_subreg_move for a more detailed
23153 description.
23154
23155 In the worst case, this forces a register to be spilled in
23156 one mode and reloaded in the other, which handles the
23157 endianness correctly. */
23158 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
23159 return false;
23160 }
23161 return true;
23162 }
23163
23164 /* Implement TARGET_EARLY_REMAT_MODES. */
23165
23166 static void
23167 aarch64_select_early_remat_modes (sbitmap modes)
23168 {
23169 /* SVE values are not normally live across a call, so it should be
23170 worth doing early rematerialization even in VL-specific mode. */
23171 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
23172 if (aarch64_sve_mode_p ((machine_mode) i))
23173 bitmap_set_bit (modes, i);
23174 }
23175
23176 /* Override the default target speculation_safe_value. */
23177 static rtx
23178 aarch64_speculation_safe_value (machine_mode mode,
23179 rtx result, rtx val, rtx failval)
23180 {
23181 /* Maybe we should warn if falling back to hard barriers. They are
23182 likely to be noticeably more expensive than the alternative below. */
23183 if (!aarch64_track_speculation)
23184 return default_speculation_safe_value (mode, result, val, failval);
23185
23186 if (!REG_P (val))
23187 val = copy_to_mode_reg (mode, val);
23188
23189 if (!aarch64_reg_or_zero (failval, mode))
23190 failval = copy_to_mode_reg (mode, failval);
23191
23192 emit_insn (gen_despeculate_copy (mode, result, val, failval));
23193 return result;
23194 }
23195
23196 /* Implement TARGET_ESTIMATED_POLY_VALUE.
23197 Look into the tuning structure for an estimate.
23198 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
23199 Advanced SIMD 128 bits. */
23200
23201 static HOST_WIDE_INT
23202 aarch64_estimated_poly_value (poly_int64 val)
23203 {
23204 enum aarch64_sve_vector_bits_enum width_source
23205 = aarch64_tune_params.sve_width;
23206
23207 /* If we still don't have an estimate, use the default. */
23208 if (width_source == SVE_SCALABLE)
23209 return default_estimated_poly_value (val);
23210
23211 HOST_WIDE_INT over_128 = width_source - 128;
23212 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
23213 }
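/* For example, if the tuning structure sets sve_width to SVE_256, a
   poly_int64 of 16 + 16x (the number of bytes in an SVE vector) is
   estimated as 16 + 16 * (256 - 128) / 128 == 32 bytes, i.e. one extra
   VQ chunk over the initial 128 bits.  */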
23214
23215
23216 /* Return true for types that could be supported as SIMD return or
23217 argument types. */
23218
23219 static bool
23220 supported_simd_type (tree t)
23221 {
23222 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
23223 {
23224 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
23225 return s == 1 || s == 2 || s == 4 || s == 8;
23226 }
23227 return false;
23228 }
23229
23230 /* Return true for types that currently are supported as SIMD return
23231 or argument types. */
23232
23233 static bool
23234 currently_supported_simd_type (tree t, tree b)
23235 {
23236 if (COMPLEX_FLOAT_TYPE_P (t))
23237 return false;
23238
23239 if (TYPE_SIZE (t) != TYPE_SIZE (b))
23240 return false;
23241
23242 return supported_simd_type (t);
23243 }
23244
23245 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
23246
23247 static int
23248 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
23249 struct cgraph_simd_clone *clonei,
23250 tree base_type, int num)
23251 {
23252 tree t, ret_type, arg_type;
23253 unsigned int elt_bits, count;
23254 unsigned HOST_WIDE_INT const_simdlen;
23255 poly_uint64 vec_bits;
23256
23257 if (!TARGET_SIMD)
23258 return 0;
23259
23260 /* For now, SVE simd clones won't produce an illegal simdlen, so only check
23261 const simdlens here. */
23262 if (maybe_ne (clonei->simdlen, 0U)
23263 && clonei->simdlen.is_constant (&const_simdlen)
23264 && (const_simdlen < 2
23265 || const_simdlen > 1024
23266 || (const_simdlen & (const_simdlen - 1)) != 0))
23267 {
23268 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23269 "unsupported simdlen %wd", const_simdlen);
23270 return 0;
23271 }
23272
23273 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
23274 if (TREE_CODE (ret_type) != VOID_TYPE
23275 && !currently_supported_simd_type (ret_type, base_type))
23276 {
23277 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
23278 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23279 "GCC does not currently support mixed size types "
23280 "for %<simd%> functions");
23281 else if (supported_simd_type (ret_type))
23282 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23283 "GCC does not currently support return type %qT "
23284 "for %<simd%> functions", ret_type);
23285 else
23286 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23287 "unsupported return type %qT for %<simd%> functions",
23288 ret_type);
23289 return 0;
23290 }
23291
23292 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
23293 {
23294 arg_type = TREE_TYPE (t);
23295
23296 if (!currently_supported_simd_type (arg_type, base_type))
23297 {
23298 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
23299 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23300 "GCC does not currently support mixed size types "
23301 "for %<simd%> functions");
23302 else
23303 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23304 "GCC does not currently support argument type %qT "
23305 "for %<simd%> functions", arg_type);
23306 return 0;
23307 }
23308 }
23309
23310 clonei->vecsize_mangle = 'n';
23311 clonei->mask_mode = VOIDmode;
23312 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
23313 if (known_eq (clonei->simdlen, 0U))
23314 {
23315 count = 2;
23316 vec_bits = (num == 0 ? 64 : 128);
23317 clonei->simdlen = exact_div (vec_bits, elt_bits);
23318 }
23319 else
23320 {
23321 count = 1;
23322 vec_bits = clonei->simdlen * elt_bits;
23323 /* For now, SVE simd clones won't produce an illegal simdlen, so only check
23324 const simdlens here. */
23325 if (clonei->simdlen.is_constant (&const_simdlen)
23326 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
23327 {
23328 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23329 "GCC does not currently support simdlen %wd for type %qT",
23330 const_simdlen, base_type);
23331 return 0;
23332 }
23333 }
23334 clonei->vecsize_int = vec_bits;
23335 clonei->vecsize_float = vec_bits;
23336 return count;
23337 }
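/* A sketch of the behaviour above: for

     #pragma omp declare simd
     float f (float x);

   base_type is float, so elt_bits == 32.  With no explicit simdlen the
   function returns a count of 2: one Advanced SIMD clone using 64-bit
   vectors (simdlen 2) and one using 128-bit vectors (simdlen 4), both
   with the 'n' mangling letter.  An explicit simdlen(8) would be
   rejected here, since 8 * 32 bits is neither 64 nor 128.  */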
23338
23339 /* Implement TARGET_SIMD_CLONE_ADJUST. */
23340
23341 static void
23342 aarch64_simd_clone_adjust (struct cgraph_node *node)
23343 {
23344 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
23345 use the correct ABI. */
23346
23347 tree t = TREE_TYPE (node->decl);
23348 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
23349 TYPE_ATTRIBUTES (t));
23350 }
23351
23352 /* Implement TARGET_SIMD_CLONE_USABLE. */
23353
23354 static int
23355 aarch64_simd_clone_usable (struct cgraph_node *node)
23356 {
23357 switch (node->simdclone->vecsize_mangle)
23358 {
23359 case 'n':
23360 if (!TARGET_SIMD)
23361 return -1;
23362 return 0;
23363 default:
23364 gcc_unreachable ();
23365 }
23366 }
23367
23368 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
23369
23370 static int
23371 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
23372 {
23373 auto check_attr = [&](const char *name) {
23374 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
23375 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
23376 if (!attr1 && !attr2)
23377 return true;
23378
23379 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
23380 };
23381
23382 if (!check_attr ("aarch64_vector_pcs"))
23383 return 0;
23384 if (!check_attr ("Advanced SIMD type"))
23385 return 0;
23386 return 1;
23387 }
23388
23389 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
23390
23391 static const char *
23392 aarch64_get_multilib_abi_name (void)
23393 {
23394 if (TARGET_BIG_END)
23395 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
23396 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
23397 }
23398
23399 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
23400 global variable based guard, use the default; otherwise
23401 return a null tree. */
23402 static tree
23403 aarch64_stack_protect_guard (void)
23404 {
23405 if (aarch64_stack_protector_guard == SSP_GLOBAL)
23406 return default_stack_protect_guard ();
23407
23408 return NULL_TREE;
23409 }
23410
23411 /* Return the diagnostic message string if conversion from FROMTYPE to
23412 TOTYPE is not allowed, NULL otherwise. */
23413
23414 static const char *
23415 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
23416 {
23417 if (element_mode (fromtype) != element_mode (totype))
23418 {
23419 /* Do not allow conversions to/from BFmode scalar types. */
23420 if (TYPE_MODE (fromtype) == BFmode)
23421 return N_("invalid conversion from type %<bfloat16_t%>");
23422 if (TYPE_MODE (totype) == BFmode)
23423 return N_("invalid conversion to type %<bfloat16_t%>");
23424 }
23425
23426 /* Conversion allowed. */
23427 return NULL;
23428 }
23429
23430 /* Return the diagnostic message string if the unary operation OP is
23431 not permitted on TYPE, NULL otherwise. */
23432
23433 static const char *
23434 aarch64_invalid_unary_op (int op, const_tree type)
23435 {
23436 /* Reject all single-operand operations on BFmode except for &. */
23437 if (element_mode (type) == BFmode && op != ADDR_EXPR)
23438 return N_("operation not permitted on type %<bfloat16_t%>");
23439
23440 /* Operation allowed. */
23441 return NULL;
23442 }
23443
23444 /* Return the diagnostic message string if the binary operation OP is
23445 not permitted on TYPE1 and TYPE2, NULL otherwise. */
23446
23447 static const char *
23448 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
23449 const_tree type2)
23450 {
23451 /* Reject all 2-operand operations on BFmode. */
23452 if (element_mode (type1) == BFmode
23453 || element_mode (type2) == BFmode)
23454 return N_("operation not permitted on type %<bfloat16_t%>");
23455
23456 if (VECTOR_TYPE_P (type1)
23457 && VECTOR_TYPE_P (type2)
23458 && !TYPE_INDIVISIBLE_P (type1)
23459 && !TYPE_INDIVISIBLE_P (type2)
23460 && (aarch64_sve::builtin_type_p (type1)
23461 != aarch64_sve::builtin_type_p (type2)))
23462 return N_("cannot combine GNU and SVE vectors in a binary operation");
23463
23464 /* Operation allowed. */
23465 return NULL;
23466 }
23467
23468 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
23469 compiler that we automatically ignore the top byte of our pointers, which
23470 allows using -fsanitize=hwaddress. */
23471 bool
23472 aarch64_can_tag_addresses ()
23473 {
23474 return !TARGET_ILP32;
23475 }
23476
23477 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
23478 section at the end if needed. */
23479 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
23480 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
23481 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
23482 void
23483 aarch64_file_end_indicate_exec_stack ()
23484 {
23485 file_end_indicate_exec_stack ();
23486
23487 unsigned feature_1_and = 0;
23488 if (aarch64_bti_enabled ())
23489 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23490
23491 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23492 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23493
23494 if (feature_1_and)
23495 {
23496 /* Generate .note.gnu.property section. */
23497 switch_to_section (get_section (".note.gnu.property",
23498 SECTION_NOTYPE, NULL));
23499
23500 /* PT_NOTE header: namesz, descsz, type.
23501 namesz = 4 ("GNU\0")
23502 descsz = 16 (Size of the program property array)
23503 [(12 + padding) * Number of array elements]
23504 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
23505 assemble_align (POINTER_SIZE);
23506 assemble_integer (GEN_INT (4), 4, 32, 1);
23507 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23508 assemble_integer (GEN_INT (5), 4, 32, 1);
23509
23510 /* PT_NOTE name. */
23511 assemble_string ("GNU", 4);
23512
23513 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23514 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23515 datasz = 4
23516 data = feature_1_and. */
23517 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23518 assemble_integer (GEN_INT (4), 4, 32, 1);
23519 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23520
23521 /* Pad the size of the note to the required alignment. */
23522 assemble_align (POINTER_SIZE);
23523 }
23524 }
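/* On an LP64 target with both BTI and PAC enabled, the note emitted
   above corresponds roughly to the following (the exact directives
   depend on the assembler):

     .section .note.gnu.property
     .align  3
     .word   4               // namesz
     .word   16              // descsz == ROUND_UP (12, 8)
     .word   5               // NT_GNU_PROPERTY_TYPE_0
     .string "GNU"
     .word   0xc0000000      // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word   4               // datasz
     .word   3               // BTI | PAC
     .align  3  */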
23525 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23526 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23527 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
23528
23529 /* Helper function for straight line speculation.
23530 Return what barrier should be emitted for straight line speculation
23531 mitigation.
23532 When not mitigating against straight line speculation this function returns
23533 an empty string.
23534 When mitigating against straight line speculation, use:
23535 * SB when the v8.5-A SB extension is enabled.
23536 * DSB+ISB otherwise. */
23537 const char *
23538 aarch64_sls_barrier (int mitigation_required)
23539 {
23540 return mitigation_required
23541 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23542 : "";
23543 }
23544
23545 static GTY (()) tree aarch64_sls_shared_thunks[30];
23546 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23547 const char *indirect_symbol_names[30] = {
23548 "__call_indirect_x0",
23549 "__call_indirect_x1",
23550 "__call_indirect_x2",
23551 "__call_indirect_x3",
23552 "__call_indirect_x4",
23553 "__call_indirect_x5",
23554 "__call_indirect_x6",
23555 "__call_indirect_x7",
23556 "__call_indirect_x8",
23557 "__call_indirect_x9",
23558 "__call_indirect_x10",
23559 "__call_indirect_x11",
23560 "__call_indirect_x12",
23561 "__call_indirect_x13",
23562 "__call_indirect_x14",
23563 "__call_indirect_x15",
23564 "", /* "__call_indirect_x16", */
23565 "", /* "__call_indirect_x17", */
23566 "__call_indirect_x18",
23567 "__call_indirect_x19",
23568 "__call_indirect_x20",
23569 "__call_indirect_x21",
23570 "__call_indirect_x22",
23571 "__call_indirect_x23",
23572 "__call_indirect_x24",
23573 "__call_indirect_x25",
23574 "__call_indirect_x26",
23575 "__call_indirect_x27",
23576 "__call_indirect_x28",
23577 "__call_indirect_x29",
23578 };
23579
23580 /* Function to create a BLR thunk. This thunk is used to mitigate straight
23581 line speculation. Instead of a simple BLR that can be speculated past,
23582 we emit a BL to this thunk, and this thunk contains a BR to the relevant
23583 register. These thunks have the relevant speculation barriers put after
23584 their indirect branch so that speculation is blocked.
23585
23586 We use such a thunk so the speculation barriers are kept off the
23587 architecturally executed path in order to reduce the performance overhead.
23588
23589 When optimizing for size we use stubs shared by the linked object.
23590 When optimizing for performance we emit stubs for each function in the hope
23591 that the branch predictor can better train on jumps specific for a given
23592 function. */
23593 rtx
23594 aarch64_sls_create_blr_label (int regnum)
23595 {
23596 gcc_assert (STUB_REGNUM_P (regnum));
23597 if (optimize_function_for_size_p (cfun))
23598 {
23599 /* For the thunks shared between different functions in this compilation
23600 unit we use a named symbol -- this is just for users to more easily
23601 understand the generated assembly. */
23602 aarch64_sls_shared_thunks_needed = true;
23603 const char *thunk_name = indirect_symbol_names[regnum];
23604 if (aarch64_sls_shared_thunks[regnum] == NULL)
23605 {
23606 /* Build a decl representing this function stub and record it for
23607 later. We build a decl here so we can use the GCC machinery for
23608 handling sections automatically (through `get_named_section` and
23609 `make_decl_one_only`). That saves us a lot of trouble handling
23610 the specifics of different output file formats. */
23611 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23612 get_identifier (thunk_name),
23613 build_function_type_list (void_type_node,
23614 NULL_TREE));
23615 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23616 NULL_TREE, void_type_node);
23617 TREE_PUBLIC (decl) = 1;
23618 TREE_STATIC (decl) = 1;
23619 DECL_IGNORED_P (decl) = 1;
23620 DECL_ARTIFICIAL (decl) = 1;
23621 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23622 resolve_unique_section (decl, 0, false);
23623 aarch64_sls_shared_thunks[regnum] = decl;
23624 }
23625
23626 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23627 }
23628
23629 if (cfun->machine->call_via[regnum] == NULL)
23630 cfun->machine->call_via[regnum]
23631 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23632 return cfun->machine->call_via[regnum];
23633 }
23634
23635 /* Helper function for aarch64_sls_emit_blr_function_thunks and
23636 aarch64_sls_emit_shared_blr_thunks below. */
23637 static void
23638 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23639 {
23640 /* Save in x16 and branch to that function so this transformation does
23641 not prevent jumping to `BTI c` instructions. */
23642 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23643 asm_fprintf (out_file, "\tbr\tx16\n");
23644 }
23645
23646 /* Emit all BLR stubs for this particular function.
23647 Here we emit all the BLR stubs needed for the current function. Since we
23648 emit these stubs in a consecutive block we know there will be no speculation
23649 gadgets between each stub, and hence we only emit a speculation barrier at
23650 the end of the stub sequences.
23651
23652 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
23653 void
23654 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23655 {
23656 if (! aarch64_harden_sls_blr_p ())
23657 return;
23658
23659 bool any_functions_emitted = false;
23660 /* We must save and restore the current function section since this assembly
23661 is emitted at the end of the function. This means it can be emitted *just
23662 after* the cold section of a function. That cold part would be emitted in
23663 a different section. That switch would trigger a `.cfi_endproc` directive
23664 to be emitted in the original section and a `.cfi_startproc` directive to
23665 be emitted in the new section. Switching to the original section without
23666 restoring would mean that the `.cfi_endproc` emitted as a function ends
23667 would happen in a different section -- leaving an unmatched
23668 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23669 in the standard text section. */
23670 section *save_text_section = in_section;
23671 switch_to_section (function_section (current_function_decl));
23672 for (int regnum = 0; regnum < 30; ++regnum)
23673 {
23674 rtx specu_label = cfun->machine->call_via[regnum];
23675 if (specu_label == NULL)
23676 continue;
23677
23678 targetm.asm_out.print_operand (out_file, specu_label, 0);
23679 asm_fprintf (out_file, ":\n");
23680 aarch64_sls_emit_function_stub (out_file, regnum);
23681 any_functions_emitted = true;
23682 }
23683 if (any_functions_emitted)
23684 /* Can use the SB if need be here, since this stub will only be used
23685 by the current function, and hence for the current target. */
23686 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23687 switch_to_section (save_text_section);
23688 }
23689
23690 /* Emit shared BLR stubs for the current compilation unit.
23691 Over the course of compiling this unit we may have converted some BLR
23692 instructions to a BL to a shared stub function. This is where we emit those
23693 stub functions.
23694 This function is for the stubs shared between different functions in this
23695 compilation unit. We share when optimizing for size instead of speed.
23696
23697 This function is called through the TARGET_ASM_FILE_END hook. */
23698 void
23699 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23700 {
23701 if (! aarch64_sls_shared_thunks_needed)
23702 return;
23703
23704 for (int regnum = 0; regnum < 30; ++regnum)
23705 {
23706 tree decl = aarch64_sls_shared_thunks[regnum];
23707 if (!decl)
23708 continue;
23709
23710 const char *name = indirect_symbol_names[regnum];
23711 switch_to_section (get_named_section (decl, NULL, 0));
23712 ASM_OUTPUT_ALIGN (out_file, 2);
23713 targetm.asm_out.globalize_label (out_file, name);
23714 /* Only emits if the compiler is configured for an assembler that can
23715 handle visibility directives. */
23716 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23717 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23718 ASM_OUTPUT_LABEL (out_file, name);
23719 aarch64_sls_emit_function_stub (out_file, regnum);
23720 /* Use the most conservative target to ensure it can always be used by any
23721 function in the translation unit. */
23722 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23723 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23724 }
23725 }
23726
23727 /* Implement TARGET_ASM_FILE_END. */
23728 void
23729 aarch64_asm_file_end ()
23730 {
23731 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23732 /* Since this function will be called for the ASM_FILE_END hook, we ensure
23733 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23734 for FreeBSD) still gets called. */
23735 #ifdef TARGET_ASM_FILE_END
23736 TARGET_ASM_FILE_END ();
23737 #endif
23738 }
23739
23740 const char *
23741 aarch64_indirect_call_asm (rtx addr)
23742 {
23743 gcc_assert (REG_P (addr));
23744 if (aarch64_harden_sls_blr_p ())
23745 {
23746 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23747 output_asm_insn ("bl\t%0", &stub_label);
23748 }
23749 else
23750 output_asm_insn ("blr\t%0", &addr);
23751 return "";
23752 }
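/* Putting the pieces together (a sketch): with -mharden-sls=blr, an
   indirect call "blr x1" is instead emitted as "bl __call_indirect_x1"
   (or a BL to a per-function local label when not optimizing for size),
   where the stub consists of

     mov  x16, x1
     br   x16

   followed by a suitable speculation barrier.  */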
23753
23754 /* Target-specific selftests. */
23755
23756 #if CHECKING_P
23757
23758 namespace selftest {
23759
23760 /* Selftest for the RTL loader.
23761 Verify that the RTL loader copes with a dump from
23762 print_rtx_function. This is essentially just a test that class
23763 function_reader can handle a real dump, but it also verifies
23764 that lookup_reg_by_dump_name correctly handles hard regs.
23765 The presence of hard reg names in the dump means that the test is
23766 target-specific, hence it is in this file. */
23767
23768 static void
23769 aarch64_test_loading_full_dump ()
23770 {
23771 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23772
23773 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23774
23775 rtx_insn *insn_1 = get_insn_by_uid (1);
23776 ASSERT_EQ (NOTE, GET_CODE (insn_1));
23777
23778 rtx_insn *insn_15 = get_insn_by_uid (15);
23779 ASSERT_EQ (INSN, GET_CODE (insn_15));
23780 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23781
23782 /* Verify crtl->return_rtx. */
23783 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23784 ASSERT_EQ (0, REGNO (crtl->return_rtx));
23785 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23786 }
23787
23788 /* Run all target-specific selftests. */
23789
23790 static void
23791 aarch64_run_selftests (void)
23792 {
23793 aarch64_test_loading_full_dump ();
23794 }
23795
23796 } // namespace selftest
23797
23798 #endif /* #if CHECKING_P */
23799
23800 #undef TARGET_STACK_PROTECT_GUARD
23801 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23802
23803 #undef TARGET_ADDRESS_COST
23804 #define TARGET_ADDRESS_COST aarch64_address_cost
23805
23806 /* This hook determines whether unnamed bitfields affect the alignment
23807 of the containing structure. The hook returns true if the structure
23808 should inherit the alignment requirements of an unnamed bitfield's
23809 type. */
23810 #undef TARGET_ALIGN_ANON_BITFIELD
23811 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23812
23813 #undef TARGET_ASM_ALIGNED_DI_OP
23814 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23815
23816 #undef TARGET_ASM_ALIGNED_HI_OP
23817 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23818
23819 #undef TARGET_ASM_ALIGNED_SI_OP
23820 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23821
23822 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23823 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23824 hook_bool_const_tree_hwi_hwi_const_tree_true
23825
23826 #undef TARGET_ASM_FILE_START
23827 #define TARGET_ASM_FILE_START aarch64_start_file
23828
23829 #undef TARGET_ASM_OUTPUT_MI_THUNK
23830 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23831
23832 #undef TARGET_ASM_SELECT_RTX_SECTION
23833 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23834
23835 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23836 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23837
23838 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23839 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23840
23841 #undef TARGET_BUILD_BUILTIN_VA_LIST
23842 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23843
23844 #undef TARGET_CALLEE_COPIES
23845 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
23846
23847 #undef TARGET_CAN_ELIMINATE
23848 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23849
23850 #undef TARGET_CAN_INLINE_P
23851 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
23852
23853 #undef TARGET_CANNOT_FORCE_CONST_MEM
23854 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23855
23856 #undef TARGET_CASE_VALUES_THRESHOLD
23857 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23858
23859 #undef TARGET_CONDITIONAL_REGISTER_USAGE
23860 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23861
23862 #undef TARGET_MEMBER_TYPE_FORCES_BLK
23863 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23864
23865 /* Only the least significant bit is used for initialization guard
23866 variables. */
23867 #undef TARGET_CXX_GUARD_MASK_BIT
23868 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23869
23870 #undef TARGET_C_MODE_FOR_SUFFIX
23871 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23872
23873 #ifdef TARGET_BIG_ENDIAN_DEFAULT
23874 #undef TARGET_DEFAULT_TARGET_FLAGS
23875 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23876 #endif
23877
23878 #undef TARGET_CLASS_MAX_NREGS
23879 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23880
23881 #undef TARGET_BUILTIN_DECL
23882 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
23883
23884 #undef TARGET_BUILTIN_RECIPROCAL
23885 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23886
23887 #undef TARGET_C_EXCESS_PRECISION
23888 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23889
23890 #undef TARGET_EXPAND_BUILTIN
23891 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23892
23893 #undef TARGET_EXPAND_BUILTIN_VA_START
23894 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23895
23896 #undef TARGET_FOLD_BUILTIN
23897 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23898
23899 #undef TARGET_FUNCTION_ARG
23900 #define TARGET_FUNCTION_ARG aarch64_function_arg
23901
23902 #undef TARGET_FUNCTION_ARG_ADVANCE
23903 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23904
23905 #undef TARGET_FUNCTION_ARG_BOUNDARY
23906 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23907
23908 #undef TARGET_FUNCTION_ARG_PADDING
23909 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23910
23911 #undef TARGET_GET_RAW_RESULT_MODE
23912 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23913 #undef TARGET_GET_RAW_ARG_MODE
23914 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23915
23916 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
23917 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23918
23919 #undef TARGET_FUNCTION_VALUE
23920 #define TARGET_FUNCTION_VALUE aarch64_function_value
23921
23922 #undef TARGET_FUNCTION_VALUE_REGNO_P
23923 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23924
23925 #undef TARGET_GIMPLE_FOLD_BUILTIN
23926 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
23927
23928 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
23929 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23930
23931 #undef TARGET_INIT_BUILTINS
23932 #define TARGET_INIT_BUILTINS aarch64_init_builtins
23933
23934 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23935 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23936 aarch64_ira_change_pseudo_allocno_class
23937
23938 #undef TARGET_LEGITIMATE_ADDRESS_P
23939 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23940
23941 #undef TARGET_LEGITIMATE_CONSTANT_P
23942 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23943
23944 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23945 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23946 aarch64_legitimize_address_displacement
23947
23948 #undef TARGET_LIBGCC_CMP_RETURN_MODE
23949 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23950
23951 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23952 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23953 aarch64_libgcc_floating_mode_supported_p
23954
23955 #undef TARGET_MANGLE_TYPE
23956 #define TARGET_MANGLE_TYPE aarch64_mangle_type
23957
23958 #undef TARGET_INVALID_CONVERSION
23959 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23960
23961 #undef TARGET_INVALID_UNARY_OP
23962 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23963
23964 #undef TARGET_INVALID_BINARY_OP
23965 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23966
23967 #undef TARGET_VERIFY_TYPE_CONTEXT
23968 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23969
23970 #undef TARGET_MEMORY_MOVE_COST
23971 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23972
23973 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23974 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23975
23976 #undef TARGET_MUST_PASS_IN_STACK
23977 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23978
23979 /* This target hook should return true if accesses to volatile bitfields
23980 should use the narrowest mode possible. It should return false if these
23981 accesses should use the bitfield container type. */
23982 #undef TARGET_NARROW_VOLATILE_BITFIELD
23983 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
23984
23985 #undef TARGET_OPTION_OVERRIDE
23986 #define TARGET_OPTION_OVERRIDE aarch64_override_options
23987
23988 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
23989 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
23990 aarch64_override_options_after_change
23991
23992 #undef TARGET_OFFLOAD_OPTIONS
23993 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
23994
23995 #undef TARGET_OPTION_SAVE
23996 #define TARGET_OPTION_SAVE aarch64_option_save
23997
23998 #undef TARGET_OPTION_RESTORE
23999 #define TARGET_OPTION_RESTORE aarch64_option_restore
24000
24001 #undef TARGET_OPTION_PRINT
24002 #define TARGET_OPTION_PRINT aarch64_option_print
24003
24004 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
24005 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
24006
24007 #undef TARGET_SET_CURRENT_FUNCTION
24008 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
24009
24010 #undef TARGET_PASS_BY_REFERENCE
24011 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
24012
24013 #undef TARGET_PREFERRED_RELOAD_CLASS
24014 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
24015
24016 #undef TARGET_SCHED_REASSOCIATION_WIDTH
24017 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
24018
24019 #undef TARGET_PROMOTED_TYPE
24020 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
24021
24022 #undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
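/* Illustrative sketch only (the exact code generated may differ): with
   section anchors, nearby globals such as

     int a, b;
     int sum (void) { return a + b; }

   can both be reached from a single anchor symbol, roughly along the
   lines of

     adrp    x1, .LANCHOR0
     add     x1, x1, :lo12:.LANCHOR0
     ldr     w0, [x1]           // a, at anchor + 0
     ldr     w1, [x1, 4]        // b, at anchor + 4

   so the [-256, 4095] window defined above bounds how far an object may
   sit from the anchor used to address it.  */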

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
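/* The hook value is read as a power of two naming the function-pointer
   bit used to tag custom descriptors for nested functions at run time,
   so the 4 above selects bit 2, in line with the comment before it.  */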

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END aarch64_asm_file_end

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks

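/* Build the target vector itself: TARGET_INITIALIZER (from "target-def.h")
   fills in a default for every hook that is not overridden by one of the
   TARGET_* macros above.  The rest of the compiler then reaches these
   hooks through targetm, for instance targetm.max_anchor_offset or
   targetm.sched.issue_rate ().  */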
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"