1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67
68 /* This file should be included last. */
69 #include "target-def.h"
70
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
73
74 /* Classifies an address.
75
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
78
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
81
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
84
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
87
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
90
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
93
94 ADDRESS_SYMBOLIC
95 A constant symbolic address, in pc-relative literal pool. */
96
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
105 };
106
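/* Record of a legitimate address as decomposed by the address
   classification code: its type, the base and offset components,
   any index shift amount, and the symbol type for symbolic
   addresses.  */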
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
113 };
114
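/* Describes a SIMD immediate operand: the constant value, the element
   width, any shift amount, and whether the MVN (inverted) or MSL
   (shift-ones) form of the instruction is required.  */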
115 struct simd_immediate_info
116 {
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
122 };
123
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
126
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
131
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
144
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
147
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
150
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
153
154 /* Global flag for PC relative loads. */
155 bool aarch64_nopcrelative_literal_loads;
156
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
160 {
161 const char* name;
162 unsigned int flag;
163 };
164
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
168 {
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
173 };
174 #undef AARCH64_FUSION_PAIR
175
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 {
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
184 };
185 #undef AARCH64_EXTRA_TUNING_OPTION
186
187 /* Tuning parameters. */
188
189 static const struct cpu_addrcost_table generic_addrcost_table =
190 {
191 {
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
196 },
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
203 };
204
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
206 {
207 {
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
212 },
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
219 };
220
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
222 {
223 {
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
228 },
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
235 };
236
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
238 {
239 {
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
244 },
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
251 };
252
253 static const struct cpu_regmove_cost generic_regmove_cost =
254 {
255 1, /* GP2GP */
256 /* Avoid the use of slow int<->fp moves for spilling by setting
257 their cost higher than memmov_cost. */
258 5, /* GP2FP */
259 5, /* FP2GP */
260 2 /* FP2FP */
261 };
262
263 static const struct cpu_regmove_cost cortexa57_regmove_cost =
264 {
265 1, /* GP2GP */
266 /* Avoid the use of slow int<->fp moves for spilling by setting
267 their cost higher than memmov_cost. */
268 5, /* GP2FP */
269 5, /* FP2GP */
270 2 /* FP2FP */
271 };
272
273 static const struct cpu_regmove_cost cortexa53_regmove_cost =
274 {
275 1, /* GP2GP */
276 /* Avoid the use of slow int<->fp moves for spilling by setting
277 their cost higher than memmov_cost. */
278 5, /* GP2FP */
279 5, /* FP2GP */
280 2 /* FP2FP */
281 };
282
283 static const struct cpu_regmove_cost exynosm1_regmove_cost =
284 {
285 1, /* GP2GP */
286 /* Avoid the use of slow int<->fp moves for spilling by setting
287 their cost higher than memmov_cost (actual costs are 4 and 9). */
288 9, /* GP2FP */
289 9, /* FP2GP */
290 1 /* FP2FP */
291 };
292
293 static const struct cpu_regmove_cost thunderx_regmove_cost =
294 {
295 2, /* GP2GP */
296 2, /* GP2FP */
297 6, /* FP2GP */
298 4 /* FP2FP */
299 };
300
301 static const struct cpu_regmove_cost xgene1_regmove_cost =
302 {
303 1, /* GP2GP */
304 /* Avoid the use of slow int<->fp moves for spilling by setting
305 their cost higher than memmov_cost. */
306 8, /* GP2FP */
307 8, /* FP2GP */
308 2 /* FP2FP */
309 };
310
311 /* Generic costs for vector insn classes. */
312 static const struct cpu_vector_cost generic_vector_cost =
313 {
314 1, /* scalar_stmt_cost */
315 1, /* scalar_load_cost */
316 1, /* scalar_store_cost */
317 1, /* vec_stmt_cost */
318 2, /* vec_permute_cost */
319 1, /* vec_to_scalar_cost */
320 1, /* scalar_to_vec_cost */
321 1, /* vec_align_load_cost */
322 1, /* vec_unalign_load_cost */
323 1, /* vec_unalign_store_cost */
324 1, /* vec_store_cost */
325 3, /* cond_taken_branch_cost */
326 1 /* cond_not_taken_branch_cost */
327 };
328
329 /* Cortex-A57 costs for vector insn classes. */
330 static const struct cpu_vector_cost cortexa57_vector_cost =
331 {
332 1, /* scalar_stmt_cost */
333 4, /* scalar_load_cost */
334 1, /* scalar_store_cost */
335 3, /* vec_stmt_cost */
336 3, /* vec_permute_cost */
337 8, /* vec_to_scalar_cost */
338 8, /* scalar_to_vec_cost */
339 5, /* vec_align_load_cost */
340 5, /* vec_unalign_load_cost */
341 1, /* vec_unalign_store_cost */
342 1, /* vec_store_cost */
343 1, /* cond_taken_branch_cost */
344 1 /* cond_not_taken_branch_cost */
345 };
346
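/* Exynos M1 costs for vector insn classes.  */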
347 static const struct cpu_vector_cost exynosm1_vector_cost =
348 {
349 1, /* scalar_stmt_cost */
350 5, /* scalar_load_cost */
351 1, /* scalar_store_cost */
352 3, /* vec_stmt_cost */
353 3, /* vec_permute_cost */
354 3, /* vec_to_scalar_cost */
355 3, /* scalar_to_vec_cost */
356 5, /* vec_align_load_cost */
357 5, /* vec_unalign_load_cost */
358 1, /* vec_unalign_store_cost */
359 1, /* vec_store_cost */
360 1, /* cond_taken_branch_cost */
361 1 /* cond_not_taken_branch_cost */
362 };
363
364 /* X-Gene 1 costs for vector insn classes. */
365 static const struct cpu_vector_cost xgene1_vector_cost =
366 {
367 1, /* scalar_stmt_cost */
368 5, /* scalar_load_cost */
369 1, /* scalar_store_cost */
370 2, /* vec_stmt_cost */
371 2, /* vec_permute_cost */
372 4, /* vec_to_scalar_cost */
373 4, /* scalar_to_vec_cost */
374 10, /* vec_align_load_cost */
375 10, /* vec_unalign_load_cost */
376 2, /* vec_unalign_store_cost */
377 2, /* vec_store_cost */
378 2, /* cond_taken_branch_cost */
379 1 /* cond_not_taken_branch_cost */
380 };
381
382 /* Generic costs for branch instructions. */
383 static const struct cpu_branch_cost generic_branch_cost =
384 {
385 2, /* Predictable. */
386 2 /* Unpredictable. */
387 };
388
389 /* Branch costs for Cortex-A57. */
390 static const struct cpu_branch_cost cortexa57_branch_cost =
391 {
392 1, /* Predictable. */
393 3 /* Unpredictable. */
394 };
395
396 static const struct tune_params generic_tunings =
397 {
398 &cortexa57_extra_costs,
399 &generic_addrcost_table,
400 &generic_regmove_cost,
401 &generic_vector_cost,
402 &generic_branch_cost,
403 4, /* memmov_cost */
404 2, /* issue_rate */
405 AARCH64_FUSE_NOTHING, /* fusible_ops */
406 8, /* function_align. */
407 8, /* jump_align. */
408 4, /* loop_align. */
409 2, /* int_reassoc_width. */
410 4, /* fp_reassoc_width. */
411 1, /* vec_reassoc_width. */
412 2, /* min_div_recip_mul_sf. */
413 2, /* min_div_recip_mul_df. */
414 0, /* max_case_values. */
415 0, /* cache_line_size. */
416 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
417 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
418 };
419
420 static const struct tune_params cortexa35_tunings =
421 {
422 &cortexa53_extra_costs,
423 &generic_addrcost_table,
424 &cortexa53_regmove_cost,
425 &generic_vector_cost,
426 &generic_branch_cost,
427 4, /* memmov_cost */
428 1, /* issue_rate */
429 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
430 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
431 8, /* function_align. */
432 8, /* jump_align. */
433 4, /* loop_align. */
434 2, /* int_reassoc_width. */
435 4, /* fp_reassoc_width. */
436 1, /* vec_reassoc_width. */
437 2, /* min_div_recip_mul_sf. */
438 2, /* min_div_recip_mul_df. */
439 0, /* max_case_values. */
440 0, /* cache_line_size. */
441 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
442 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
443 };
444
445 static const struct tune_params cortexa53_tunings =
446 {
447 &cortexa53_extra_costs,
448 &generic_addrcost_table,
449 &cortexa53_regmove_cost,
450 &generic_vector_cost,
451 &generic_branch_cost,
452 4, /* memmov_cost */
453 2, /* issue_rate */
454 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
455 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
456 8, /* function_align. */
457 8, /* jump_align. */
458 4, /* loop_align. */
459 2, /* int_reassoc_width. */
460 4, /* fp_reassoc_width. */
461 1, /* vec_reassoc_width. */
462 2, /* min_div_recip_mul_sf. */
463 2, /* min_div_recip_mul_df. */
464 0, /* max_case_values. */
465 0, /* cache_line_size. */
466 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
467 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
468 };
469
470 static const struct tune_params cortexa57_tunings =
471 {
472 &cortexa57_extra_costs,
473 &cortexa57_addrcost_table,
474 &cortexa57_regmove_cost,
475 &cortexa57_vector_cost,
476 &cortexa57_branch_cost,
477 4, /* memmov_cost */
478 3, /* issue_rate */
479 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
480 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
481 16, /* function_align. */
482 8, /* jump_align. */
483 4, /* loop_align. */
484 2, /* int_reassoc_width. */
485 4, /* fp_reassoc_width. */
486 1, /* vec_reassoc_width. */
487 2, /* min_div_recip_mul_sf. */
488 2, /* min_div_recip_mul_df. */
489 0, /* max_case_values. */
490 0, /* cache_line_size. */
491 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
492 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
493 };
494
495 static const struct tune_params cortexa72_tunings =
496 {
497 &cortexa57_extra_costs,
498 &cortexa57_addrcost_table,
499 &cortexa57_regmove_cost,
500 &cortexa57_vector_cost,
501 &generic_branch_cost,
502 4, /* memmov_cost */
503 3, /* issue_rate */
504 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
505 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
506 16, /* function_align. */
507 8, /* jump_align. */
508 4, /* loop_align. */
509 2, /* int_reassoc_width. */
510 4, /* fp_reassoc_width. */
511 1, /* vec_reassoc_width. */
512 2, /* min_div_recip_mul_sf. */
513 2, /* min_div_recip_mul_df. */
514 0, /* max_case_values. */
515 0, /* cache_line_size. */
516 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
517 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
518 };
519
520 static const struct tune_params exynosm1_tunings =
521 {
522 &exynosm1_extra_costs,
523 &exynosm1_addrcost_table,
524 &exynosm1_regmove_cost,
525 &exynosm1_vector_cost,
526 &generic_branch_cost,
527 4, /* memmov_cost */
528 3, /* issue_rate */
529 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
530 4, /* function_align. */
531 4, /* jump_align. */
532 4, /* loop_align. */
533 2, /* int_reassoc_width. */
534 4, /* fp_reassoc_width. */
535 1, /* vec_reassoc_width. */
536 2, /* min_div_recip_mul_sf. */
537 2, /* min_div_recip_mul_df. */
538 48, /* max_case_values. */
539 64, /* cache_line_size. */
540 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
541 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
542 };
543
544 static const struct tune_params thunderx_tunings =
545 {
546 &thunderx_extra_costs,
547 &generic_addrcost_table,
548 &thunderx_regmove_cost,
549 &generic_vector_cost,
550 &generic_branch_cost,
551 6, /* memmov_cost */
552 2, /* issue_rate */
553 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
554 8, /* function_align. */
555 8, /* jump_align. */
556 8, /* loop_align. */
557 2, /* int_reassoc_width. */
558 4, /* fp_reassoc_width. */
559 1, /* vec_reassoc_width. */
560 2, /* min_div_recip_mul_sf. */
561 2, /* min_div_recip_mul_df. */
562 0, /* max_case_values. */
563 0, /* cache_line_size. */
564 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
565 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
566 };
567
568 static const struct tune_params xgene1_tunings =
569 {
570 &xgene1_extra_costs,
571 &xgene1_addrcost_table,
572 &xgene1_regmove_cost,
573 &xgene1_vector_cost,
574 &generic_branch_cost,
575 6, /* memmov_cost */
576 4, /* issue_rate */
577 AARCH64_FUSE_NOTHING, /* fusible_ops */
578 16, /* function_align. */
579 8, /* jump_align. */
580 16, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 0, /* cache_line_size. */
588 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
589 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
590 };
591
592 /* Support for fine-grained override of the tuning structures. */
593 struct aarch64_tuning_override_function
594 {
595 const char* name;
596 void (*parse_override)(const char*, struct tune_params*);
597 };
598
599 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
600 static void aarch64_parse_tune_string (const char*, struct tune_params*);
601
602 static const struct aarch64_tuning_override_function
603 aarch64_tuning_override_functions[] =
604 {
605 { "fuse", aarch64_parse_fuse_string },
606 { "tune", aarch64_parse_tune_string },
607 { NULL, NULL }
608 };
609
610 /* A processor implementing AArch64. */
611 struct processor
612 {
613 const char *const name;
614 enum aarch64_processor ident;
615 enum aarch64_processor sched_core;
616 enum aarch64_arch arch;
617 unsigned architecture_version;
618 const unsigned long flags;
619 const struct tune_params *const tune;
620 };
621
622 /* Architectures implementing AArch64. */
623 static const struct processor all_architectures[] =
624 {
625 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
626 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
627 #include "aarch64-arches.def"
628 #undef AARCH64_ARCH
629 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
630 };
631
632 /* Processor cores implementing AArch64. */
633 static const struct processor all_cores[] =
634 {
635 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
636 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
637 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
638 FLAGS, &COSTS##_tunings},
639 #include "aarch64-cores.def"
640 #undef AARCH64_CORE
641 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
642 AARCH64_FL_FOR_ARCH8, &generic_tunings},
643 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
644 };
645
646
647 /* Target specification. These are populated by the -march, -mtune, -mcpu
648 handling code or by target attributes. */
649 static const struct processor *selected_arch;
650 static const struct processor *selected_cpu;
651 static const struct processor *selected_tune;
652
653 /* The current tuning set. */
654 struct tune_params aarch64_tune_params = generic_tunings;
655
656 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
657
658 /* An ISA extension in the co-processor and main instruction set space. */
659 struct aarch64_option_extension
660 {
661 const char *const name;
662 const unsigned long flags_on;
663 const unsigned long flags_off;
664 };
665
666 /* ISA extensions in AArch64. */
667 static const struct aarch64_option_extension all_extensions[] =
668 {
669 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
670 {NAME, FLAGS_ON, FLAGS_OFF},
671 #include "aarch64-option-extensions.def"
672 #undef AARCH64_OPT_EXTENSION
673 {NULL, 0, 0}
674 };
675
676 typedef enum aarch64_cond_code
677 {
678 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
679 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
680 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
681 }
682 aarch64_cc;
683
684 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
685
686 /* The condition codes of the processor, and the inverse function. */
687 static const char * const aarch64_condition_codes[] =
688 {
689 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
690 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
691 };
692
693 /* Generate code to enable conditional branches in functions over 1 MiB. */
694 const char *
695 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
696 const char * branch_format)
697 {
698 rtx_code_label * tmp_label = gen_label_rtx ();
699 char label_buf[256];
700 char buffer[128];
701 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
702 CODE_LABEL_NUMBER (tmp_label));
703 const char *label_ptr = targetm.strip_name_encoding (label_buf);
704 rtx dest_label = operands[pos_label];
705 operands[pos_label] = tmp_label;
706
707 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
708 output_asm_insn (buffer, operands);
709
710 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
711 operands[pos_label] = dest_label;
712 output_asm_insn (buffer, operands);
713 return "";
714 }
715
716 void
717 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
718 {
719 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
720 if (TARGET_GENERAL_REGS_ONLY)
721 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
722 else
723 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
724 }
725
726 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
727 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
728 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
729 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
730 cost (in this case the best class is the lowest cost one). Using ALL_REGS
731 irrespective of its cost results in bad allocations with many redundant
732 int<->FP moves which are expensive on various cores.
733 To avoid this we don't allow ALL_REGS as the allocno class, but force a
734 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
735 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
736 Otherwise set the allocno class depending on the mode.
737 The result of this is that it is no longer inefficient to have a higher
738 memory move cost than the register move cost.
739 */
740
741 static reg_class_t
742 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
743 reg_class_t best_class)
744 {
745 enum machine_mode mode;
746
747 if (allocno_class != ALL_REGS)
748 return allocno_class;
749
750 if (best_class != ALL_REGS)
751 return best_class;
752
753 mode = PSEUDO_REGNO_MODE (regno);
754 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
755 }
756
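/* Return the minimum number of divisions by the same divisor that make
   it worthwhile to replace them with a reciprocal multiplication in
   MODE, taken from the current tuning parameters.  */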
757 static unsigned int
758 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
759 {
760 if (GET_MODE_UNIT_SIZE (mode) == 4)
761 return aarch64_tune_params.min_div_recip_mul_sf;
762 return aarch64_tune_params.min_div_recip_mul_df;
763 }
764
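/* Return the reassociation width for operations in MODE, using the
   vector, integer or floating-point width from the current tuning
   parameters.  */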
765 static int
766 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
767 enum machine_mode mode)
768 {
769 if (VECTOR_MODE_P (mode))
770 return aarch64_tune_params.vec_reassoc_width;
771 if (INTEGRAL_MODE_P (mode))
772 return aarch64_tune_params.int_reassoc_width;
773 if (FLOAT_MODE_P (mode))
774 return aarch64_tune_params.fp_reassoc_width;
775 return 1;
776 }
777
778 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
779 unsigned
780 aarch64_dbx_register_number (unsigned regno)
781 {
782 if (GP_REGNUM_P (regno))
783 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
784 else if (regno == SP_REGNUM)
785 return AARCH64_DWARF_SP;
786 else if (FP_REGNUM_P (regno))
787 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
788
789 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
790 equivalent DWARF register. */
791 return DWARF_FRAME_REGISTERS;
792 }
793
794 /* Return TRUE if MODE is any of the large INT modes. */
795 static bool
796 aarch64_vect_struct_mode_p (machine_mode mode)
797 {
798 return mode == OImode || mode == CImode || mode == XImode;
799 }
800
801 /* Return TRUE if MODE is any of the vector modes. */
802 static bool
803 aarch64_vector_mode_p (machine_mode mode)
804 {
805 return aarch64_vector_mode_supported_p (mode)
806 || aarch64_vect_struct_mode_p (mode);
807 }
808
809 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
810 static bool
811 aarch64_array_mode_supported_p (machine_mode mode,
812 unsigned HOST_WIDE_INT nelems)
813 {
814 if (TARGET_SIMD
815 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
816 || AARCH64_VALID_SIMD_DREG_MODE (mode))
817 && (nelems >= 2 && nelems <= 4))
818 return true;
819
820 return false;
821 }
822
823 /* Implement HARD_REGNO_NREGS. */
824
825 int
826 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
827 {
828 switch (aarch64_regno_regclass (regno))
829 {
830 case FP_REGS:
831 case FP_LO_REGS:
832 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
833 default:
834 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
835 }
836 gcc_unreachable ();
837 }
838
839 /* Implement HARD_REGNO_MODE_OK. */
840
841 int
842 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
843 {
844 if (GET_MODE_CLASS (mode) == MODE_CC)
845 return regno == CC_REGNUM;
846
847 if (regno == SP_REGNUM)
848 /* The purpose of comparing with ptr_mode is to support the
849 global register variable associated with the stack pointer
850 register via the syntax of asm ("wsp") in ILP32. */
851 return mode == Pmode || mode == ptr_mode;
852
853 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
854 return mode == Pmode;
855
856 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
857 return 1;
858
859 if (FP_REGNUM_P (regno))
860 {
861 if (aarch64_vect_struct_mode_p (mode))
862 return
863 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
864 else
865 return 1;
866 }
867
868 return 0;
869 }
870
871 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
872 machine_mode
873 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
874 machine_mode mode)
875 {
876 /* Handle modes that fit within single registers. */
877 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
878 {
879 if (GET_MODE_SIZE (mode) >= 4)
880 return mode;
881 else
882 return SImode;
883 }
884 /* Fall back to generic for multi-reg and very large modes. */
885 else
886 return choose_hard_reg_mode (regno, nregs, false);
887 }
888
889 /* Return true if calls to DECL should be treated as
890 long-calls (ie called via a register). */
891 static bool
892 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
893 {
894 return false;
895 }
896
897 /* Return true if calls to symbol-ref SYM should be treated as
898 long-calls (ie called via a register). */
899 bool
900 aarch64_is_long_call_p (rtx sym)
901 {
902 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
903 }
904
905 /* Return true if calls to symbol-ref SYM should not go through
906 plt stubs. */
907
908 bool
909 aarch64_is_noplt_call_p (rtx sym)
910 {
911 const_tree decl = SYMBOL_REF_DECL (sym);
912
913 if (flag_pic
914 && decl
915 && (!flag_plt
916 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
917 && !targetm.binds_local_p (decl))
918 return true;
919
920 return false;
921 }
922
923 /* Return true if the offsets to a zero/sign-extract operation
924 represent an expression that matches an extend operation. The
925 operands represent the parameters from
926
927 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
928 bool
929 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
930 rtx extract_imm)
931 {
932 HOST_WIDE_INT mult_val, extract_val;
933
934 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
935 return false;
936
937 mult_val = INTVAL (mult_imm);
938 extract_val = INTVAL (extract_imm);
939
940 if (extract_val > 8
941 && extract_val < GET_MODE_BITSIZE (mode)
942 && exact_log2 (extract_val & ~7) > 0
943 && (extract_val & 7) <= 4
944 && mult_val == (1 << (extract_val & 7)))
945 return true;
946
947 return false;
948 }
949
950 /* Emit an insn that's a simple single-set. Both the operands must be
951 known to be valid. */
952 inline static rtx
953 emit_set_insn (rtx x, rtx y)
954 {
955 return emit_insn (gen_rtx_SET (x, y));
956 }
957
958 /* X and Y are two things to compare using CODE. Emit the compare insn and
959 return the rtx for register 0 in the proper mode. */
960 rtx
961 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
962 {
963 machine_mode mode = SELECT_CC_MODE (code, x, y);
964 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
965
966 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
967 return cc_reg;
968 }
969
970 /* Build the SYMBOL_REF for __tls_get_addr. */
971
972 static GTY(()) rtx tls_get_addr_libfunc;
973
974 rtx
975 aarch64_tls_get_addr (void)
976 {
977 if (!tls_get_addr_libfunc)
978 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
979 return tls_get_addr_libfunc;
980 }
981
982 /* Return the TLS model to use for ADDR. */
983
984 static enum tls_model
985 tls_symbolic_operand_type (rtx addr)
986 {
987 enum tls_model tls_kind = TLS_MODEL_NONE;
988 rtx sym, addend;
989
990 if (GET_CODE (addr) == CONST)
991 {
992 split_const (addr, &sym, &addend);
993 if (GET_CODE (sym) == SYMBOL_REF)
994 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
995 }
996 else if (GET_CODE (addr) == SYMBOL_REF)
997 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
998
999 return tls_kind;
1000 }
1001
1002 /* We'll allow LO_SUMs in our legitimate addresses so that combine
1003 can take care of combining addresses where necessary, but for
1004 generation purposes we'll generate the address
1005 as:
1006 RTL Absolute
1007 tmp = hi (symbol_ref); adrp x1, foo
1008 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1009 nop
1010
1011 PIC TLS
1012 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1013 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1014 bl __tls_get_addr
1015 nop
1016
1017 Load TLS symbol, depending on TLS mechanism and TLS access model.
1018
1019 Global Dynamic - Traditional TLS:
1020 adrp tmp, :tlsgd:imm
1021 add dest, tmp, #:tlsgd_lo12:imm
1022 bl __tls_get_addr
1023
1024 Global Dynamic - TLS Descriptors:
1025 adrp dest, :tlsdesc:imm
1026 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1027 add dest, dest, #:tlsdesc_lo12:imm
1028 blr tmp
1029 mrs tp, tpidr_el0
1030 add dest, dest, tp
1031
1032 Initial Exec:
1033 mrs tp, tpidr_el0
1034 adrp tmp, :gottprel:imm
1035 ldr dest, [tmp, #:gottprel_lo12:imm]
1036 add dest, dest, tp
1037
1038 Local Exec:
1039 mrs tp, tpidr_el0
1040 add t0, tp, #:tprel_hi12:imm, lsl #12
1041 add t0, t0, #:tprel_lo12_nc:imm
1042 */
1043
1044 static void
1045 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1046 enum aarch64_symbol_type type)
1047 {
1048 switch (type)
1049 {
1050 case SYMBOL_SMALL_ABSOLUTE:
1051 {
1052 /* In ILP32, the mode of dest can be either SImode or DImode. */
1053 rtx tmp_reg = dest;
1054 machine_mode mode = GET_MODE (dest);
1055
1056 gcc_assert (mode == Pmode || mode == ptr_mode);
1057
1058 if (can_create_pseudo_p ())
1059 tmp_reg = gen_reg_rtx (mode);
1060
1061 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1062 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1063 return;
1064 }
1065
1066 case SYMBOL_TINY_ABSOLUTE:
1067 emit_insn (gen_rtx_SET (dest, imm));
1068 return;
1069
1070 case SYMBOL_SMALL_GOT_28K:
1071 {
1072 machine_mode mode = GET_MODE (dest);
1073 rtx gp_rtx = pic_offset_table_rtx;
1074 rtx insn;
1075 rtx mem;
1076
1077 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1078 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
1079 decide rtx costs, in which case pic_offset_table_rtx is not
1080 initialized. In that case there is no need to generate the first adrp
1081 instruction, as the final cost for global variable access is
1082 one instruction. */
1083 if (gp_rtx != NULL)
1084 {
1085 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1086 use the page base as the GOT base, the first page may be wasted;
1087 in the worst case there is only 28K of space for the GOT).
1088
1089 The generated instruction sequence for accessing a global variable
1090 is:
1091
1092 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1093
1094 Only one instruction is needed. But we must initialize
1095 pic_offset_table_rtx properly. We generate the initialization insn
1096 for every global access, and allow CSE to remove all redundant ones.
1097
1098 The final instruction sequence will look like the following
1099 for multiple global variable accesses.
1100
1101 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1102
1103 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1104 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1105 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1106 ... */
1107
1108 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1109 crtl->uses_pic_offset_table = 1;
1110 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1111
1112 if (mode != GET_MODE (gp_rtx))
1113 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1114 }
1115
1116 if (mode == ptr_mode)
1117 {
1118 if (mode == DImode)
1119 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1120 else
1121 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1122
1123 mem = XVECEXP (SET_SRC (insn), 0, 0);
1124 }
1125 else
1126 {
1127 gcc_assert (mode == Pmode);
1128
1129 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1130 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1131 }
1132
1133 /* The operand is expected to be a MEM. Whenever the related insn
1134 pattern changes, the code above which calculates MEM should be
1135 updated. */
1136 gcc_assert (GET_CODE (mem) == MEM);
1137 MEM_READONLY_P (mem) = 1;
1138 MEM_NOTRAP_P (mem) = 1;
1139 emit_insn (insn);
1140 return;
1141 }
1142
1143 case SYMBOL_SMALL_GOT_4G:
1144 {
1145 /* In ILP32, the mode of dest can be either SImode or DImode,
1146 while the got entry is always of SImode size. The mode of
1147 dest depends on how dest is used: if dest is assigned to a
1148 pointer (e.g. stored in memory), it has SImode; it may have
1149 DImode if dest is dereferenced to access the memory.
1150 This is why we have to handle three different ldr_got_small
1151 patterns here (two patterns for ILP32). */
1152
1153 rtx insn;
1154 rtx mem;
1155 rtx tmp_reg = dest;
1156 machine_mode mode = GET_MODE (dest);
1157
1158 if (can_create_pseudo_p ())
1159 tmp_reg = gen_reg_rtx (mode);
1160
1161 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1162 if (mode == ptr_mode)
1163 {
1164 if (mode == DImode)
1165 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1166 else
1167 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1168
1169 mem = XVECEXP (SET_SRC (insn), 0, 0);
1170 }
1171 else
1172 {
1173 gcc_assert (mode == Pmode);
1174
1175 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1176 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1177 }
1178
1179 gcc_assert (GET_CODE (mem) == MEM);
1180 MEM_READONLY_P (mem) = 1;
1181 MEM_NOTRAP_P (mem) = 1;
1182 emit_insn (insn);
1183 return;
1184 }
1185
1186 case SYMBOL_SMALL_TLSGD:
1187 {
1188 rtx_insn *insns;
1189 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1190
1191 start_sequence ();
1192 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1193 insns = get_insns ();
1194 end_sequence ();
1195
1196 RTL_CONST_CALL_P (insns) = 1;
1197 emit_libcall_block (insns, dest, result, imm);
1198 return;
1199 }
1200
1201 case SYMBOL_SMALL_TLSDESC:
1202 {
1203 machine_mode mode = GET_MODE (dest);
1204 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1205 rtx tp;
1206
1207 gcc_assert (mode == Pmode || mode == ptr_mode);
1208
1209 /* In ILP32, the got entry is always of SImode size. Unlike
1210 small GOT, the dest is fixed at reg 0. */
1211 if (TARGET_ILP32)
1212 emit_insn (gen_tlsdesc_small_si (imm));
1213 else
1214 emit_insn (gen_tlsdesc_small_di (imm));
1215 tp = aarch64_load_tp (NULL);
1216
1217 if (mode != Pmode)
1218 tp = gen_lowpart (mode, tp);
1219
1220 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1221 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1222 return;
1223 }
1224
1225 case SYMBOL_SMALL_TLSIE:
1226 {
1227 /* In ILP32, the mode of dest can be either SImode or DImode,
1228 while the got entry is always of SImode size. The mode of
1229 dest depends on how dest is used: if dest is assigned to a
1230 pointer (e.g. stored in memory), it has SImode; it may have
1231 DImode if dest is dereferenced to access the memory.
1232 This is why we have to handle three different tlsie_small
1233 patterns here (two patterns for ILP32). */
1234 machine_mode mode = GET_MODE (dest);
1235 rtx tmp_reg = gen_reg_rtx (mode);
1236 rtx tp = aarch64_load_tp (NULL);
1237
1238 if (mode == ptr_mode)
1239 {
1240 if (mode == DImode)
1241 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1242 else
1243 {
1244 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1245 tp = gen_lowpart (mode, tp);
1246 }
1247 }
1248 else
1249 {
1250 gcc_assert (mode == Pmode);
1251 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1252 }
1253
1254 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1255 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1256 return;
1257 }
1258
1259 case SYMBOL_TLSLE12:
1260 case SYMBOL_TLSLE24:
1261 case SYMBOL_TLSLE32:
1262 case SYMBOL_TLSLE48:
1263 {
1264 machine_mode mode = GET_MODE (dest);
1265 rtx tp = aarch64_load_tp (NULL);
1266
1267 if (mode != Pmode)
1268 tp = gen_lowpart (mode, tp);
1269
1270 switch (type)
1271 {
1272 case SYMBOL_TLSLE12:
1273 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1274 (dest, tp, imm));
1275 break;
1276 case SYMBOL_TLSLE24:
1277 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1278 (dest, tp, imm));
1279 break;
1280 case SYMBOL_TLSLE32:
1281 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1282 (dest, imm));
1283 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1284 (dest, dest, tp));
1285 break;
1286 case SYMBOL_TLSLE48:
1287 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1288 (dest, imm));
1289 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1290 (dest, dest, tp));
1291 break;
1292 default:
1293 gcc_unreachable ();
1294 }
1295
1296 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1297 return;
1298 }
1299
1300 case SYMBOL_TINY_GOT:
1301 emit_insn (gen_ldr_got_tiny (dest, imm));
1302 return;
1303
1304 case SYMBOL_TINY_TLSIE:
1305 {
1306 machine_mode mode = GET_MODE (dest);
1307 rtx tp = aarch64_load_tp (NULL);
1308
1309 if (mode == ptr_mode)
1310 {
1311 if (mode == DImode)
1312 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1313 else
1314 {
1315 tp = gen_lowpart (mode, tp);
1316 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1317 }
1318 }
1319 else
1320 {
1321 gcc_assert (mode == Pmode);
1322 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1323 }
1324
1325 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1326 return;
1327 }
1328
1329 default:
1330 gcc_unreachable ();
1331 }
1332 }
1333
1334 /* Emit a move from SRC to DEST. Assume that the move expanders can
1335 handle all moves if !can_create_pseudo_p (). The distinction is
1336 important because, unlike emit_move_insn, the move expanders know
1337 how to force Pmode objects into the constant pool even when the
1338 constant pool address is not itself legitimate. */
1339 static rtx
1340 aarch64_emit_move (rtx dest, rtx src)
1341 {
1342 return (can_create_pseudo_p ()
1343 ? emit_move_insn (dest, src)
1344 : emit_move_insn_1 (dest, src));
1345 }
1346
1347 /* Split a 128-bit move operation into two 64-bit move operations,
1348 taking care to handle partial overlap of register to register
1349 copies. Special cases are needed when moving between GP regs and
1350 FP regs. SRC can be a register, constant or memory; DST a register
1351 or memory. If either operand is memory it must not have any side
1352 effects. */
1353 void
1354 aarch64_split_128bit_move (rtx dst, rtx src)
1355 {
1356 rtx dst_lo, dst_hi;
1357 rtx src_lo, src_hi;
1358
1359 machine_mode mode = GET_MODE (dst);
1360
1361 gcc_assert (mode == TImode || mode == TFmode);
1362 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1363 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1364
1365 if (REG_P (dst) && REG_P (src))
1366 {
1367 int src_regno = REGNO (src);
1368 int dst_regno = REGNO (dst);
1369
1370 /* Handle FP <-> GP regs. */
1371 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1372 {
1373 src_lo = gen_lowpart (word_mode, src);
1374 src_hi = gen_highpart (word_mode, src);
1375
1376 if (mode == TImode)
1377 {
1378 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1379 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1380 }
1381 else
1382 {
1383 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1384 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1385 }
1386 return;
1387 }
1388 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1389 {
1390 dst_lo = gen_lowpart (word_mode, dst);
1391 dst_hi = gen_highpart (word_mode, dst);
1392
1393 if (mode == TImode)
1394 {
1395 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1396 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1397 }
1398 else
1399 {
1400 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1401 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1402 }
1403 return;
1404 }
1405 }
1406
1407 dst_lo = gen_lowpart (word_mode, dst);
1408 dst_hi = gen_highpart (word_mode, dst);
1409 src_lo = gen_lowpart (word_mode, src);
1410 src_hi = gen_highpart_mode (word_mode, mode, src);
1411
1412 /* At most one pairing may overlap. */
1413 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1414 {
1415 aarch64_emit_move (dst_hi, src_hi);
1416 aarch64_emit_move (dst_lo, src_lo);
1417 }
1418 else
1419 {
1420 aarch64_emit_move (dst_lo, src_lo);
1421 aarch64_emit_move (dst_hi, src_hi);
1422 }
1423 }
1424
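/* Return true if a 128-bit move from SRC to DST needs to be split;
   only a direct FP-register to FP-register copy can be done without
   splitting.  */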
1425 bool
1426 aarch64_split_128bit_move_p (rtx dst, rtx src)
1427 {
1428 return (! REG_P (src)
1429 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1430 }
1431
1432 /* Split a complex SIMD combine. */
1433
1434 void
1435 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1436 {
1437 machine_mode src_mode = GET_MODE (src1);
1438 machine_mode dst_mode = GET_MODE (dst);
1439
1440 gcc_assert (VECTOR_MODE_P (dst_mode));
1441
1442 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1443 {
1444 rtx (*gen) (rtx, rtx, rtx);
1445
1446 switch (src_mode)
1447 {
1448 case V8QImode:
1449 gen = gen_aarch64_simd_combinev8qi;
1450 break;
1451 case V4HImode:
1452 gen = gen_aarch64_simd_combinev4hi;
1453 break;
1454 case V2SImode:
1455 gen = gen_aarch64_simd_combinev2si;
1456 break;
1457 case V4HFmode:
1458 gen = gen_aarch64_simd_combinev4hf;
1459 break;
1460 case V2SFmode:
1461 gen = gen_aarch64_simd_combinev2sf;
1462 break;
1463 case DImode:
1464 gen = gen_aarch64_simd_combinedi;
1465 break;
1466 case DFmode:
1467 gen = gen_aarch64_simd_combinedf;
1468 break;
1469 default:
1470 gcc_unreachable ();
1471 }
1472
1473 emit_insn (gen (dst, src1, src2));
1474 return;
1475 }
1476 }
1477
1478 /* Split a complex SIMD move. */
1479
1480 void
1481 aarch64_split_simd_move (rtx dst, rtx src)
1482 {
1483 machine_mode src_mode = GET_MODE (src);
1484 machine_mode dst_mode = GET_MODE (dst);
1485
1486 gcc_assert (VECTOR_MODE_P (dst_mode));
1487
1488 if (REG_P (dst) && REG_P (src))
1489 {
1490 rtx (*gen) (rtx, rtx);
1491
1492 gcc_assert (VECTOR_MODE_P (src_mode));
1493
1494 switch (src_mode)
1495 {
1496 case V16QImode:
1497 gen = gen_aarch64_split_simd_movv16qi;
1498 break;
1499 case V8HImode:
1500 gen = gen_aarch64_split_simd_movv8hi;
1501 break;
1502 case V4SImode:
1503 gen = gen_aarch64_split_simd_movv4si;
1504 break;
1505 case V2DImode:
1506 gen = gen_aarch64_split_simd_movv2di;
1507 break;
1508 case V8HFmode:
1509 gen = gen_aarch64_split_simd_movv8hf;
1510 break;
1511 case V4SFmode:
1512 gen = gen_aarch64_split_simd_movv4sf;
1513 break;
1514 case V2DFmode:
1515 gen = gen_aarch64_split_simd_movv2df;
1516 break;
1517 default:
1518 gcc_unreachable ();
1519 }
1520
1521 emit_insn (gen (dst, src));
1522 return;
1523 }
1524 }
1525
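/* Return true if X is equal to Y (of mode YMODE) zero-extended to
   XMODE.  */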
1526 bool
1527 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1528 machine_mode ymode, rtx y)
1529 {
1530 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1531 gcc_assert (r != NULL);
1532 return rtx_equal_p (x, r);
1533 }
1534
1535
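/* Return VALUE in a register of MODE, using X as the destination when
   new pseudos can no longer be created.  */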
1536 static rtx
1537 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1538 {
1539 if (can_create_pseudo_p ())
1540 return force_reg (mode, value);
1541 else
1542 {
1543 x = aarch64_emit_move (x, value);
1544 return x;
1545 }
1546 }
1547
1548
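/* Return an rtx representing REG plus OFFSET in MODE.  If OFFSET is not
   a valid immediate for an add instruction, force it into a register
   first (reusing TEMP when new pseudos cannot be created).  */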
1549 static rtx
1550 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1551 {
1552 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1553 {
1554 rtx high;
1555 /* Load the full offset into a register. This
1556 might be improvable in the future. */
1557 high = GEN_INT (offset);
1558 offset = 0;
1559 high = aarch64_force_temporary (mode, temp, high);
1560 reg = aarch64_force_temporary (mode, temp,
1561 gen_rtx_PLUS (mode, high, reg));
1562 }
1563 return plus_constant (mode, reg, offset);
1564 }
1565
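/* Return the number of instructions required to move immediate IMM of
   MODE into DEST, emitting them as well when GENERATE is true.  */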
1566 static int
1567 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1568 machine_mode mode)
1569 {
1570 int i;
1571 unsigned HOST_WIDE_INT val, val2, mask;
1572 int one_match, zero_match;
1573 int num_insns;
1574
1575 val = INTVAL (imm);
1576
1577 if (aarch64_move_imm (val, mode))
1578 {
1579 if (generate)
1580 emit_insn (gen_rtx_SET (dest, imm));
1581 return 1;
1582 }
1583
1584 if ((val >> 32) == 0 || mode == SImode)
1585 {
1586 if (generate)
1587 {
1588 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1589 if (mode == SImode)
1590 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1591 GEN_INT ((val >> 16) & 0xffff)));
1592 else
1593 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1594 GEN_INT ((val >> 16) & 0xffff)));
1595 }
1596 return 2;
1597 }
1598
1599 /* Remaining cases are all for DImode. */
1600
1601 mask = 0xffff;
1602 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1603 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1604 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1605 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1606
1607 if (zero_match != 2 && one_match != 2)
1608 {
1609 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1610 For a 64-bit bitmask try whether changing 16 bits to all ones or
1611 zeroes creates a valid bitmask. To check any repeated bitmask,
1612 try using 16 bits from the other 32-bit half of val. */
1613
1614 for (i = 0; i < 64; i += 16, mask <<= 16)
1615 {
1616 val2 = val & ~mask;
1617 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1618 break;
1619 val2 = val | mask;
1620 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1621 break;
1622 val2 = val2 & ~mask;
1623 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1624 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1625 break;
1626 }
1627 if (i != 64)
1628 {
1629 if (generate)
1630 {
1631 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1632 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1633 GEN_INT ((val >> i) & 0xffff)));
1634 }
1635 return 2;
1636 }
1637 }
1638
1639 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1640 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1641 otherwise skip zero bits. */
1642
1643 num_insns = 1;
1644 mask = 0xffff;
1645 val2 = one_match > zero_match ? ~val : val;
1646 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1647
1648 if (generate)
1649 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1650 ? (val | ~(mask << i))
1651 : (val & (mask << i)))));
1652 for (i += 16; i < 64; i += 16)
1653 {
1654 if ((val2 & (mask << i)) == 0)
1655 continue;
1656 if (generate)
1657 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1658 GEN_INT ((val >> i) & 0xffff)));
1659 num_insns++;
1660 }
1661
1662 return num_insns;
1663 }
1664
1665
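/* Expand a move of immediate or symbolic constant IMM into register
   DEST, handling symbol references (including GOT and TLS accesses)
   as well as plain integer constants.  */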
1666 void
1667 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1668 {
1669 machine_mode mode = GET_MODE (dest);
1670
1671 gcc_assert (mode == SImode || mode == DImode);
1672
1673 /* Check on what type of symbol it is. */
1674 if (GET_CODE (imm) == SYMBOL_REF
1675 || GET_CODE (imm) == LABEL_REF
1676 || GET_CODE (imm) == CONST)
1677 {
1678 rtx mem, base, offset;
1679 enum aarch64_symbol_type sty;
1680
1681 /* If we have (const (plus symbol offset)), separate out the offset
1682 before we start classifying the symbol. */
1683 split_const (imm, &base, &offset);
1684
1685 sty = aarch64_classify_symbol (base, offset);
1686 switch (sty)
1687 {
1688 case SYMBOL_FORCE_TO_MEM:
1689 if (offset != const0_rtx
1690 && targetm.cannot_force_const_mem (mode, imm))
1691 {
1692 gcc_assert (can_create_pseudo_p ());
1693 base = aarch64_force_temporary (mode, dest, base);
1694 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1695 aarch64_emit_move (dest, base);
1696 return;
1697 }
1698
1699 mem = force_const_mem (ptr_mode, imm);
1700 gcc_assert (mem);
1701
1702 /* If we aren't generating PC relative literals, then
1703 we need to expand the literal pool access carefully.
1704 This is something that needs to be done in a number
1705 of places, so could well live as a separate function. */
1706 if (aarch64_nopcrelative_literal_loads)
1707 {
1708 gcc_assert (can_create_pseudo_p ());
1709 base = gen_reg_rtx (ptr_mode);
1710 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1711 mem = gen_rtx_MEM (ptr_mode, base);
1712 }
1713
1714 if (mode != ptr_mode)
1715 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1716
1717 emit_insn (gen_rtx_SET (dest, mem));
1718
1719 return;
1720
1721 case SYMBOL_SMALL_TLSGD:
1722 case SYMBOL_SMALL_TLSDESC:
1723 case SYMBOL_SMALL_TLSIE:
1724 case SYMBOL_SMALL_GOT_28K:
1725 case SYMBOL_SMALL_GOT_4G:
1726 case SYMBOL_TINY_GOT:
1727 case SYMBOL_TINY_TLSIE:
1728 if (offset != const0_rtx)
1729 {
1730 gcc_assert (can_create_pseudo_p ());
1731 base = aarch64_force_temporary (mode, dest, base);
1732 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1733 aarch64_emit_move (dest, base);
1734 return;
1735 }
1736 /* FALLTHRU */
1737
1738 case SYMBOL_SMALL_ABSOLUTE:
1739 case SYMBOL_TINY_ABSOLUTE:
1740 case SYMBOL_TLSLE12:
1741 case SYMBOL_TLSLE24:
1742 case SYMBOL_TLSLE32:
1743 case SYMBOL_TLSLE48:
1744 aarch64_load_symref_appropriately (dest, imm, sty);
1745 return;
1746
1747 default:
1748 gcc_unreachable ();
1749 }
1750 }
1751
1752 if (!CONST_INT_P (imm))
1753 {
1754 if (GET_CODE (imm) == HIGH)
1755 emit_insn (gen_rtx_SET (dest, imm));
1756 else
1757 {
1758 rtx mem = force_const_mem (mode, imm);
1759 gcc_assert (mem);
1760 emit_insn (gen_rtx_SET (dest, mem));
1761 }
1762
1763 return;
1764 }
1765
1766 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1767 }
1768
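/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */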
1769 static bool
1770 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1771 tree exp ATTRIBUTE_UNUSED)
1772 {
1773 /* Currently, always true. */
1774 return true;
1775 }
1776
1777 /* Implement TARGET_PASS_BY_REFERENCE. */
1778
1779 static bool
1780 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1781 machine_mode mode,
1782 const_tree type,
1783 bool named ATTRIBUTE_UNUSED)
1784 {
1785 HOST_WIDE_INT size;
1786 machine_mode dummymode;
1787 int nregs;
1788
1789 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1790 size = (mode == BLKmode && type)
1791 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1792
1793 /* Aggregates are passed by reference based on their size. */
1794 if (type && AGGREGATE_TYPE_P (type))
1795 {
1796 size = int_size_in_bytes (type);
1797 }
1798
1799 /* Variable sized arguments are always returned by reference. */
1800 if (size < 0)
1801 return true;
1802
1803 /* Can this be a candidate to be passed in fp/simd register(s)? */
1804 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1805 &dummymode, &nregs,
1806 NULL))
1807 return false;
1808
1809 /* Arguments which are variable sized or larger than 2 registers are
1810 passed by reference unless they are a homogeneous floating-point
1811 aggregate. */
1812 return size > 2 * UNITS_PER_WORD;
1813 }
1814
1815 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1816 static bool
1817 aarch64_return_in_msb (const_tree valtype)
1818 {
1819 machine_mode dummy_mode;
1820 int dummy_int;
1821
1822 /* Never happens in little-endian mode. */
1823 if (!BYTES_BIG_ENDIAN)
1824 return false;
1825
1826 /* Only composite types smaller than or equal to 16 bytes can
1827 be potentially returned in registers. */
1828 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1829 || int_size_in_bytes (valtype) <= 0
1830 || int_size_in_bytes (valtype) > 16)
1831 return false;
1832
1833 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1834 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1835 is always passed/returned in the least significant bits of fp/simd
1836 register(s). */
1837 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1838 &dummy_mode, &dummy_int, NULL))
1839 return false;
1840
1841 return true;
1842 }
1843
1844 /* Implement TARGET_FUNCTION_VALUE.
1845 Define how to find the value returned by a function. */
1846
1847 static rtx
1848 aarch64_function_value (const_tree type, const_tree func,
1849 bool outgoing ATTRIBUTE_UNUSED)
1850 {
1851 machine_mode mode;
1852 int unsignedp;
1853 int count;
1854 machine_mode ag_mode;
1855
1856 mode = TYPE_MODE (type);
1857 if (INTEGRAL_TYPE_P (type))
1858 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1859
1860 if (aarch64_return_in_msb (type))
1861 {
1862 HOST_WIDE_INT size = int_size_in_bytes (type);
1863
1864 if (size % UNITS_PER_WORD != 0)
1865 {
1866 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1867 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1868 }
1869 }
1870
1871 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1872 &ag_mode, &count, NULL))
1873 {
1874 if (!aarch64_composite_type_p (type, mode))
1875 {
1876 gcc_assert (count == 1 && mode == ag_mode);
1877 return gen_rtx_REG (mode, V0_REGNUM);
1878 }
1879 else
1880 {
1881 int i;
1882 rtx par;
1883
1884 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1885 for (i = 0; i < count; i++)
1886 {
1887 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1888 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1889 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1890 XVECEXP (par, 0, i) = tmp;
1891 }
1892 return par;
1893 }
1894 }
1895 else
1896 return gen_rtx_REG (mode, R0_REGNUM);
1897 }
1898
1899 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1900 Return true if REGNO is the number of a hard register in which the values
1901 of called function may come back. */
1902
1903 static bool
1904 aarch64_function_value_regno_p (const unsigned int regno)
1905 {
1906 /* Maximum of 16 bytes can be returned in the general registers. Examples
1907 of 16-byte return values are: 128-bit integers and 16-byte small
1908 structures (excluding homogeneous floating-point aggregates). */
1909 if (regno == R0_REGNUM || regno == R1_REGNUM)
1910 return true;
1911
1912 /* Up to four fp/simd registers can return a function value, e.g. a
1913 homogeneous floating-point aggregate having four members. */
1914 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1915 return TARGET_FLOAT;
1916
1917 return false;
1918 }
1919
1920 /* Implement TARGET_RETURN_IN_MEMORY.
1921
1922 If the type T of the result of a function is such that
1923 void func (T arg)
1924 would require that arg be passed as a value in a register (or set of
1925 registers) according to the parameter passing rules, then the result
1926 is returned in the same registers as would be used for such an
1927 argument. */
1928
1929 static bool
1930 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1931 {
1932 HOST_WIDE_INT size;
1933 machine_mode ag_mode;
1934 int count;
1935
1936 if (!AGGREGATE_TYPE_P (type)
1937 && TREE_CODE (type) != COMPLEX_TYPE
1938 && TREE_CODE (type) != VECTOR_TYPE)
1939 /* Simple scalar types are always returned in registers. */
1940 return false;
1941
1942 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1943 type,
1944 &ag_mode,
1945 &count,
1946 NULL))
1947 return false;
1948
1949 /* Types larger than 2 registers are returned in memory. */
1950 size = int_size_in_bytes (type);
1951 return (size < 0 || size > 2 * UNITS_PER_WORD);
1952 }
1953
1954 static bool
1955 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1956 const_tree type, int *nregs)
1957 {
1958 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1959 return aarch64_vfp_is_call_or_return_candidate (mode,
1960 type,
1961 &pcum->aapcs_vfp_rmode,
1962 nregs,
1963 NULL);
1964 }
1965
1966 /* Given MODE and TYPE of a function argument, return the alignment in
1967 bits. The idea is to suppress any stronger alignment requested by
1968 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1969 This is a helper function for local use only. */
1970
1971 static unsigned int
1972 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1973 {
1974 unsigned int alignment;
1975
1976 if (type)
1977 {
1978 if (!integer_zerop (TYPE_SIZE (type)))
1979 {
1980 if (TYPE_MODE (type) == mode)
1981 alignment = TYPE_ALIGN (type);
1982 else
1983 alignment = GET_MODE_ALIGNMENT (mode);
1984 }
1985 else
1986 alignment = 0;
1987 }
1988 else
1989 alignment = GET_MODE_ALIGNMENT (mode);
1990
1991 return alignment;
1992 }
1993
1994 /* Layout a function argument according to the AAPCS64 rules. The rule
1995 numbers refer to the rule numbers in the AAPCS64. */
1996
1997 static void
1998 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1999 const_tree type,
2000 bool named ATTRIBUTE_UNUSED)
2001 {
2002 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2003 int ncrn, nvrn, nregs;
2004 bool allocate_ncrn, allocate_nvrn;
2005 HOST_WIDE_INT size;
2006
2007 /* We need to do this once per argument. */
2008 if (pcum->aapcs_arg_processed)
2009 return;
2010
2011 pcum->aapcs_arg_processed = true;
2012
2013 /* Size in bytes, rounded up to a multiple of 8 bytes. */
2014 size
2015 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2016 UNITS_PER_WORD);
2017
2018 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2019 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2020 mode,
2021 type,
2022 &nregs);
2023
2024 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
2025 reliable. The following code thus handles passing by SIMD/FP registers first. */
2026
2027 nvrn = pcum->aapcs_nvrn;
2028
2029 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2030 and homogeneous short-vector aggregates (HVA). */
2031 if (allocate_nvrn)
2032 {
2033 if (!TARGET_FLOAT)
2034 aarch64_err_no_fpadvsimd (mode, "argument");
2035
2036 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2037 {
2038 pcum->aapcs_nextnvrn = nvrn + nregs;
2039 if (!aarch64_composite_type_p (type, mode))
2040 {
2041 gcc_assert (nregs == 1);
2042 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2043 }
2044 else
2045 {
2046 rtx par;
2047 int i;
2048 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2049 for (i = 0; i < nregs; i++)
2050 {
2051 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2052 V0_REGNUM + nvrn + i);
2053 tmp = gen_rtx_EXPR_LIST
2054 (VOIDmode, tmp,
2055 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2056 XVECEXP (par, 0, i) = tmp;
2057 }
2058 pcum->aapcs_reg = par;
2059 }
2060 return;
2061 }
2062 else
2063 {
2064 /* C.3 NSRN is set to 8. */
2065 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2066 goto on_stack;
2067 }
2068 }
2069
2070 ncrn = pcum->aapcs_ncrn;
2071 nregs = size / UNITS_PER_WORD;
2072
2073 /* C6 - C9, though the sign- and zero-extension semantics are
2074 handled elsewhere. This is the case where the argument fits
2075 entirely in general registers. */
2076 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2077 {
2078 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2079
2080 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2081
2082 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2083 rounded up to the next even number. */
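/* For illustration (an example, not text from the AAPCS64 itself): an
   __int128 argument arriving when NGRN == 1 has 16-byte alignment, so
   NGRN is bumped to 2 and the argument is allocated X2 and X3, leaving
   X1 unused for argument passing.  */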
2084 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2085 {
2086 ++ncrn;
2087 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2088 }
2089 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2090 A reg is still generated for it, but the caller should be smart
2091 enough not to use it. */
2092 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2093 {
2094 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2095 }
2096 else
2097 {
2098 rtx par;
2099 int i;
2100
2101 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2102 for (i = 0; i < nregs; i++)
2103 {
2104 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2105 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2106 GEN_INT (i * UNITS_PER_WORD));
2107 XVECEXP (par, 0, i) = tmp;
2108 }
2109 pcum->aapcs_reg = par;
2110 }
2111
2112 pcum->aapcs_nextncrn = ncrn + nregs;
2113 return;
2114 }
2115
2116 /* C.11 */
2117 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2118
2119 /* The argument is passed on the stack; record the needed number of words for
2120 this argument and align the total size if necessary. */
2121 on_stack:
2122 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2123 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2124 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2125 16 / UNITS_PER_WORD);
2126 return;
2127 }
2128
2129 /* Implement TARGET_FUNCTION_ARG. */
2130
2131 static rtx
2132 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2133 const_tree type, bool named)
2134 {
2135 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2136 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2137
2138 if (mode == VOIDmode)
2139 return NULL_RTX;
2140
2141 aarch64_layout_arg (pcum_v, mode, type, named);
2142 return pcum->aapcs_reg;
2143 }
2144
2145 void
2146 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2147 const_tree fntype ATTRIBUTE_UNUSED,
2148 rtx libname ATTRIBUTE_UNUSED,
2149 const_tree fndecl ATTRIBUTE_UNUSED,
2150 unsigned n_named ATTRIBUTE_UNUSED)
2151 {
2152 pcum->aapcs_ncrn = 0;
2153 pcum->aapcs_nvrn = 0;
2154 pcum->aapcs_nextncrn = 0;
2155 pcum->aapcs_nextnvrn = 0;
2156 pcum->pcs_variant = ARM_PCS_AAPCS64;
2157 pcum->aapcs_reg = NULL_RTX;
2158 pcum->aapcs_arg_processed = false;
2159 pcum->aapcs_stack_words = 0;
2160 pcum->aapcs_stack_size = 0;
2161
2162 if (!TARGET_FLOAT
2163 && fndecl && TREE_PUBLIC (fndecl)
2164 && fntype && fntype != error_mark_node)
2165 {
2166 const_tree type = TREE_TYPE (fntype);
2167 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2168 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2169 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2170 &mode, &nregs, NULL))
2171 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2172 }
2173 return;
2174 }
2175
2176 static void
2177 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2178 machine_mode mode,
2179 const_tree type,
2180 bool named)
2181 {
2182 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2183 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2184 {
2185 aarch64_layout_arg (pcum_v, mode, type, named);
2186 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2187 != (pcum->aapcs_stack_words != 0));
2188 pcum->aapcs_arg_processed = false;
2189 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2190 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2191 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2192 pcum->aapcs_stack_words = 0;
2193 pcum->aapcs_reg = NULL_RTX;
2194 }
2195 }
2196
2197 bool
2198 aarch64_function_arg_regno_p (unsigned regno)
2199 {
2200 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2201 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2202 }
2203
2204 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2205 PARM_BOUNDARY bits of alignment, but will be given anything up
2206 to STACK_BOUNDARY bits if the type requires it. This makes sure
2207 that both before and after the layout of each argument, the Next
2208 Stacked Argument Address (NSAA) will have a minimum alignment of
2209 8 bytes. */
2210
2211 static unsigned int
2212 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2213 {
2214 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2215
2216 if (alignment < PARM_BOUNDARY)
2217 alignment = PARM_BOUNDARY;
2218 if (alignment > STACK_BOUNDARY)
2219 alignment = STACK_BOUNDARY;
2220 return alignment;
2221 }
2222
2223 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2224
2225 Return true if an argument passed on the stack should be padded upwards,
2226 i.e. if the least-significant byte of the stack slot has useful data.
2227
2228 Small aggregate types are placed in the lowest memory address.
2229
2230 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
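/* A rough illustration (not part of the AAPCS64 text): on a big-endian
   target an 'int' stack argument is padded downward, so its value occupies
   the highest-addressed bytes of its 8-byte slot, whereas a 3-byte struct
   is padded upward and starts at the slot's lowest address.  On
   little-endian targets everything starts at the lowest address.  */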
2231
2232 bool
2233 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2234 {
2235 /* On little-endian targets, the least significant byte of every stack
2236 argument is passed at the lowest byte address of the stack slot. */
2237 if (!BYTES_BIG_ENDIAN)
2238 return true;
2239
2240 /* Otherwise, integral, floating-point and pointer types are padded downward:
2241 the least significant byte of a stack argument is passed at the highest
2242 byte address of the stack slot. */
2243 if (type
2244 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2245 || POINTER_TYPE_P (type))
2246 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2247 return false;
2248
2249 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2250 return true;
2251 }
2252
2253 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2254
2255 It specifies the padding for the last (and possibly only)
2256 element of a block move between registers and memory. Viewing
2257 the block as it sits in memory, padding upward means that the
2258 last element is padded after its most significant byte, while
2259 with downward padding the last element is padded on its least
2260 significant byte side.
2261
2262 Small aggregates and small complex types are always padded
2263 upwards.
2264
2265 We don't need to worry about homogeneous floating-point or
2266 short-vector aggregates; their move is not affected by the
2267 padding direction determined here. Regardless of endianness,
2268 each element of such an aggregate is put in the least
2269 significant bits of a fp/simd register.
2270
2271 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2272 register has useful data, and return the opposite if the most
2273 significant byte does. */
2274
2275 bool
2276 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2277 bool first ATTRIBUTE_UNUSED)
2278 {
2279
2280 /* Small composite types are always padded upward. */
2281 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2282 {
2283 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2284 : GET_MODE_SIZE (mode));
2285 if (size < 2 * UNITS_PER_WORD)
2286 return true;
2287 }
2288
2289 /* Otherwise, use the default padding. */
2290 return !BYTES_BIG_ENDIAN;
2291 }
2292
2293 static machine_mode
2294 aarch64_libgcc_cmp_return_mode (void)
2295 {
2296 return SImode;
2297 }
2298
2299 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2300
2301 /* We use the 12-bit shifted immediate arithmetic instructions so values
2302 must be a multiple of (1 << 12), i.e. 4096. */
2303 #define ARITH_FACTOR 4096
2304
2305 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2306 #error Cannot use simple address calculation for stack probing
2307 #endif
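/* Worked example, assuming the default STACK_CHECK_PROBE_INTERVAL_EXP of 12:
   PROBE_INTERVAL is then 1 << 12 = 4096, an exact multiple of ARITH_FACTOR
   (4096), so the #error above does not trigger and the stack adjustments
   used for probing can be formed from the shifted-immediate add/sub forms.  */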
2308
2309 /* The pair of scratch registers used for stack probing. */
2310 #define PROBE_STACK_FIRST_REG 9
2311 #define PROBE_STACK_SECOND_REG 10
2312
2313 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2314 inclusive. These are offsets from the current stack pointer. */
2315
2316 static void
2317 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2318 {
2319 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2320
2321 /* See the same assertion on PROBE_INTERVAL above. */
2322 gcc_assert ((first % ARITH_FACTOR) == 0);
2323
2324 /* See if we have a constant small number of probes to generate. If so,
2325 that's the easy case. */
2326 if (size <= PROBE_INTERVAL)
2327 {
2328 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2329
2330 emit_set_insn (reg1,
2331 plus_constant (ptr_mode,
2332 stack_pointer_rtx, -(first + base)));
2333 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2334 }
2335
2336 /* The run-time loop is made up of 8 insns in the generic case while the
2337 compile-time (unrolled) sequence is made up of 4+2*(n-2) insns for n intervals. */
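  /* For instance, at n = 4 intervals the unrolled form costs
     4 + 2 * (4 - 2) = 8 insns, matching the generic run-time loop, which is
     presumably why unrolling is only used up to 4 * PROBE_INTERVAL.  */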
2338 else if (size <= 4 * PROBE_INTERVAL)
2339 {
2340 HOST_WIDE_INT i, rem;
2341
2342 emit_set_insn (reg1,
2343 plus_constant (ptr_mode,
2344 stack_pointer_rtx,
2345 -(first + PROBE_INTERVAL)));
2346 emit_stack_probe (reg1);
2347
2348 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2349 it exceeds SIZE. If only two probes are needed, this will not
2350 generate any code. Then probe at FIRST + SIZE. */
2351 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2352 {
2353 emit_set_insn (reg1,
2354 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2355 emit_stack_probe (reg1);
2356 }
2357
2358 rem = size - (i - PROBE_INTERVAL);
2359 if (rem > 256)
2360 {
2361 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2362
2363 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2364 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2365 }
2366 else
2367 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2368 }
2369
2370 /* Otherwise, do the same as above, but in a loop. Note that we must be
2371 extra careful with variables wrapping around because we might be at
2372 the very top (or the very bottom) of the address space and we have
2373 to be able to handle this case properly; in particular, we use an
2374 equality test for the loop condition. */
2375 else
2376 {
2377 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2378
2379 /* Step 1: round SIZE to the previous multiple of the interval. */
2380
2381 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2382
2383
2384 /* Step 2: compute initial and final value of the loop counter. */
2385
2386 /* TEST_ADDR = SP + FIRST. */
2387 emit_set_insn (reg1,
2388 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2389
2390 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2391 emit_set_insn (reg2,
2392 plus_constant (ptr_mode, stack_pointer_rtx,
2393 -(first + rounded_size)));
2394
2395
2396 /* Step 3: the loop
2397
2398 do
2399 {
2400 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2401 probe at TEST_ADDR
2402 }
2403 while (TEST_ADDR != LAST_ADDR)
2404
2405 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2406 until it is equal to ROUNDED_SIZE. */
2407
2408 if (ptr_mode == DImode)
2409 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2410 else
2411 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2412
2413
2414 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2415 that SIZE is equal to ROUNDED_SIZE. */
2416
2417 if (size != rounded_size)
2418 {
2419 HOST_WIDE_INT rem = size - rounded_size;
2420
2421 if (rem > 256)
2422 {
2423 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2424
2425 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2426 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2427 }
2428 else
2429 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2430 }
2431 }
2432
2433 /* Make sure nothing is scheduled before we are done. */
2434 emit_insn (gen_blockage ());
2435 }
2436
2437 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2438 absolute addresses. */
2439
2440 const char *
2441 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2442 {
2443 static int labelno = 0;
2444 char loop_lab[32];
2445 rtx xops[2];
2446
2447 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2448
2449 /* Loop. */
2450 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2451
2452 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2453 xops[0] = reg1;
2454 xops[1] = GEN_INT (PROBE_INTERVAL);
2455 output_asm_insn ("sub\t%0, %0, %1", xops);
2456
2457 /* Probe at TEST_ADDR. */
2458 output_asm_insn ("str\txzr, [%0]", xops);
2459
2460 /* Test if TEST_ADDR == LAST_ADDR. */
2461 xops[1] = reg2;
2462 output_asm_insn ("cmp\t%0, %1", xops);
2463
2464 /* Branch. */
2465 fputs ("\tb.ne\t", asm_out_file);
2466 assemble_name_raw (asm_out_file, loop_lab);
2467 fputc ('\n', asm_out_file);
2468
2469 return "";
2470 }
2471
2472 static bool
2473 aarch64_frame_pointer_required (void)
2474 {
2475 /* In aarch64_override_options_after_change
2476 flag_omit_leaf_frame_pointer turns off the frame pointer by
2477 default. Turn it back on now if we've not got a leaf
2478 function. */
2479 if (flag_omit_leaf_frame_pointer
2480 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2481 return true;
2482
2483 return false;
2484 }
2485
2486 /* Mark the registers that need to be saved by the callee and calculate
2487 the size of the callee-saved registers area and frame record (both FP
2488 and LR may be omitted). */
2489 static void
2490 aarch64_layout_frame (void)
2491 {
2492 HOST_WIDE_INT offset = 0;
2493 int regno;
2494
2495 if (reload_completed && cfun->machine->frame.laid_out)
2496 return;
2497
2498 #define SLOT_NOT_REQUIRED (-2)
2499 #define SLOT_REQUIRED (-1)
2500
2501 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2502 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2503
2504 /* First mark all the registers that really need to be saved... */
2505 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2506 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2507
2508 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2509 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2510
2511 /* ... that includes the eh data registers (if needed)... */
2512 if (crtl->calls_eh_return)
2513 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2514 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2515 = SLOT_REQUIRED;
2516
2517 /* ... and any callee saved register that dataflow says is live. */
2518 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2519 if (df_regs_ever_live_p (regno)
2520 && (regno == R30_REGNUM
2521 || !call_used_regs[regno]))
2522 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2523
2524 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2525 if (df_regs_ever_live_p (regno)
2526 && !call_used_regs[regno])
2527 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2528
2529 if (frame_pointer_needed)
2530 {
2531 /* FP and LR are placed in the linkage record. */
2532 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2533 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2534 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2535 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2536 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2537 offset += 2 * UNITS_PER_WORD;
2538 }
2539
2540 /* Now assign stack slots for them. */
2541 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2542 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2543 {
2544 cfun->machine->frame.reg_offset[regno] = offset;
2545 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2546 cfun->machine->frame.wb_candidate1 = regno;
2547 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2548 cfun->machine->frame.wb_candidate2 = regno;
2549 offset += UNITS_PER_WORD;
2550 }
2551
2552 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2553 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2554 {
2555 cfun->machine->frame.reg_offset[regno] = offset;
2556 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2557 cfun->machine->frame.wb_candidate1 = regno;
2558 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2559 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2560 cfun->machine->frame.wb_candidate2 = regno;
2561 offset += UNITS_PER_WORD;
2562 }
2563
2564 cfun->machine->frame.padding0 =
2565 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2566 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2567
2568 cfun->machine->frame.saved_regs_size = offset;
2569
2570 cfun->machine->frame.hard_fp_offset
2571 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2572 + get_frame_size ()
2573 + cfun->machine->frame.saved_regs_size,
2574 STACK_BOUNDARY / BITS_PER_UNIT);
2575
2576 cfun->machine->frame.frame_size
2577 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2578 + crtl->outgoing_args_size,
2579 STACK_BOUNDARY / BITS_PER_UNIT);
2580
2581 cfun->machine->frame.laid_out = true;
2582 }
2583
2584 static bool
2585 aarch64_register_saved_on_entry (int regno)
2586 {
2587 return cfun->machine->frame.reg_offset[regno] >= 0;
2588 }
2589
2590 static unsigned
2591 aarch64_next_callee_save (unsigned regno, unsigned limit)
2592 {
2593 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2594 regno ++;
2595 return regno;
2596 }
2597
2598 static void
2599 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2600 HOST_WIDE_INT adjustment)
2601 {
2602 rtx base_rtx = stack_pointer_rtx;
2603 rtx insn, reg, mem;
2604
2605 reg = gen_rtx_REG (mode, regno);
2606 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2607 plus_constant (Pmode, base_rtx, -adjustment));
2608 mem = gen_rtx_MEM (mode, mem);
2609
2610 insn = emit_move_insn (mem, reg);
2611 RTX_FRAME_RELATED_P (insn) = 1;
2612 }
2613
2614 static rtx
2615 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2616 HOST_WIDE_INT adjustment)
2617 {
2618 switch (mode)
2619 {
2620 case DImode:
2621 return gen_storewb_pairdi_di (base, base, reg, reg2,
2622 GEN_INT (-adjustment),
2623 GEN_INT (UNITS_PER_WORD - adjustment));
2624 case DFmode:
2625 return gen_storewb_pairdf_di (base, base, reg, reg2,
2626 GEN_INT (-adjustment),
2627 GEN_INT (UNITS_PER_WORD - adjustment));
2628 default:
2629 gcc_unreachable ();
2630 }
2631 }
2632
2633 static void
2634 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2635 unsigned regno2, HOST_WIDE_INT adjustment)
2636 {
2637 rtx_insn *insn;
2638 rtx reg1 = gen_rtx_REG (mode, regno1);
2639 rtx reg2 = gen_rtx_REG (mode, regno2);
2640
2641 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2642 reg2, adjustment));
2643 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2644 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2645 RTX_FRAME_RELATED_P (insn) = 1;
2646 }
2647
2648 static rtx
2649 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2650 HOST_WIDE_INT adjustment)
2651 {
2652 switch (mode)
2653 {
2654 case DImode:
2655 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2656 GEN_INT (UNITS_PER_WORD));
2657 case DFmode:
2658 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2659 GEN_INT (UNITS_PER_WORD));
2660 default:
2661 gcc_unreachable ();
2662 }
2663 }
2664
2665 static rtx
2666 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2667 rtx reg2)
2668 {
2669 switch (mode)
2670 {
2671 case DImode:
2672 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2673
2674 case DFmode:
2675 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2676
2677 default:
2678 gcc_unreachable ();
2679 }
2680 }
2681
2682 static rtx
2683 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2684 rtx mem2)
2685 {
2686 switch (mode)
2687 {
2688 case DImode:
2689 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2690
2691 case DFmode:
2692 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2693
2694 default:
2695 gcc_unreachable ();
2696 }
2697 }
2698
2699
2700 static void
2701 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2702 unsigned start, unsigned limit, bool skip_wb)
2703 {
2704 rtx_insn *insn;
2705 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2706 ? gen_frame_mem : gen_rtx_MEM);
2707 unsigned regno;
2708 unsigned regno2;
2709
2710 for (regno = aarch64_next_callee_save (start, limit);
2711 regno <= limit;
2712 regno = aarch64_next_callee_save (regno + 1, limit))
2713 {
2714 rtx reg, mem;
2715 HOST_WIDE_INT offset;
2716
2717 if (skip_wb
2718 && (regno == cfun->machine->frame.wb_candidate1
2719 || regno == cfun->machine->frame.wb_candidate2))
2720 continue;
2721
2722 reg = gen_rtx_REG (mode, regno);
2723 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2724 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2725 offset));
2726
2727 regno2 = aarch64_next_callee_save (regno + 1, limit);
2728
2729 if (regno2 <= limit
2730 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2731 == cfun->machine->frame.reg_offset[regno2]))
2732
2733 {
2734 rtx reg2 = gen_rtx_REG (mode, regno2);
2735 rtx mem2;
2736
2737 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2738 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2739 offset));
2740 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2741 reg2));
2742
2743 /* The first part of a frame-related parallel insn is
2744 always assumed to be relevant to the frame
2745 calculations; subsequent parts are only
2746 frame-related if explicitly marked. */
2747 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2748 regno = regno2;
2749 }
2750 else
2751 insn = emit_move_insn (mem, reg);
2752
2753 RTX_FRAME_RELATED_P (insn) = 1;
2754 }
2755 }
2756
2757 static void
2758 aarch64_restore_callee_saves (machine_mode mode,
2759 HOST_WIDE_INT start_offset, unsigned start,
2760 unsigned limit, bool skip_wb, rtx *cfi_ops)
2761 {
2762 rtx base_rtx = stack_pointer_rtx;
2763 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2764 ? gen_frame_mem : gen_rtx_MEM);
2765 unsigned regno;
2766 unsigned regno2;
2767 HOST_WIDE_INT offset;
2768
2769 for (regno = aarch64_next_callee_save (start, limit);
2770 regno <= limit;
2771 regno = aarch64_next_callee_save (regno + 1, limit))
2772 {
2773 rtx reg, mem;
2774
2775 if (skip_wb
2776 && (regno == cfun->machine->frame.wb_candidate1
2777 || regno == cfun->machine->frame.wb_candidate2))
2778 continue;
2779
2780 reg = gen_rtx_REG (mode, regno);
2781 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2782 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2783
2784 regno2 = aarch64_next_callee_save (regno + 1, limit);
2785
2786 if (regno2 <= limit
2787 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2788 == cfun->machine->frame.reg_offset[regno2]))
2789 {
2790 rtx reg2 = gen_rtx_REG (mode, regno2);
2791 rtx mem2;
2792
2793 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2794 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2795 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2796
2797 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2798 regno = regno2;
2799 }
2800 else
2801 emit_move_insn (reg, mem);
2802 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2803 }
2804 }
2805
2806 /* AArch64 stack frames generated by this compiler look like:
2807
2808 +-------------------------------+
2809 | |
2810 | incoming stack arguments |
2811 | |
2812 +-------------------------------+
2813 | | <-- incoming stack pointer (aligned)
2814 | callee-allocated save area |
2815 | for register varargs |
2816 | |
2817 +-------------------------------+
2818 | local variables | <-- frame_pointer_rtx
2819 | |
2820 +-------------------------------+
2821 | padding0 | \
2822 +-------------------------------+ |
2823 | callee-saved registers | | frame.saved_regs_size
2824 +-------------------------------+ |
2825 | LR' | |
2826 +-------------------------------+ |
2827 | FP' | / <- hard_frame_pointer_rtx (aligned)
2828 +-------------------------------+
2829 | dynamic allocation |
2830 +-------------------------------+
2831 | padding |
2832 +-------------------------------+
2833 | outgoing stack arguments | <-- arg_pointer
2834 | |
2835 +-------------------------------+
2836 | | <-- stack_pointer_rtx (aligned)
2837
2838 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2839 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2840 unchanged. */
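/* A minimal worked example (assuming the usual 16-byte STACK_BOUNDARY): a
   function that needs a frame pointer and saves only FP, LR, X19 and X20,
   with no locals, no varargs save area and no outgoing arguments, gets
   reg_offset[] values of 0, 8, 16 and 24 respectively, so
   saved_regs_size = 32, hard_fp_offset = 32 and frame_size = 32.  */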
2841
2842 /* Generate the prologue instructions for entry into a function.
2843 Establish the stack frame by decreasing the stack pointer with a
2844 properly calculated size and, if necessary, create a frame record
2845 filled with the values of LR and previous frame pointer. The
2846 current FP is also set up if it is in use. */
2847
2848 void
2849 aarch64_expand_prologue (void)
2850 {
2851 /* sub sp, sp, #<frame_size>
2852 stp {fp, lr}, [sp, #<frame_size> - 16]
2853 add fp, sp, #<frame_size> - hardfp_offset
2854 stp {cs_reg}, [fp, #-16] etc.
2855
2856 sub sp, sp, <final_adjustment_if_any>
2857 */
2858 HOST_WIDE_INT frame_size, offset;
2859 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2860 HOST_WIDE_INT hard_fp_offset;
2861 rtx_insn *insn;
2862
2863 aarch64_layout_frame ();
2864
2865 offset = frame_size = cfun->machine->frame.frame_size;
2866 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2867 fp_offset = frame_size - hard_fp_offset;
2868
2869 if (flag_stack_usage_info)
2870 current_function_static_stack_size = frame_size;
2871
2872 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
2873 {
2874 if (crtl->is_leaf && !cfun->calls_alloca)
2875 {
2876 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
2877 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
2878 frame_size - STACK_CHECK_PROTECT);
2879 }
2880 else if (frame_size > 0)
2881 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
2882 }
2883
2884 /* Store pairs and load pairs have a range of only -512 to 504. */
2885 if (offset >= 512)
2886 {
2887 /* When the frame has a large size, the stack pointer is first decreased
2888 to step over the callee-allocated save area for
2889 register varargs, the local variable area and/or the callee-saved
2890 register area. This will allow the pre-index write-back
2891 store pair instructions to be used for setting up the stack frame
2892 efficiently. */
2893 offset = hard_fp_offset;
2894 if (offset >= 512)
2895 offset = cfun->machine->frame.saved_regs_size;
2896
2897 frame_size -= (offset + crtl->outgoing_args_size);
2898 fp_offset = 0;
2899
2900 if (frame_size >= 0x1000000)
2901 {
2902 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2903 emit_move_insn (op0, GEN_INT (-frame_size));
2904 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2905
2906 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2907 gen_rtx_SET (stack_pointer_rtx,
2908 plus_constant (Pmode, stack_pointer_rtx,
2909 -frame_size)));
2910 RTX_FRAME_RELATED_P (insn) = 1;
2911 }
2912 else if (frame_size > 0)
2913 {
2914 int hi_ofs = frame_size & 0xfff000;
2915 int lo_ofs = frame_size & 0x000fff;
2916
2917 if (hi_ofs)
2918 {
2919 insn = emit_insn (gen_add2_insn
2920 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2921 RTX_FRAME_RELATED_P (insn) = 1;
2922 }
2923 if (lo_ofs)
2924 {
2925 insn = emit_insn (gen_add2_insn
2926 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2927 RTX_FRAME_RELATED_P (insn) = 1;
2928 }
2929 }
2930 }
2931 else
2932 frame_size = -1;
2933
2934 if (offset > 0)
2935 {
2936 bool skip_wb = false;
2937
2938 if (frame_pointer_needed)
2939 {
2940 skip_wb = true;
2941
2942 if (fp_offset)
2943 {
2944 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2945 GEN_INT (-offset)));
2946 RTX_FRAME_RELATED_P (insn) = 1;
2947
2948 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2949 R30_REGNUM, false);
2950 }
2951 else
2952 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2953
2954 /* Set up frame pointer to point to the location of the
2955 previous frame pointer on the stack. */
2956 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2957 stack_pointer_rtx,
2958 GEN_INT (fp_offset)));
2959 RTX_FRAME_RELATED_P (insn) = 1;
2960 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2961 }
2962 else
2963 {
2964 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2965 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2966
2967 if (fp_offset
2968 || reg1 == FIRST_PSEUDO_REGISTER
2969 || (reg2 == FIRST_PSEUDO_REGISTER
2970 && offset >= 256))
2971 {
2972 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2973 GEN_INT (-offset)));
2974 RTX_FRAME_RELATED_P (insn) = 1;
2975 }
2976 else
2977 {
2978 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2979
2980 skip_wb = true;
2981
2982 if (reg2 == FIRST_PSEUDO_REGISTER)
2983 aarch64_pushwb_single_reg (mode1, reg1, offset);
2984 else
2985 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2986 }
2987 }
2988
2989 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2990 skip_wb);
2991 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2992 skip_wb);
2993 }
2994
2995 /* when offset >= 512,
2996 sub sp, sp, #<outgoing_args_size> */
2997 if (frame_size > -1)
2998 {
2999 if (crtl->outgoing_args_size > 0)
3000 {
3001 insn = emit_insn (gen_add2_insn
3002 (stack_pointer_rtx,
3003 GEN_INT (- crtl->outgoing_args_size)));
3004 RTX_FRAME_RELATED_P (insn) = 1;
3005 }
3006 }
3007 }
3008
3009 /* Return TRUE if we can use a simple_return insn.
3010
3011 This function checks whether the callee-saved stack is empty, which
3012 means no restore actions are needed. The pro_and_epilogue pass uses
3013 this to check whether the shrink-wrapping optimization is feasible. */
3014
3015 bool
3016 aarch64_use_return_insn_p (void)
3017 {
3018 if (!reload_completed)
3019 return false;
3020
3021 if (crtl->profile)
3022 return false;
3023
3024 aarch64_layout_frame ();
3025
3026 return cfun->machine->frame.frame_size == 0;
3027 }
3028
3029 /* Generate the epilogue instructions for returning from a function. */
3030 void
3031 aarch64_expand_epilogue (bool for_sibcall)
3032 {
3033 HOST_WIDE_INT frame_size, offset;
3034 HOST_WIDE_INT fp_offset;
3035 HOST_WIDE_INT hard_fp_offset;
3036 rtx_insn *insn;
3037 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3038 bool need_barrier_p = (get_frame_size () != 0
3039 || cfun->machine->frame.saved_varargs_size);
3040
3041 aarch64_layout_frame ();
3042
3043 offset = frame_size = cfun->machine->frame.frame_size;
3044 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
3045 fp_offset = frame_size - hard_fp_offset;
3046
3047 /* Store pairs and load pairs have a range of only -512 to 504. */
3048 if (offset >= 512)
3049 {
3050 offset = hard_fp_offset;
3051 if (offset >= 512)
3052 offset = cfun->machine->frame.saved_regs_size;
3053
3054 frame_size -= (offset + crtl->outgoing_args_size);
3055 fp_offset = 0;
3056 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
3057 {
3058 insn = emit_insn (gen_add2_insn
3059 (stack_pointer_rtx,
3060 GEN_INT (crtl->outgoing_args_size)));
3061 RTX_FRAME_RELATED_P (insn) = 1;
3062 }
3063 }
3064 else
3065 frame_size = -1;
3066
3067 /* If there were outgoing arguments or we've done dynamic stack
3068 allocation, then restore the stack pointer from the frame
3069 pointer. This is at most one insn and more efficient than using
3070 GCC's internal mechanism. */
3071 if (frame_pointer_needed
3072 && (crtl->outgoing_args_size || cfun->calls_alloca))
3073 {
3074 if (cfun->calls_alloca)
3075 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3076
3077 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3078 hard_frame_pointer_rtx,
3079 GEN_INT (0)));
3080 offset = offset - fp_offset;
3081 }
3082
3083 if (offset > 0)
3084 {
3085 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3086 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3087 bool skip_wb = true;
3088 rtx cfi_ops = NULL;
3089
3090 if (frame_pointer_needed)
3091 fp_offset = 0;
3092 else if (fp_offset
3093 || reg1 == FIRST_PSEUDO_REGISTER
3094 || (reg2 == FIRST_PSEUDO_REGISTER
3095 && offset >= 256))
3096 skip_wb = false;
3097
3098 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
3099 skip_wb, &cfi_ops);
3100 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
3101 skip_wb, &cfi_ops);
3102
3103 if (need_barrier_p)
3104 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3105
3106 if (skip_wb)
3107 {
3108 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
3109 rtx rreg1 = gen_rtx_REG (mode1, reg1);
3110
3111 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
3112 if (reg2 == FIRST_PSEUDO_REGISTER)
3113 {
3114 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
3115 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3116 mem = gen_rtx_MEM (mode1, mem);
3117 insn = emit_move_insn (rreg1, mem);
3118 }
3119 else
3120 {
3121 rtx rreg2 = gen_rtx_REG (mode1, reg2);
3122
3123 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
3124 insn = emit_insn (aarch64_gen_loadwb_pair
3125 (mode1, stack_pointer_rtx, rreg1,
3126 rreg2, offset));
3127 }
3128 }
3129 else
3130 {
3131 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
3132 GEN_INT (offset)));
3133 }
3134
3135 /* Reset the CFA to be SP + FRAME_SIZE. */
3136 rtx new_cfa = stack_pointer_rtx;
3137 if (frame_size > 0)
3138 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
3139 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3140 REG_NOTES (insn) = cfi_ops;
3141 RTX_FRAME_RELATED_P (insn) = 1;
3142 }
3143
3144 if (frame_size > 0)
3145 {
3146 if (need_barrier_p)
3147 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3148
3149 if (frame_size >= 0x1000000)
3150 {
3151 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3152 emit_move_insn (op0, GEN_INT (frame_size));
3153 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
3154 }
3155 else
3156 {
3157 int hi_ofs = frame_size & 0xfff000;
3158 int lo_ofs = frame_size & 0x000fff;
3159
3160 if (hi_ofs && lo_ofs)
3161 {
3162 insn = emit_insn (gen_add2_insn
3163 (stack_pointer_rtx, GEN_INT (hi_ofs)));
3164 RTX_FRAME_RELATED_P (insn) = 1;
3165 frame_size = lo_ofs;
3166 }
3167 insn = emit_insn (gen_add2_insn
3168 (stack_pointer_rtx, GEN_INT (frame_size)));
3169 }
3170
3171 /* Reset the CFA to be SP + 0. */
3172 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
3173 RTX_FRAME_RELATED_P (insn) = 1;
3174 }
3175
3176 /* Stack adjustment for exception handler. */
3177 if (crtl->calls_eh_return)
3178 {
3179 /* We need to unwind the stack by the offset computed by
3180 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3181 to be SP; letting the CFA move during this adjustment
3182 is just as correct as retaining the CFA from the body
3183 of the function. Therefore, do nothing special. */
3184 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3185 }
3186
3187 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3188 if (!for_sibcall)
3189 emit_jump_insn (ret_rtx);
3190 }
3191
3192 /* Return the place to copy the exception unwinding return address to.
3193 This will probably be a stack slot, but could (in theory) be the
3194 return register. */
3195 rtx
3196 aarch64_final_eh_return_addr (void)
3197 {
3198 HOST_WIDE_INT fp_offset;
3199
3200 aarch64_layout_frame ();
3201
3202 fp_offset = cfun->machine->frame.frame_size
3203 - cfun->machine->frame.hard_fp_offset;
3204
3205 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3206 return gen_rtx_REG (DImode, LR_REGNUM);
3207
3208 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3209 result in a store to save LR introduced by builtin_eh_return () being
3210 incorrectly deleted because the alias is not detected.
3211 So in the calculation of the address to copy the exception unwinding
3212 return address to, we note 2 cases.
3213 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3214 we return a SP-relative location since all the addresses are SP-relative
3215 in this case. This prevents the store from being optimized away.
3216 If the fp_offset is not 0, then the addresses will be FP-relative and
3217 therefore we return a FP-relative location. */
3218
3219 if (frame_pointer_needed)
3220 {
3221 if (fp_offset)
3222 return gen_frame_mem (DImode,
3223 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3224 else
3225 return gen_frame_mem (DImode,
3226 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3227 }
3228
3229 /* If FP is not needed, we calculate the location of LR, which would be
3230 at the top of the saved registers block. */
3231
3232 return gen_frame_mem (DImode,
3233 plus_constant (Pmode,
3234 stack_pointer_rtx,
3235 fp_offset
3236 + cfun->machine->frame.saved_regs_size
3237 - 2 * UNITS_PER_WORD));
3238 }
3239
3240 /* Possibly output code to build up a constant in a register. For
3241 the benefit of the costs infrastructure, returns the number of
3242 instructions which would be emitted. GENERATE inhibits or
3243 enables code generation. */
3244
3245 static int
3246 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
3247 {
3248 int insns = 0;
3249
3250 if (aarch64_bitmask_imm (val, DImode))
3251 {
3252 if (generate)
3253 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
3254 insns = 1;
3255 }
3256 else
3257 {
3258 int i;
3259 int ncount = 0;
3260 int zcount = 0;
3261 HOST_WIDE_INT valp = val >> 16;
3262 HOST_WIDE_INT valm;
3263 HOST_WIDE_INT tval;
3264
3265 for (i = 16; i < 64; i += 16)
3266 {
3267 valm = (valp & 0xffff);
3268
3269 if (valm != 0)
3270 ++ zcount;
3271
3272 if (valm != 0xffff)
3273 ++ ncount;
3274
3275 valp >>= 16;
3276 }
3277
3278 /* zcount contains the number of additional MOVK instructions
3279 required if the constant is built up with an initial MOVZ instruction,
3280 while ncount is the number of MOVK instructions required if starting
3281 with a MOVN instruction. Choose the sequence that yields the fewest
3282 instructions, preferring MOVZ instructions when both counts are
3283 the same. */
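      /* For illustration (values chosen arbitrarily): for
	 0xffffffffffff1234 the three upper 16-bit chunks are all 0xffff,
	 so ncount == 0 and zcount == 3 and the single initial MOVN-style
	 move suffices (insns == 1); for 0x1234 the upper chunks are all
	 zero, so zcount == 0 and ncount == 3 and a single MOVZ-style move
	 is emitted instead.  */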
3284 if (ncount < zcount)
3285 {
3286 if (generate)
3287 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3288 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3289 tval = 0xffff;
3290 insns++;
3291 }
3292 else
3293 {
3294 if (generate)
3295 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3296 GEN_INT (val & 0xffff));
3297 tval = 0;
3298 insns++;
3299 }
3300
3301 val >>= 16;
3302
3303 for (i = 16; i < 64; i += 16)
3304 {
3305 if ((val & 0xffff) != tval)
3306 {
3307 if (generate)
3308 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3309 GEN_INT (i),
3310 GEN_INT (val & 0xffff)));
3311 insns++;
3312 }
3313 val >>= 16;
3314 }
3315 }
3316 return insns;
3317 }
3318
3319 static void
3320 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3321 {
3322 HOST_WIDE_INT mdelta = delta;
3323 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3324 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3325
3326 if (mdelta < 0)
3327 mdelta = -mdelta;
3328
3329 if (mdelta >= 4096 * 4096)
3330 {
3331 (void) aarch64_build_constant (scratchreg, delta, true);
3332 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3333 }
3334 else if (mdelta > 0)
3335 {
3336 if (mdelta >= 4096)
3337 {
3338 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3339 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3340 if (delta < 0)
3341 emit_insn (gen_rtx_SET (this_rtx,
3342 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3343 else
3344 emit_insn (gen_rtx_SET (this_rtx,
3345 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3346 }
3347 if (mdelta % 4096 != 0)
3348 {
3349 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3350 emit_insn (gen_rtx_SET (this_rtx,
3351 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3352 }
3353 }
3354 }
3355
3356 /* Output code to add DELTA to the first argument, and then jump
3357 to FUNCTION. Used for C++ multiple inheritance. */
3358 static void
3359 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3360 HOST_WIDE_INT delta,
3361 HOST_WIDE_INT vcall_offset,
3362 tree function)
3363 {
3364 /* The this pointer is always in x0. Note that this differs from
3365 Arm where the this pointer may be bumped to r1 if r0 is required
3366 to return a pointer to an aggregate. On AArch64 a result value
3367 pointer will be in x8. */
3368 int this_regno = R0_REGNUM;
3369 rtx this_rtx, temp0, temp1, addr, funexp;
3370 rtx_insn *insn;
3371
3372 reload_completed = 1;
3373 emit_note (NOTE_INSN_PROLOGUE_END);
3374
3375 if (vcall_offset == 0)
3376 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3377 else
3378 {
3379 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3380
3381 this_rtx = gen_rtx_REG (Pmode, this_regno);
3382 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3383 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3384
3385 addr = this_rtx;
3386 if (delta != 0)
3387 {
3388 if (delta >= -256 && delta < 256)
3389 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3390 plus_constant (Pmode, this_rtx, delta));
3391 else
3392 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3393 }
3394
3395 if (Pmode == ptr_mode)
3396 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3397 else
3398 aarch64_emit_move (temp0,
3399 gen_rtx_ZERO_EXTEND (Pmode,
3400 gen_rtx_MEM (ptr_mode, addr)));
3401
3402 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3403 addr = plus_constant (Pmode, temp0, vcall_offset);
3404 else
3405 {
3406 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3407 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3408 }
3409
3410 if (Pmode == ptr_mode)
3411 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3412 else
3413 aarch64_emit_move (temp1,
3414 gen_rtx_SIGN_EXTEND (Pmode,
3415 gen_rtx_MEM (ptr_mode, addr)));
3416
3417 emit_insn (gen_add2_insn (this_rtx, temp1));
3418 }
3419
3420 /* Generate a tail call to the target function. */
3421 if (!TREE_USED (function))
3422 {
3423 assemble_external (function);
3424 TREE_USED (function) = 1;
3425 }
3426 funexp = XEXP (DECL_RTL (function), 0);
3427 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3428 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3429 SIBLING_CALL_P (insn) = 1;
3430
3431 insn = get_insns ();
3432 shorten_branches (insn);
3433 final_start_function (insn, file, 1);
3434 final (insn, file, 1);
3435 final_end_function ();
3436
3437 /* Stop pretending to be a post-reload pass. */
3438 reload_completed = 0;
3439 }
3440
3441 static bool
3442 aarch64_tls_referenced_p (rtx x)
3443 {
3444 if (!TARGET_HAVE_TLS)
3445 return false;
3446 subrtx_iterator::array_type array;
3447 FOR_EACH_SUBRTX (iter, array, x, ALL)
3448 {
3449 const_rtx x = *iter;
3450 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3451 return true;
3452 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3453 TLS offsets, not real symbol references. */
3454 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3455 iter.skip_subrtxes ();
3456 }
3457 return false;
3458 }
3459
3460
3461 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3462 a left shift of 0 or 12 bits. */
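/* For example (illustrative values): 0x123 and 0x123000 both satisfy this
   (shift 0 and shift 12 respectively), while 0x123400 does not, because its
   set bits span both halves.  */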
3463 bool
3464 aarch64_uimm12_shift (HOST_WIDE_INT val)
3465 {
3466 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3467 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3468 );
3469 }
3470
3471
3472 /* Return true if val is an immediate that can be loaded into a
3473 register by a MOVZ instruction. */
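/* For example (illustrative values): 0x12340000 is accepted for DImode,
   since its only non-zero 16-bit chunk sits at bit 16, whereas 0x12340001
   is rejected because two chunks would have to be set.  */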
3474 static bool
3475 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3476 {
3477 if (GET_MODE_SIZE (mode) > 4)
3478 {
3479 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3480 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3481 return 1;
3482 }
3483 else
3484 {
3485 /* Ignore sign extension. */
3486 val &= (HOST_WIDE_INT) 0xffffffff;
3487 }
3488 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3489 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3490 }
3491
3492 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3493
3494 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3495 {
3496 0x0000000100000001ull,
3497 0x0001000100010001ull,
3498 0x0101010101010101ull,
3499 0x1111111111111111ull,
3500 0x5555555555555555ull,
3501 };
3502
3503
3504 /* Return true if val is a valid bitmask immediate. */
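/* For example (illustrative DImode values): 0x00ff00ff00ff00ff is valid,
   being a run of 8 ones replicated every 16 bits, and so is
   0x0ffffffffffffff0 (a single contiguous run), whereas 0x1234 is not,
   since its set bits do not form a repeated contiguous run.  */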
3505
3506 bool
3507 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3508 {
3509 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3510 int bits;
3511
3512 /* Check for a single sequence of one bits and return quickly if so.
3513 The special cases of all ones and all zeroes return false. */
3514 val = (unsigned HOST_WIDE_INT) val_in;
3515 tmp = val + (val & -val);
3516
3517 if (tmp == (tmp & -tmp))
3518 return (val + 1) > 1;
3519
3520 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3521 if (mode == SImode)
3522 val = (val << 32) | (val & 0xffffffff);
3523
3524 /* Invert if the immediate doesn't start with a zero bit - this means we
3525 only need to search for sequences of one bits. */
3526 if (val & 1)
3527 val = ~val;
3528
3529 /* Find the first set bit and set tmp to val with the first sequence of one
3530 bits removed. Return success if there is a single sequence of ones. */
3531 first_one = val & -val;
3532 tmp = val & (val + first_one);
3533
3534 if (tmp == 0)
3535 return true;
3536
3537 /* Find the next set bit and compute the difference in bit position. */
3538 next_one = tmp & -tmp;
3539 bits = clz_hwi (first_one) - clz_hwi (next_one);
3540 mask = val ^ tmp;
3541
3542 /* Check that the bit position difference is a power of 2, and that the first
3543 sequence of one bits fits within 'bits' bits. */
3544 if ((mask >> bits) != 0 || bits != (bits & -bits))
3545 return false;
3546
3547 /* Check the sequence of one bits is repeated 64/bits times. */
3548 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3549 }
3550
3551
3552 /* Return true if val is an immediate that can be loaded into a
3553 register in a single instruction. */
3554 bool
3555 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3556 {
3557 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3558 return 1;
3559 return aarch64_bitmask_imm (val, mode);
3560 }
3561
3562 static bool
3563 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3564 {
3565 rtx base, offset;
3566
3567 if (GET_CODE (x) == HIGH)
3568 return true;
3569
3570 split_const (x, &base, &offset);
3571 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3572 {
3573 if (aarch64_classify_symbol (base, offset)
3574 != SYMBOL_FORCE_TO_MEM)
3575 return true;
3576 else
3577 /* Avoid generating a 64-bit relocation in ILP32; leave it
3578 to aarch64_expand_mov_immediate to handle properly. */
3579 return mode != ptr_mode;
3580 }
3581
3582 return aarch64_tls_referenced_p (x);
3583 }
3584
3585 /* Implement TARGET_CASE_VALUES_THRESHOLD. */
3586
3587 static unsigned int
3588 aarch64_case_values_threshold (void)
3589 {
3590 /* Use the specified limit for the number of cases before using jump
3591 tables at higher optimization levels. */
3592 if (optimize > 2
3593 && selected_cpu->tune->max_case_values != 0)
3594 return selected_cpu->tune->max_case_values;
3595 else
3596 return default_case_values_threshold ();
3597 }
3598
3599 /* Return true if register REGNO is a valid index register.
3600 STRICT_P is true if REG_OK_STRICT is in effect. */
3601
3602 bool
3603 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3604 {
3605 if (!HARD_REGISTER_NUM_P (regno))
3606 {
3607 if (!strict_p)
3608 return true;
3609
3610 if (!reg_renumber)
3611 return false;
3612
3613 regno = reg_renumber[regno];
3614 }
3615 return GP_REGNUM_P (regno);
3616 }
3617
3618 /* Return true if register REGNO is a valid base register.
3619 STRICT_P is true if REG_OK_STRICT is in effect. */
3620
3621 bool
3622 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3623 {
3624 if (!HARD_REGISTER_NUM_P (regno))
3625 {
3626 if (!strict_p)
3627 return true;
3628
3629 if (!reg_renumber)
3630 return false;
3631
3632 regno = reg_renumber[regno];
3633 }
3634
3635 /* The fake registers will be eliminated to either the stack or
3636 hard frame pointer, both of which are usually valid base registers.
3637 Reload deals with the cases where the eliminated form isn't valid. */
3638 return (GP_REGNUM_P (regno)
3639 || regno == SP_REGNUM
3640 || regno == FRAME_POINTER_REGNUM
3641 || regno == ARG_POINTER_REGNUM);
3642 }
3643
3644 /* Return true if X is a valid base register.
3645 STRICT_P is true if REG_OK_STRICT is in effect. */
3646
3647 static bool
3648 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3649 {
3650 if (!strict_p && GET_CODE (x) == SUBREG)
3651 x = SUBREG_REG (x);
3652
3653 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3654 }
3655
3656 /* Return true if address offset is a valid index. If it is, fill in INFO
3657 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
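/* For instance (an illustrative case, not an exhaustive list): for a DImode
   access, an index of the form (mult:DI (reg:DI) (const_int 8)), i.e. the
   "[Xn, Xm, lsl #3]" addressing form, is classified as ADDRESS_REG_REG
   with shift 3.  */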
3658
3659 static bool
3660 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3661 machine_mode mode, bool strict_p)
3662 {
3663 enum aarch64_address_type type;
3664 rtx index;
3665 int shift;
3666
3667 /* (reg:P) */
3668 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3669 && GET_MODE (x) == Pmode)
3670 {
3671 type = ADDRESS_REG_REG;
3672 index = x;
3673 shift = 0;
3674 }
3675 /* (sign_extend:DI (reg:SI)) */
3676 else if ((GET_CODE (x) == SIGN_EXTEND
3677 || GET_CODE (x) == ZERO_EXTEND)
3678 && GET_MODE (x) == DImode
3679 && GET_MODE (XEXP (x, 0)) == SImode)
3680 {
3681 type = (GET_CODE (x) == SIGN_EXTEND)
3682 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3683 index = XEXP (x, 0);
3684 shift = 0;
3685 }
3686 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3687 else if (GET_CODE (x) == MULT
3688 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3689 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3690 && GET_MODE (XEXP (x, 0)) == DImode
3691 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3692 && CONST_INT_P (XEXP (x, 1)))
3693 {
3694 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3695 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3696 index = XEXP (XEXP (x, 0), 0);
3697 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3698 }
3699 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3700 else if (GET_CODE (x) == ASHIFT
3701 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3702 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3703 && GET_MODE (XEXP (x, 0)) == DImode
3704 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3705 && CONST_INT_P (XEXP (x, 1)))
3706 {
3707 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3708 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3709 index = XEXP (XEXP (x, 0), 0);
3710 shift = INTVAL (XEXP (x, 1));
3711 }
3712 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3713 else if ((GET_CODE (x) == SIGN_EXTRACT
3714 || GET_CODE (x) == ZERO_EXTRACT)
3715 && GET_MODE (x) == DImode
3716 && GET_CODE (XEXP (x, 0)) == MULT
3717 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3718 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3719 {
3720 type = (GET_CODE (x) == SIGN_EXTRACT)
3721 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3722 index = XEXP (XEXP (x, 0), 0);
3723 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3724 if (INTVAL (XEXP (x, 1)) != 32 + shift
3725 || INTVAL (XEXP (x, 2)) != 0)
3726 shift = -1;
3727 }
3728 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3729 (const_int 0xffffffff<<shift)) */
3730 else if (GET_CODE (x) == AND
3731 && GET_MODE (x) == DImode
3732 && GET_CODE (XEXP (x, 0)) == MULT
3733 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3734 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3735 && CONST_INT_P (XEXP (x, 1)))
3736 {
3737 type = ADDRESS_REG_UXTW;
3738 index = XEXP (XEXP (x, 0), 0);
3739 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3740 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3741 shift = -1;
3742 }
3743 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3744 else if ((GET_CODE (x) == SIGN_EXTRACT
3745 || GET_CODE (x) == ZERO_EXTRACT)
3746 && GET_MODE (x) == DImode
3747 && GET_CODE (XEXP (x, 0)) == ASHIFT
3748 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3749 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3750 {
3751 type = (GET_CODE (x) == SIGN_EXTRACT)
3752 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3753 index = XEXP (XEXP (x, 0), 0);
3754 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3755 if (INTVAL (XEXP (x, 1)) != 32 + shift
3756 || INTVAL (XEXP (x, 2)) != 0)
3757 shift = -1;
3758 }
3759 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3760 (const_int 0xffffffff<<shift)) */
3761 else if (GET_CODE (x) == AND
3762 && GET_MODE (x) == DImode
3763 && GET_CODE (XEXP (x, 0)) == ASHIFT
3764 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3765 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3766 && CONST_INT_P (XEXP (x, 1)))
3767 {
3768 type = ADDRESS_REG_UXTW;
3769 index = XEXP (XEXP (x, 0), 0);
3770 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3771 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3772 shift = -1;
3773 }
3774 /* (mult:P (reg:P) (const_int scale)) */
3775 else if (GET_CODE (x) == MULT
3776 && GET_MODE (x) == Pmode
3777 && GET_MODE (XEXP (x, 0)) == Pmode
3778 && CONST_INT_P (XEXP (x, 1)))
3779 {
3780 type = ADDRESS_REG_REG;
3781 index = XEXP (x, 0);
3782 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3783 }
3784 /* (ashift:P (reg:P) (const_int shift)) */
3785 else if (GET_CODE (x) == ASHIFT
3786 && GET_MODE (x) == Pmode
3787 && GET_MODE (XEXP (x, 0)) == Pmode
3788 && CONST_INT_P (XEXP (x, 1)))
3789 {
3790 type = ADDRESS_REG_REG;
3791 index = XEXP (x, 0);
3792 shift = INTVAL (XEXP (x, 1));
3793 }
3794 else
3795 return false;
3796
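  /* For example, (ashift:DI (sign_extend:DI (reg:SI)) (const_int 2)) is
     classified above as ADDRESS_REG_SXTW with a shift of 2; the checks below
     then accept it only for a 4-byte access, since (1 << shift) must equal
     GET_MODE_SIZE (mode).  */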
3797 if (GET_CODE (index) == SUBREG)
3798 index = SUBREG_REG (index);
3799
3800 if ((shift == 0 ||
3801 (shift > 0 && shift <= 3
3802 && (1 << shift) == GET_MODE_SIZE (mode)))
3803 && REG_P (index)
3804 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3805 {
3806 info->type = type;
3807 info->offset = index;
3808 info->shift = shift;
3809 return true;
3810 }
3811
3812 return false;
3813 }
3814
3815 bool
3816 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3817 {
3818 return (offset >= -64 * GET_MODE_SIZE (mode)
3819 && offset < 64 * GET_MODE_SIZE (mode)
3820 && offset % GET_MODE_SIZE (mode) == 0);
3821 }
3822
3823 static inline bool
3824 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3825 HOST_WIDE_INT offset)
3826 {
3827 return offset >= -256 && offset < 256;
3828 }
3829
3830 static inline bool
3831 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3832 {
3833 return (offset >= 0
3834 && offset < 4096 * GET_MODE_SIZE (mode)
3835 && offset % GET_MODE_SIZE (mode) == 0);
3836 }
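
/* Worked examples of the three offset predicates above, for a DImode
   access (GET_MODE_SIZE == 8):
     7-bit signed scaled:     multiples of 8 in [-512, 504]
     9-bit signed unscaled:   any offset in [-256, 255]
     12-bit unsigned scaled:  multiples of 8 in [0, 32760].  */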
3837
3838 /* Return true if MODE is one of the modes for which we
3839 support LDP/STP operations. */
3840
3841 static bool
3842 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3843 {
3844 return mode == SImode || mode == DImode
3845 || mode == SFmode || mode == DFmode
3846 || (aarch64_vector_mode_supported_p (mode)
3847 && GET_MODE_SIZE (mode) == 8);
3848 }
3849
3850 /* Return true if X is a valid address for machine mode MODE. If it is,
3851 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3852 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3853
3854 static bool
3855 aarch64_classify_address (struct aarch64_address_info *info,
3856 rtx x, machine_mode mode,
3857 RTX_CODE outer_code, bool strict_p)
3858 {
3859 enum rtx_code code = GET_CODE (x);
3860 rtx op0, op1;
3861
3862 /* On BE, we use load/store pair for all large int mode load/stores. */
3863 bool load_store_pair_p = (outer_code == PARALLEL
3864 || (BYTES_BIG_ENDIAN
3865 && aarch64_vect_struct_mode_p (mode)));
3866
3867 bool allow_reg_index_p =
3868 !load_store_pair_p
3869 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3870 && !aarch64_vect_struct_mode_p (mode);
3871
3872 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3873 REG addressing. */
3874 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3875 && (code != POST_INC && code != REG))
3876 return false;
3877
3878 switch (code)
3879 {
3880 case REG:
3881 case SUBREG:
3882 info->type = ADDRESS_REG_IMM;
3883 info->base = x;
3884 info->offset = const0_rtx;
3885 return aarch64_base_register_rtx_p (x, strict_p);
3886
3887 case PLUS:
3888 op0 = XEXP (x, 0);
3889 op1 = XEXP (x, 1);
3890
3891 if (! strict_p
3892 && REG_P (op0)
3893 && (op0 == virtual_stack_vars_rtx
3894 || op0 == frame_pointer_rtx
3895 || op0 == arg_pointer_rtx)
3896 && CONST_INT_P (op1))
3897 {
3898 info->type = ADDRESS_REG_IMM;
3899 info->base = op0;
3900 info->offset = op1;
3901
3902 return true;
3903 }
3904
3905 if (GET_MODE_SIZE (mode) != 0
3906 && CONST_INT_P (op1)
3907 && aarch64_base_register_rtx_p (op0, strict_p))
3908 {
3909 HOST_WIDE_INT offset = INTVAL (op1);
3910
3911 info->type = ADDRESS_REG_IMM;
3912 info->base = op0;
3913 info->offset = op1;
3914
3915 /* TImode and TFmode values are allowed in both pairs of X
3916 registers and individual Q registers. The available
3917 address modes are:
3918 X,X: 7-bit signed scaled offset
3919 Q: 9-bit signed offset
3920 We conservatively require an offset representable in both modes.
3921 */
3922 if (mode == TImode || mode == TFmode)
3923 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3924 && offset_9bit_signed_unscaled_p (mode, offset));
3925
3926 /* A 7-bit offset check because OImode will emit an ldp/stp
3927 instruction (only big endian will get here).
3928 For ldp/stp instructions, the offset is scaled for the size of a
3929 single element of the pair. */
3930 if (mode == OImode)
3931 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3932
3933 /* Three 9/12-bit offset checks because CImode will emit three
3934 ldr/str instructions (only big endian will get here). */
3935 if (mode == CImode)
3936 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3937 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3938 || offset_12bit_unsigned_scaled_p (V16QImode,
3939 offset + 32)));
3940
3941 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3942 instructions (only big endian will get here). */
3943 if (mode == XImode)
3944 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3945 && aarch64_offset_7bit_signed_scaled_p (TImode,
3946 offset + 32));
3947
3948 if (load_store_pair_p)
3949 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3950 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3951 else
3952 return (offset_9bit_signed_unscaled_p (mode, offset)
3953 || offset_12bit_unsigned_scaled_p (mode, offset));
3954 }
3955
3956 if (allow_reg_index_p)
3957 {
3958 /* Look for base + (scaled/extended) index register. */
3959 if (aarch64_base_register_rtx_p (op0, strict_p)
3960 && aarch64_classify_index (info, op1, mode, strict_p))
3961 {
3962 info->base = op0;
3963 return true;
3964 }
3965 if (aarch64_base_register_rtx_p (op1, strict_p)
3966 && aarch64_classify_index (info, op0, mode, strict_p))
3967 {
3968 info->base = op1;
3969 return true;
3970 }
3971 }
3972
3973 return false;
3974
3975 case POST_INC:
3976 case POST_DEC:
3977 case PRE_INC:
3978 case PRE_DEC:
3979 info->type = ADDRESS_REG_WB;
3980 info->base = XEXP (x, 0);
3981 info->offset = NULL_RTX;
3982 return aarch64_base_register_rtx_p (info->base, strict_p);
3983
3984 case POST_MODIFY:
3985 case PRE_MODIFY:
3986 info->type = ADDRESS_REG_WB;
3987 info->base = XEXP (x, 0);
3988 if (GET_CODE (XEXP (x, 1)) == PLUS
3989 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3990 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3991 && aarch64_base_register_rtx_p (info->base, strict_p))
3992 {
3993 HOST_WIDE_INT offset;
3994 info->offset = XEXP (XEXP (x, 1), 1);
3995 offset = INTVAL (info->offset);
3996
3997 /* TImode and TFmode values are allowed in both pairs of X
3998 registers and individual Q registers. The available
3999 address modes are:
4000 X,X: 7-bit signed scaled offset
4001 Q: 9-bit signed offset
4002 We conservatively require an offset representable in both modes.
4003 */
4004 if (mode == TImode || mode == TFmode)
4005 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4006 && offset_9bit_signed_unscaled_p (mode, offset));
4007
4008 if (load_store_pair_p)
4009 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4010 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4011 else
4012 return offset_9bit_signed_unscaled_p (mode, offset);
4013 }
4014 return false;
4015
4016 case CONST:
4017 case SYMBOL_REF:
4018 case LABEL_REF:
4019 /* load literal: pc-relative constant pool entry. Only supported
4020 for SI mode or larger. */
4021 info->type = ADDRESS_SYMBOLIC;
4022
4023 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4024 {
4025 rtx sym, addend;
4026
4027 split_const (x, &sym, &addend);
4028 return ((GET_CODE (sym) == LABEL_REF
4029 || (GET_CODE (sym) == SYMBOL_REF
4030 && CONSTANT_POOL_ADDRESS_P (sym)
4031 && !aarch64_nopcrelative_literal_loads)));
4032 }
4033 return false;
4034
4035 case LO_SUM:
4036 info->type = ADDRESS_LO_SUM;
4037 info->base = XEXP (x, 0);
4038 info->offset = XEXP (x, 1);
4039 if (allow_reg_index_p
4040 && aarch64_base_register_rtx_p (info->base, strict_p))
4041 {
4042 rtx sym, offs;
4043 split_const (info->offset, &sym, &offs);
4044 if (GET_CODE (sym) == SYMBOL_REF
4045 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4046 {
4047 /* The symbol and offset must be aligned to the access size. */
4048 unsigned int align;
4049 unsigned int ref_size;
4050
4051 if (CONSTANT_POOL_ADDRESS_P (sym))
4052 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4053 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4054 {
4055 tree exp = SYMBOL_REF_DECL (sym);
4056 align = TYPE_ALIGN (TREE_TYPE (exp));
4057 align = CONSTANT_ALIGNMENT (exp, align);
4058 }
4059 else if (SYMBOL_REF_DECL (sym))
4060 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4061 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4062 && SYMBOL_REF_BLOCK (sym) != NULL)
4063 align = SYMBOL_REF_BLOCK (sym)->alignment;
4064 else
4065 align = BITS_PER_UNIT;
4066
4067 ref_size = GET_MODE_SIZE (mode);
4068 if (ref_size == 0)
4069 ref_size = GET_MODE_SIZE (DImode);
4070
4071 return ((INTVAL (offs) & (ref_size - 1)) == 0
4072 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4073 }
4074 }
4075 return false;
4076
4077 default:
4078 return false;
4079 }
4080 }
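
/* For instance, for a DImode access outside a load/store pair,
   (plus:DI (reg:DI) (const_int 8)) is classified above as ADDRESS_REG_IMM
   (8 satisfies both the 9-bit unscaled and the 12-bit scaled checks), while
   (plus:DI (reg:DI) (mult:DI (reg:DI) (const_int 8))) is classified as
   ADDRESS_REG_REG with a shift of 3.  */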
4081
4082 bool
4083 aarch64_symbolic_address_p (rtx x)
4084 {
4085 rtx offset;
4086
4087 split_const (x, &x, &offset);
4088 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4089 }
4090
4091 /* Classify the base of symbolic expression X. */
4092
4093 enum aarch64_symbol_type
4094 aarch64_classify_symbolic_expression (rtx x)
4095 {
4096 rtx offset;
4097
4098 split_const (x, &x, &offset);
4099 return aarch64_classify_symbol (x, offset);
4100 }
4101
4102
4103 /* Return TRUE if X is a legitimate address for accessing memory in
4104 mode MODE. */
4105 static bool
4106 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4107 {
4108 struct aarch64_address_info addr;
4109
4110 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4111 }
4112
4113 /* Return TRUE if X is a legitimate address for accessing memory in
4114 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4115 pair operation. */
4116 bool
4117 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4118 RTX_CODE outer_code, bool strict_p)
4119 {
4120 struct aarch64_address_info addr;
4121
4122 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4123 }
4124
4125 /* Return TRUE if rtx X is the immediate constant 0.0. */
4126 bool
4127 aarch64_float_const_zero_rtx_p (rtx x)
4128 {
4129 if (GET_MODE (x) == VOIDmode)
4130 return false;
4131
4132 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4133 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4134 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4135 }
4136
4137 /* Return the fixed registers used for condition codes. */
4138
4139 static bool
4140 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4141 {
4142 *p1 = CC_REGNUM;
4143 *p2 = INVALID_REGNUM;
4144 return true;
4145 }
4146
4147 /* Emit call insn with PAT and do aarch64-specific handling. */
4148
4149 void
4150 aarch64_emit_call_insn (rtx pat)
4151 {
4152 rtx insn = emit_call_insn (pat);
4153
4154 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4155 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4156 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4157 }
4158
4159 machine_mode
4160 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4161 {
4162 /* All floating point compares return CCFP if it is an equality
4163 comparison, and CCFPE otherwise. */
4164 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4165 {
4166 switch (code)
4167 {
4168 case EQ:
4169 case NE:
4170 case UNORDERED:
4171 case ORDERED:
4172 case UNLT:
4173 case UNLE:
4174 case UNGT:
4175 case UNGE:
4176 case UNEQ:
4177 case LTGT:
4178 return CCFPmode;
4179
4180 case LT:
4181 case LE:
4182 case GT:
4183 case GE:
4184 return CCFPEmode;
4185
4186 default:
4187 gcc_unreachable ();
4188 }
4189 }
4190
4191 /* Equality comparisons of short modes against zero can be performed
4192 using the TST instruction with the appropriate bitmask. */
4193 if (y == const0_rtx && REG_P (x)
4194 && (code == EQ || code == NE)
4195 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4196 return CC_NZmode;
4197
4198 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4199 && y == const0_rtx
4200 && (code == EQ || code == NE || code == LT || code == GE)
4201 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4202 || GET_CODE (x) == NEG
4203 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4204 && CONST_INT_P (XEXP (x, 2)))))
4205 return CC_NZmode;
4206
4207 /* A compare with a shifted operand. Because of canonicalization,
4208 the comparison will have to be swapped when we emit the assembly
4209 code. */
4210 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4211 && (REG_P (y) || GET_CODE (y) == SUBREG)
4212 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4213 || GET_CODE (x) == LSHIFTRT
4214 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4215 return CC_SWPmode;
4216
4217 /* Similarly for a negated operand, but we can only do this for
4218 equalities. */
4219 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4220 && (REG_P (y) || GET_CODE (y) == SUBREG)
4221 && (code == EQ || code == NE)
4222 && GET_CODE (x) == NEG)
4223 return CC_Zmode;
4224
4225 /* A compare of a mode narrower than SI mode against zero can be done
4226 by extending the value in the comparison. */
4227 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
4228 && y == const0_rtx)
4229 /* Only use sign-extension if we really need it. */
4230 return ((code == GT || code == GE || code == LE || code == LT)
4231 ? CC_SESWPmode : CC_ZESWPmode);
4232
4233 /* A test for unsigned overflow. */
4234 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4235 && code == NE
4236 && GET_CODE (x) == PLUS
4237 && GET_CODE (y) == ZERO_EXTEND)
4238 return CC_Cmode;
4239
4240 /* For everything else, return CCmode. */
4241 return CCmode;
4242 }
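
/* For example, an EQ or NE test of a QImode or HImode register against zero
   selects CC_NZmode above (so it can be implemented with TST), while a
   compare whose first operand is a shift selects CC_SWPmode, recording that
   the operands must be swapped when the assembly is emitted.  */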
4243
4244 static int
4245 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4246
4247 int
4248 aarch64_get_condition_code (rtx x)
4249 {
4250 machine_mode mode = GET_MODE (XEXP (x, 0));
4251 enum rtx_code comp_code = GET_CODE (x);
4252
4253 if (GET_MODE_CLASS (mode) != MODE_CC)
4254 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4255 return aarch64_get_condition_code_1 (mode, comp_code);
4256 }
4257
4258 static int
4259 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4260 {
4261 switch (mode)
4262 {
4263 case CCFPmode:
4264 case CCFPEmode:
4265 switch (comp_code)
4266 {
4267 case GE: return AARCH64_GE;
4268 case GT: return AARCH64_GT;
4269 case LE: return AARCH64_LS;
4270 case LT: return AARCH64_MI;
4271 case NE: return AARCH64_NE;
4272 case EQ: return AARCH64_EQ;
4273 case ORDERED: return AARCH64_VC;
4274 case UNORDERED: return AARCH64_VS;
4275 case UNLT: return AARCH64_LT;
4276 case UNLE: return AARCH64_LE;
4277 case UNGT: return AARCH64_HI;
4278 case UNGE: return AARCH64_PL;
4279 default: return -1;
4280 }
4281 break;
4282
4283 case CCmode:
4284 switch (comp_code)
4285 {
4286 case NE: return AARCH64_NE;
4287 case EQ: return AARCH64_EQ;
4288 case GE: return AARCH64_GE;
4289 case GT: return AARCH64_GT;
4290 case LE: return AARCH64_LE;
4291 case LT: return AARCH64_LT;
4292 case GEU: return AARCH64_CS;
4293 case GTU: return AARCH64_HI;
4294 case LEU: return AARCH64_LS;
4295 case LTU: return AARCH64_CC;
4296 default: return -1;
4297 }
4298 break;
4299
4300 case CC_SWPmode:
4301 case CC_ZESWPmode:
4302 case CC_SESWPmode:
4303 switch (comp_code)
4304 {
4305 case NE: return AARCH64_NE;
4306 case EQ: return AARCH64_EQ;
4307 case GE: return AARCH64_LE;
4308 case GT: return AARCH64_LT;
4309 case LE: return AARCH64_GE;
4310 case LT: return AARCH64_GT;
4311 case GEU: return AARCH64_LS;
4312 case GTU: return AARCH64_CC;
4313 case LEU: return AARCH64_CS;
4314 case LTU: return AARCH64_HI;
4315 default: return -1;
4316 }
4317 break;
4318
4319 case CC_NZmode:
4320 switch (comp_code)
4321 {
4322 case NE: return AARCH64_NE;
4323 case EQ: return AARCH64_EQ;
4324 case GE: return AARCH64_PL;
4325 case LT: return AARCH64_MI;
4326 default: return -1;
4327 }
4328 break;
4329
4330 case CC_Zmode:
4331 switch (comp_code)
4332 {
4333 case NE: return AARCH64_NE;
4334 case EQ: return AARCH64_EQ;
4335 default: return -1;
4336 }
4337 break;
4338
4339 case CC_Cmode:
4340 switch (comp_code)
4341 {
4342 case NE: return AARCH64_CS;
4343 case EQ: return AARCH64_CC;
4344 default: return -1;
4345 }
4346 break;
4347
4348 default:
4349 return -1;
4350 break;
4351 }
4352
4353 return -1;
4354 }
4355
4356 bool
4357 aarch64_const_vec_all_same_in_range_p (rtx x,
4358 HOST_WIDE_INT minval,
4359 HOST_WIDE_INT maxval)
4360 {
4361 HOST_WIDE_INT firstval;
4362 int count, i;
4363
4364 if (GET_CODE (x) != CONST_VECTOR
4365 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4366 return false;
4367
4368 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4369 if (firstval < minval || firstval > maxval)
4370 return false;
4371
4372 count = CONST_VECTOR_NUNITS (x);
4373 for (i = 1; i < count; i++)
4374 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4375 return false;
4376
4377 return true;
4378 }
4379
4380 bool
4381 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4382 {
4383 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4384 }
4385
4386
4387 /* N Z C V. */
4388 #define AARCH64_CC_V 1
4389 #define AARCH64_CC_C (1 << 1)
4390 #define AARCH64_CC_Z (1 << 2)
4391 #define AARCH64_CC_N (1 << 3)
4392
4393 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4394 static const int aarch64_nzcv_codes[] =
4395 {
4396 0, /* EQ, Z == 1. */
4397 AARCH64_CC_Z, /* NE, Z == 0. */
4398 0, /* CS, C == 1. */
4399 AARCH64_CC_C, /* CC, C == 0. */
4400 0, /* MI, N == 1. */
4401 AARCH64_CC_N, /* PL, N == 0. */
4402 0, /* VS, V == 1. */
4403 AARCH64_CC_V, /* VC, V == 0. */
4404 0, /* HI, C == 1 && Z == 0. */
4405 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4406 AARCH64_CC_V, /* GE, N == V. */
4407 0, /* LT, N != V. */
4408 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4409 0, /* LE, !(Z == 0 && N == V). */
4410 0, /* AL, Any. */
4411 0 /* NV, Any. */
4412 };
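
/* As an example of the table above, aarch64_nzcv_codes[AARCH64_EQ] is 0 and
   aarch64_nzcv_codes[AARCH64_NE] is AARCH64_CC_Z (4); these are the values
   the '%k' operand modifier below prints as the nzcv immediate of a
   conditional compare.  */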
4413
4414 static void
4415 aarch64_print_operand (FILE *f, rtx x, int code)
4416 {
4417 switch (code)
4418 {
4419 /* An integer or symbol address without a preceding # sign. */
4420 case 'c':
4421 switch (GET_CODE (x))
4422 {
4423 case CONST_INT:
4424 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4425 break;
4426
4427 case SYMBOL_REF:
4428 output_addr_const (f, x);
4429 break;
4430
4431 case CONST:
4432 if (GET_CODE (XEXP (x, 0)) == PLUS
4433 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4434 {
4435 output_addr_const (f, x);
4436 break;
4437 }
4438 /* Fall through. */
4439
4440 default:
4441 output_operand_lossage ("Unsupported operand for code '%c'", code);
4442 }
4443 break;
4444
4445 case 'e':
4446 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4447 {
4448 int n;
4449
4450 if (!CONST_INT_P (x)
4451 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4452 {
4453 output_operand_lossage ("invalid operand for '%%%c'", code);
4454 return;
4455 }
4456
4457 switch (n)
4458 {
4459 case 3:
4460 fputc ('b', f);
4461 break;
4462 case 4:
4463 fputc ('h', f);
4464 break;
4465 case 5:
4466 fputc ('w', f);
4467 break;
4468 default:
4469 output_operand_lossage ("invalid operand for '%%%c'", code);
4470 return;
4471 }
4472 }
4473 break;
4474
4475 case 'p':
4476 {
4477 int n;
4478
4479 /* Print N such that 2^N == X. */
4480 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4481 {
4482 output_operand_lossage ("invalid operand for '%%%c'", code);
4483 return;
4484 }
4485
4486 asm_fprintf (f, "%d", n);
4487 }
4488 break;
4489
4490 case 'P':
4491 /* Print the number of non-zero bits in X (a const_int). */
4492 if (!CONST_INT_P (x))
4493 {
4494 output_operand_lossage ("invalid operand for '%%%c'", code);
4495 return;
4496 }
4497
4498 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4499 break;
4500
4501 case 'H':
4502 /* Print the higher numbered register of a pair (TImode) of regs. */
4503 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4504 {
4505 output_operand_lossage ("invalid operand for '%%%c'", code);
4506 return;
4507 }
4508
4509 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4510 break;
4511
4512 case 'M':
4513 case 'm':
4514 {
4515 int cond_code;
4516 /* Print a condition (eq, ne, etc) or its inverse. */
4517
4518 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4519 if (x == const_true_rtx)
4520 {
4521 if (code == 'M')
4522 fputs ("nv", f);
4523 return;
4524 }
4525
4526 if (!COMPARISON_P (x))
4527 {
4528 output_operand_lossage ("invalid operand for '%%%c'", code);
4529 return;
4530 }
4531
4532 cond_code = aarch64_get_condition_code (x);
4533 gcc_assert (cond_code >= 0);
4534 if (code == 'M')
4535 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4536 fputs (aarch64_condition_codes[cond_code], f);
4537 }
4538 break;
4539
4540 case 'b':
4541 case 'h':
4542 case 's':
4543 case 'd':
4544 case 'q':
4545 /* Print a scalar FP/SIMD register name. */
4546 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4547 {
4548 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4549 return;
4550 }
4551 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4552 break;
4553
4554 case 'S':
4555 case 'T':
4556 case 'U':
4557 case 'V':
4558 /* Print the first FP/SIMD register name in a list. */
4559 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4560 {
4561 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4562 return;
4563 }
4564 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4565 break;
4566
4567 case 'R':
4568 /* Print a scalar FP/SIMD register name + 1. */
4569 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4570 {
4571 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4572 return;
4573 }
4574 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4575 break;
4576
4577 case 'X':
4578 /* Print bottom 16 bits of integer constant in hex. */
4579 if (!CONST_INT_P (x))
4580 {
4581 output_operand_lossage ("invalid operand for '%%%c'", code);
4582 return;
4583 }
4584 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4585 break;
4586
4587 case 'w':
4588 case 'x':
4589 /* Print a general register name or the zero register (32-bit or
4590 64-bit). */
4591 if (x == const0_rtx
4592 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4593 {
4594 asm_fprintf (f, "%czr", code);
4595 break;
4596 }
4597
4598 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4599 {
4600 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4601 break;
4602 }
4603
4604 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4605 {
4606 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4607 break;
4608 }
4609
4610 /* Fall through */
4611
4612 case 0:
4613 /* Print a normal operand, if it's a general register, then we
4614 assume DImode. */
4615 if (x == NULL)
4616 {
4617 output_operand_lossage ("missing operand");
4618 return;
4619 }
4620
4621 switch (GET_CODE (x))
4622 {
4623 case REG:
4624 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4625 break;
4626
4627 case MEM:
4628 output_address (GET_MODE (x), XEXP (x, 0));
4629 break;
4630
4631 case CONST:
4632 case LABEL_REF:
4633 case SYMBOL_REF:
4634 output_addr_const (asm_out_file, x);
4635 break;
4636
4637 case CONST_INT:
4638 asm_fprintf (f, "%wd", INTVAL (x));
4639 break;
4640
4641 case CONST_VECTOR:
4642 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4643 {
4644 gcc_assert (
4645 aarch64_const_vec_all_same_in_range_p (x,
4646 HOST_WIDE_INT_MIN,
4647 HOST_WIDE_INT_MAX));
4648 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4649 }
4650 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4651 {
4652 fputc ('0', f);
4653 }
4654 else
4655 gcc_unreachable ();
4656 break;
4657
4658 case CONST_DOUBLE:
4659 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4660 be getting CONST_DOUBLEs holding integers. */
4661 gcc_assert (GET_MODE (x) != VOIDmode);
4662 if (aarch64_float_const_zero_rtx_p (x))
4663 {
4664 fputc ('0', f);
4665 break;
4666 }
4667 else if (aarch64_float_const_representable_p (x))
4668 {
4669 #define buf_size 20
4670 char float_buf[buf_size] = {'\0'};
4671 real_to_decimal_for_mode (float_buf,
4672 CONST_DOUBLE_REAL_VALUE (x),
4673 buf_size, buf_size,
4674 1, GET_MODE (x));
4675 asm_fprintf (asm_out_file, "%s", float_buf);
4676 break;
4677 #undef buf_size
4678 }
4679 output_operand_lossage ("invalid constant");
4680 return;
4681 default:
4682 output_operand_lossage ("invalid operand");
4683 return;
4684 }
4685 break;
4686
4687 case 'A':
4688 if (GET_CODE (x) == HIGH)
4689 x = XEXP (x, 0);
4690
4691 switch (aarch64_classify_symbolic_expression (x))
4692 {
4693 case SYMBOL_SMALL_GOT_4G:
4694 asm_fprintf (asm_out_file, ":got:");
4695 break;
4696
4697 case SYMBOL_SMALL_TLSGD:
4698 asm_fprintf (asm_out_file, ":tlsgd:");
4699 break;
4700
4701 case SYMBOL_SMALL_TLSDESC:
4702 asm_fprintf (asm_out_file, ":tlsdesc:");
4703 break;
4704
4705 case SYMBOL_SMALL_TLSIE:
4706 asm_fprintf (asm_out_file, ":gottprel:");
4707 break;
4708
4709 case SYMBOL_TLSLE24:
4710 asm_fprintf (asm_out_file, ":tprel:");
4711 break;
4712
4713 case SYMBOL_TINY_GOT:
4714 gcc_unreachable ();
4715 break;
4716
4717 default:
4718 break;
4719 }
4720 output_addr_const (asm_out_file, x);
4721 break;
4722
4723 case 'L':
4724 switch (aarch64_classify_symbolic_expression (x))
4725 {
4726 case SYMBOL_SMALL_GOT_4G:
4727 asm_fprintf (asm_out_file, ":lo12:");
4728 break;
4729
4730 case SYMBOL_SMALL_TLSGD:
4731 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4732 break;
4733
4734 case SYMBOL_SMALL_TLSDESC:
4735 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4736 break;
4737
4738 case SYMBOL_SMALL_TLSIE:
4739 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4740 break;
4741
4742 case SYMBOL_TLSLE12:
4743 asm_fprintf (asm_out_file, ":tprel_lo12:");
4744 break;
4745
4746 case SYMBOL_TLSLE24:
4747 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4748 break;
4749
4750 case SYMBOL_TINY_GOT:
4751 asm_fprintf (asm_out_file, ":got:");
4752 break;
4753
4754 case SYMBOL_TINY_TLSIE:
4755 asm_fprintf (asm_out_file, ":gottprel:");
4756 break;
4757
4758 default:
4759 break;
4760 }
4761 output_addr_const (asm_out_file, x);
4762 break;
4763
4764 case 'G':
4765
4766 switch (aarch64_classify_symbolic_expression (x))
4767 {
4768 case SYMBOL_TLSLE24:
4769 asm_fprintf (asm_out_file, ":tprel_hi12:");
4770 break;
4771 default:
4772 break;
4773 }
4774 output_addr_const (asm_out_file, x);
4775 break;
4776
4777 case 'k':
4778 {
4779 HOST_WIDE_INT cond_code;
4780 /* Print nzcv. */
4781
4782 if (!CONST_INT_P (x))
4783 {
4784 output_operand_lossage ("invalid operand for '%%%c'", code);
4785 return;
4786 }
4787
4788 cond_code = INTVAL (x);
4789 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4790 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4791 }
4792 break;
4793
4794 default:
4795 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4796 return;
4797 }
4798 }
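
/* A few examples of the modifiers handled above: '%w0' and '%x0' print a
   general register as "w<n>"/"x<n>" (or "wzr"/"xzr" for a zero constant and
   "wsp"/"sp" for the stack pointer); '%d0' prints an FP/SIMD register as
   "d<n>"; '%X0' prints the low 16 bits of a constant in hex; '%m0' and '%M0'
   print a condition or its inverse, e.g. "eq" or "ne".  */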
4799
4800 static void
4801 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4802 {
4803 struct aarch64_address_info addr;
4804
4805 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4806 switch (addr.type)
4807 {
4808 case ADDRESS_REG_IMM:
4809 if (addr.offset == const0_rtx)
4810 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4811 else
4812 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4813 INTVAL (addr.offset));
4814 return;
4815
4816 case ADDRESS_REG_REG:
4817 if (addr.shift == 0)
4818 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4819 reg_names [REGNO (addr.offset)]);
4820 else
4821 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4822 reg_names [REGNO (addr.offset)], addr.shift);
4823 return;
4824
4825 case ADDRESS_REG_UXTW:
4826 if (addr.shift == 0)
4827 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4828 REGNO (addr.offset) - R0_REGNUM);
4829 else
4830 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4831 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4832 return;
4833
4834 case ADDRESS_REG_SXTW:
4835 if (addr.shift == 0)
4836 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4837 REGNO (addr.offset) - R0_REGNUM);
4838 else
4839 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4840 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4841 return;
4842
4843 case ADDRESS_REG_WB:
4844 switch (GET_CODE (x))
4845 {
4846 case PRE_INC:
4847 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4848 GET_MODE_SIZE (mode));
4849 return;
4850 case POST_INC:
4851 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4852 GET_MODE_SIZE (mode));
4853 return;
4854 case PRE_DEC:
4855 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4856 GET_MODE_SIZE (mode));
4857 return;
4858 case POST_DEC:
4859 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4860 GET_MODE_SIZE (mode));
4861 return;
4862 case PRE_MODIFY:
4863 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4864 INTVAL (addr.offset));
4865 return;
4866 case POST_MODIFY:
4867 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4868 INTVAL (addr.offset));
4869 return;
4870 default:
4871 break;
4872 }
4873 break;
4874
4875 case ADDRESS_LO_SUM:
4876 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4877 output_addr_const (f, addr.offset);
4878 asm_fprintf (f, "]");
4879 return;
4880
4881 case ADDRESS_SYMBOLIC:
4882 break;
4883 }
4884
4885 output_addr_const (f, x);
4886 }
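
/* Examples of the address syntax produced above: "[x0]" and "[x0, 16]" for
   ADDRESS_REG_IMM, "[x0, x1, lsl 3]" for a scaled ADDRESS_REG_REG,
   "[x0, w1, sxtw 2]" for ADDRESS_REG_SXTW, "[x0, 8]!" and "[x0], 8" for the
   pre/post writeback forms, and "[x0, #:lo12:sym]" for ADDRESS_LO_SUM
   (register numbers and the symbol are illustrative).  */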
4887
4888 bool
4889 aarch64_label_mentioned_p (rtx x)
4890 {
4891 const char *fmt;
4892 int i;
4893
4894 if (GET_CODE (x) == LABEL_REF)
4895 return true;
4896
4897 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4898 referencing instruction, but they are constant offsets, not
4899 symbols. */
4900 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4901 return false;
4902
4903 fmt = GET_RTX_FORMAT (GET_CODE (x));
4904 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4905 {
4906 if (fmt[i] == 'E')
4907 {
4908 int j;
4909
4910 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4911 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4912 return 1;
4913 }
4914 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4915 return 1;
4916 }
4917
4918 return 0;
4919 }
4920
4921 /* Implement REGNO_REG_CLASS. */
4922
4923 enum reg_class
4924 aarch64_regno_regclass (unsigned regno)
4925 {
4926 if (GP_REGNUM_P (regno))
4927 return GENERAL_REGS;
4928
4929 if (regno == SP_REGNUM)
4930 return STACK_REG;
4931
4932 if (regno == FRAME_POINTER_REGNUM
4933 || regno == ARG_POINTER_REGNUM)
4934 return POINTER_REGS;
4935
4936 if (FP_REGNUM_P (regno))
4937 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4938
4939 return NO_REGS;
4940 }
4941
4942 static rtx
4943 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4944 {
4945 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4946 where mask is selected by alignment and size of the offset.
4947 We try to pick as large a range for the offset as possible to
4948 maximize the chance of a CSE. However, for aligned addresses
4949 we limit the range to 4k so that structures with different sized
4950 elements are likely to use the same base. We need to be careful
4951 not to split a CONST for some forms of address expression, otherwise
4952 it will generate sub-optimal code. */
4953
4954 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4955 {
4956 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4957 HOST_WIDE_INT base_offset;
4958
4959 if (GET_CODE (XEXP (x, 0)) == PLUS)
4960 {
4961 rtx op0 = XEXP (XEXP (x, 0), 0);
4962 rtx op1 = XEXP (XEXP (x, 0), 1);
4963
4964 /* Address expressions of the form Ra + Rb + CONST.
4965
4966 If CONST is within the range supported by the addressing
4967 mode "reg+offset", do not split CONST and use the
4968 sequence
4969 Rt = Ra + Rb;
4970 addr = Rt + CONST. */
4971 if (REG_P (op0) && REG_P (op1))
4972 {
4973 machine_mode addr_mode = GET_MODE (x);
4974 rtx base = gen_reg_rtx (addr_mode);
4975 rtx addr = plus_constant (addr_mode, base, offset);
4976
4977 if (aarch64_legitimate_address_hook_p (mode, addr, false))
4978 {
4979 emit_insn (gen_adddi3 (base, op0, op1));
4980 return addr;
4981 }
4982 }
4983 /* Address expressions of the form Ra + Rb<<SCALE + CONST.
4984
4985 If Reg + Rb<<SCALE is a valid address expression, do not
4986 split CONST and use the sequence
4987 Rc = CONST;
4988 Rt = Ra + Rc;
4989 addr = Rt + Rb<<SCALE.
4990
4991 TODO: We really should split CONST out of memory reference
4992 because:
4993 a) We depend on GIMPLE optimizers to pick up common sub
4994 expression involving the scaling operation.
4995 b) The index Rb is likely a loop iv, it's better to split
4996 the CONST so that computation of new base Rt is a loop
4997 invariant and can be moved out of loop. This is more
4998 important when the original base Ra is sfp related.
4999
5000 Unfortunately, GIMPLE optimizers (e.g., SLSR) cannot handle this
5001 kind of CSE opportunity at the time of this change, so for now we
5002 have to force the register scaling expression out of the memory ref. */
5003 else if (REG_P (op0) || REG_P (op1))
5004 {
5005 machine_mode addr_mode = GET_MODE (x);
5006 rtx base = gen_reg_rtx (addr_mode);
5007
5008 /* Switch to make sure that register is in op0. */
5009 if (REG_P (op1))
5010 std::swap (op0, op1);
5011
5012 rtx addr = plus_constant (addr_mode, base, offset);
5013
5014 if (aarch64_legitimate_address_hook_p (mode, addr, false))
5015 {
5016 base = force_operand (gen_rtx_PLUS (addr_mode, op1, op0),
5017 NULL_RTX);
5018 return plus_constant (addr_mode, base, offset);
5019 }
5020 }
5021 }
5022
5023 /* Does it look like we'll need a load/store-pair operation? */
5024 if (GET_MODE_SIZE (mode) > 16
5025 || mode == TImode)
5026 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5027 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5028 /* For offsets that aren't a multiple of the access size, the limit is
5029 -256...255. */
5030 else if (offset & (GET_MODE_SIZE (mode) - 1))
5031 base_offset = (offset + 0x100) & ~0x1ff;
5032 else
5033 base_offset = offset & ~0xfff;
5034
5035 if (base_offset == 0)
5036 return x;
5037
5038 offset -= base_offset;
5039 rtx base_reg = gen_reg_rtx (Pmode);
5040 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
5041 NULL_RTX);
5042 emit_move_insn (base_reg, val);
5043 x = plus_constant (Pmode, base_reg, offset);
5044 }
5045
5046 return x;
5047 }
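
/* As a worked example of the splitting above: an SImode access at
   (plus (reg) (const_int 0x12344)) has an offset that is a multiple of the
   access size, so base_offset is 0x12344 & ~0xfff == 0x12000; the anchor
   reg + 0x12000 is forced into a new base register and the memory reference
   keeps the residual offset 0x344, which fits the 12-bit scaled form.  */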
5048
5049 /* Try a machine-dependent way of reloading an illegitimate address
5050 operand. If we find one, push the reload and return the new rtx. */
5051
5052 rtx
5053 aarch64_legitimize_reload_address (rtx *x_p,
5054 machine_mode mode,
5055 int opnum, int type,
5056 int ind_levels ATTRIBUTE_UNUSED)
5057 {
5058 rtx x = *x_p;
5059
5060 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
5061 if (aarch64_vect_struct_mode_p (mode)
5062 && GET_CODE (x) == PLUS
5063 && REG_P (XEXP (x, 0))
5064 && CONST_INT_P (XEXP (x, 1)))
5065 {
5066 rtx orig_rtx = x;
5067 x = copy_rtx (x);
5068 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
5069 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5070 opnum, (enum reload_type) type);
5071 return x;
5072 }
5073
5074 /* We must recognize output that we have already generated ourselves. */
5075 if (GET_CODE (x) == PLUS
5076 && GET_CODE (XEXP (x, 0)) == PLUS
5077 && REG_P (XEXP (XEXP (x, 0), 0))
5078 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5079 && CONST_INT_P (XEXP (x, 1)))
5080 {
5081 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5082 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5083 opnum, (enum reload_type) type);
5084 return x;
5085 }
5086
5087 /* We wish to handle large displacements off a base register by splitting
5088 the addend across an add and the mem insn. This can cut the number of
5089 extra insns needed from 3 to 1. It is only useful for load/store of a
5090 single register with 12 bit offset field. */
5091 if (GET_CODE (x) == PLUS
5092 && REG_P (XEXP (x, 0))
5093 && CONST_INT_P (XEXP (x, 1))
5094 && HARD_REGISTER_P (XEXP (x, 0))
5095 && mode != TImode
5096 && mode != TFmode
5097 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
5098 {
5099 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
5100 HOST_WIDE_INT low = val & 0xfff;
5101 HOST_WIDE_INT high = val - low;
5102 HOST_WIDE_INT offs;
5103 rtx cst;
5104 machine_mode xmode = GET_MODE (x);
5105
5106 /* In ILP32, xmode can be either DImode or SImode. */
5107 gcc_assert (xmode == DImode || xmode == SImode);
5108
5109 /* Do not handle BLKmode offsets here, since we cannot ascertain
5110 BLKmode alignment. */
5111 if (GET_MODE_SIZE (mode) == 0)
5112 return NULL_RTX;
5113
5114 offs = low % GET_MODE_SIZE (mode);
5115
5116 /* Align misaligned offset by adjusting high part to compensate. */
5117 if (offs != 0)
5118 {
5119 if (aarch64_uimm12_shift (high + offs))
5120 {
5121 /* Align down. */
5122 low = low - offs;
5123 high = high + offs;
5124 }
5125 else
5126 {
5127 /* Align up. */
5128 offs = GET_MODE_SIZE (mode) - offs;
5129 low = low + offs;
5130 high = high + (low & 0x1000) - offs;
5131 low &= 0xfff;
5132 }
5133 }
5134
5135 /* Check for overflow. */
5136 if (high + low != val)
5137 return NULL_RTX;
5138
5139 cst = GEN_INT (high);
5140 if (!aarch64_uimm12_shift (high))
5141 cst = force_const_mem (xmode, cst);
5142
5143 /* Reload high part into base reg, leaving the low part
5144 in the mem instruction.
5145 Note that replacing this gen_rtx_PLUS with plus_constant is
5146 wrong in this case because we rely on the
5147 (plus (plus reg c1) c2) structure being preserved so that
5148 XEXP (*p, 0) in push_reload below uses the correct term. */
5149 x = gen_rtx_PLUS (xmode,
5150 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
5151 GEN_INT (low));
5152
5153 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5154 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
5155 opnum, (enum reload_type) type);
5156 return x;
5157 }
5158
5159 return NULL_RTX;
5160 }
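
/* For instance, an SImode access at (plus (reg) (const_int 0x12344)) is
   split above into low == 0x344 and high == 0x12000; high is a valid
   shifted 12-bit immediate, so the inner PLUS is reloaded into a base
   register while the memory reference keeps the offset 0x344.  */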
5161
5162
5163 /* Return the reload icode required for a constant pool in mode. */
5164 static enum insn_code
5165 aarch64_constant_pool_reload_icode (machine_mode mode)
5166 {
5167 switch (mode)
5168 {
5169 case SFmode:
5170 return CODE_FOR_aarch64_reload_movcpsfdi;
5171
5172 case DFmode:
5173 return CODE_FOR_aarch64_reload_movcpdfdi;
5174
5175 case TFmode:
5176 return CODE_FOR_aarch64_reload_movcptfdi;
5177
5178 case V8QImode:
5179 return CODE_FOR_aarch64_reload_movcpv8qidi;
5180
5181 case V16QImode:
5182 return CODE_FOR_aarch64_reload_movcpv16qidi;
5183
5184 case V4HImode:
5185 return CODE_FOR_aarch64_reload_movcpv4hidi;
5186
5187 case V8HImode:
5188 return CODE_FOR_aarch64_reload_movcpv8hidi;
5189
5190 case V2SImode:
5191 return CODE_FOR_aarch64_reload_movcpv2sidi;
5192
5193 case V4SImode:
5194 return CODE_FOR_aarch64_reload_movcpv4sidi;
5195
5196 case V2DImode:
5197 return CODE_FOR_aarch64_reload_movcpv2didi;
5198
5199 case V2DFmode:
5200 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5201
5202 default:
5203 gcc_unreachable ();
5204 }
5205
5206 gcc_unreachable ();
5207 }
5208 static reg_class_t
5209 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5210 reg_class_t rclass,
5211 machine_mode mode,
5212 secondary_reload_info *sri)
5213 {
5214
5215 /* If we have to disable direct literal pool loads and stores because the
5216 function is too big, then we need a scratch register. */
5217 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5218 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5219 || targetm.vector_mode_supported_p (GET_MODE (x)))
5220 && aarch64_nopcrelative_literal_loads)
5221 {
5222 sri->icode = aarch64_constant_pool_reload_icode (mode);
5223 return NO_REGS;
5224 }
5225
5226 /* Without the TARGET_SIMD instructions we cannot move a Q register
5227 to a Q register directly. We need a scratch. */
5228 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5229 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5230 && reg_class_subset_p (rclass, FP_REGS))
5231 {
5232 if (mode == TFmode)
5233 sri->icode = CODE_FOR_aarch64_reload_movtf;
5234 else if (mode == TImode)
5235 sri->icode = CODE_FOR_aarch64_reload_movti;
5236 return NO_REGS;
5237 }
5238
5239 /* A TFmode or TImode memory access should be handled via FP_REGS
5240 because AArch64 has richer addressing modes for LDR/STR instructions
5241 than LDP/STP instructions. */
5242 if (TARGET_FLOAT && rclass == GENERAL_REGS
5243 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5244 return FP_REGS;
5245
5246 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5247 return GENERAL_REGS;
5248
5249 return NO_REGS;
5250 }
5251
5252 static bool
5253 aarch64_can_eliminate (const int from, const int to)
5254 {
5255 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5256 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5257
5258 if (frame_pointer_needed)
5259 {
5260 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5261 return true;
5262 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5263 return false;
5264 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5265 && !cfun->calls_alloca)
5266 return true;
5267 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5268 return true;
5269
5270 return false;
5271 }
5272 else
5273 {
5274 /* If we decided that we didn't need a leaf frame pointer but then used
5275 LR in the function, then we'll want a frame pointer after all, so
5276 prevent this elimination to ensure a frame pointer is used. */
5277 if (to == STACK_POINTER_REGNUM
5278 && flag_omit_leaf_frame_pointer
5279 && df_regs_ever_live_p (LR_REGNUM))
5280 return false;
5281 }
5282
5283 return true;
5284 }
5285
5286 HOST_WIDE_INT
5287 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5288 {
5289 aarch64_layout_frame ();
5290
5291 if (to == HARD_FRAME_POINTER_REGNUM)
5292 {
5293 if (from == ARG_POINTER_REGNUM)
5294 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5295
5296 if (from == FRAME_POINTER_REGNUM)
5297 return (cfun->machine->frame.hard_fp_offset
5298 - cfun->machine->frame.saved_varargs_size);
5299 }
5300
5301 if (to == STACK_POINTER_REGNUM)
5302 {
5303 if (from == FRAME_POINTER_REGNUM)
5304 return (cfun->machine->frame.frame_size
5305 - cfun->machine->frame.saved_varargs_size);
5306 }
5307
5308 return cfun->machine->frame.frame_size;
5309 }
5310
5311 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5312 previous frame. */
5313
5314 rtx
5315 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5316 {
5317 if (count != 0)
5318 return const0_rtx;
5319 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5320 }
5321
5322
5323 static void
5324 aarch64_asm_trampoline_template (FILE *f)
5325 {
5326 if (TARGET_ILP32)
5327 {
5328 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5329 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5330 }
5331 else
5332 {
5333 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5334 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5335 }
5336 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5337 assemble_aligned_integer (4, const0_rtx);
5338 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5339 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5340 }
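
/* The LP64 template above therefore assembles to roughly the following,
   with IP1 and the static chain register being x17 and x18 in the default
   configuration:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding so the data slots start at offset 16
	.dword	0		// overwritten with the function address
	.dword	0		// overwritten with the chain value

   aarch64_trampoline_init below fills in the two trailing pointer slots.  */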
5341
5342 static void
5343 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5344 {
5345 rtx fnaddr, mem, a_tramp;
5346 const int tramp_code_sz = 16;
5347
5348 /* Don't need to copy the trailing D-words, we fill those in below. */
5349 emit_block_move (m_tramp, assemble_trampoline_template (),
5350 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5351 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5352 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5353 if (GET_MODE (fnaddr) != ptr_mode)
5354 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5355 emit_move_insn (mem, fnaddr);
5356
5357 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5358 emit_move_insn (mem, chain_value);
5359
5360 /* XXX We should really define a "clear_cache" pattern and use
5361 gen_clear_cache(). */
5362 a_tramp = XEXP (m_tramp, 0);
5363 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5364 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5365 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5366 ptr_mode);
5367 }
5368
5369 static unsigned char
5370 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5371 {
5372 switch (regclass)
5373 {
5374 case CALLER_SAVE_REGS:
5375 case POINTER_REGS:
5376 case GENERAL_REGS:
5377 case ALL_REGS:
5378 case FP_REGS:
5379 case FP_LO_REGS:
5380 return
5381 aarch64_vector_mode_p (mode)
5382 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5383 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5384 case STACK_REG:
5385 return 1;
5386
5387 case NO_REGS:
5388 return 0;
5389
5390 default:
5391 break;
5392 }
5393 gcc_unreachable ();
5394 }
5395
5396 static reg_class_t
5397 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5398 {
5399 if (regclass == POINTER_REGS)
5400 return GENERAL_REGS;
5401
5402 if (regclass == STACK_REG)
5403 {
5404 if (REG_P(x)
5405 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5406 return regclass;
5407
5408 return NO_REGS;
5409 }
5410
5411 /* If it's an integer immediate that MOVI can't handle, then
5412 FP_REGS is not an option, so we return NO_REGS instead. */
5413 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5414 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5415 return NO_REGS;
5416
5417 /* Register elimination can result in a request for
5418 SP+constant->FP_REGS. We cannot support such operations, which
5419 use SP as source and an FP_REG as destination, so reject them
5420 outright. */
5421 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5422 {
5423 rtx lhs = XEXP (x, 0);
5424
5425 /* Look through a possible SUBREG introduced by ILP32. */
5426 if (GET_CODE (lhs) == SUBREG)
5427 lhs = SUBREG_REG (lhs);
5428
5429 gcc_assert (REG_P (lhs));
5430 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5431 POINTER_REGS));
5432 return NO_REGS;
5433 }
5434
5435 return regclass;
5436 }
5437
5438 void
5439 aarch64_asm_output_labelref (FILE* f, const char *name)
5440 {
5441 asm_fprintf (f, "%U%s", name);
5442 }
5443
5444 static void
5445 aarch64_elf_asm_constructor (rtx symbol, int priority)
5446 {
5447 if (priority == DEFAULT_INIT_PRIORITY)
5448 default_ctor_section_asm_out_constructor (symbol, priority);
5449 else
5450 {
5451 section *s;
5452 char buf[18];
5453 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5454 s = get_section (buf, SECTION_WRITE, NULL);
5455 switch_to_section (s);
5456 assemble_align (POINTER_SIZE);
5457 assemble_aligned_integer (POINTER_BYTES, symbol);
5458 }
5459 }
5460
5461 static void
5462 aarch64_elf_asm_destructor (rtx symbol, int priority)
5463 {
5464 if (priority == DEFAULT_INIT_PRIORITY)
5465 default_dtor_section_asm_out_destructor (symbol, priority);
5466 else
5467 {
5468 section *s;
5469 char buf[18];
5470 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5471 s = get_section (buf, SECTION_WRITE, NULL);
5472 switch_to_section (s);
5473 assemble_align (POINTER_SIZE);
5474 assemble_aligned_integer (POINTER_BYTES, symbol);
5475 }
5476 }
5477
5478 const char*
5479 aarch64_output_casesi (rtx *operands)
5480 {
5481 char buf[100];
5482 char label[100];
5483 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5484 int index;
5485 static const char *const patterns[4][2] =
5486 {
5487 {
5488 "ldrb\t%w3, [%0,%w1,uxtw]",
5489 "add\t%3, %4, %w3, sxtb #2"
5490 },
5491 {
5492 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5493 "add\t%3, %4, %w3, sxth #2"
5494 },
5495 {
5496 "ldr\t%w3, [%0,%w1,uxtw #2]",
5497 "add\t%3, %4, %w3, sxtw #2"
5498 },
5499 /* We assume that DImode is only generated when not optimizing and
5500 that we don't really need 64-bit address offsets. That would
5501 imply an object file with 8GB of code in a single function! */
5502 {
5503 "ldr\t%w3, [%0,%w1,uxtw #2]",
5504 "add\t%3, %4, %w3, sxtw #2"
5505 }
5506 };
5507
5508 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5509
5510 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5511
5512 gcc_assert (index >= 0 && index <= 3);
5513
5514 /* Need to implement table size reduction, by changing the code below. */
5515 output_asm_insn (patterns[index][0], operands);
5516 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5517 snprintf (buf, sizeof (buf),
5518 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5519 output_asm_insn (buf, operands);
5520 output_asm_insn (patterns[index][1], operands);
5521 output_asm_insn ("br\t%3", operands);
5522 assemble_label (asm_out_file, label);
5523 return "";
5524 }
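
/* For a HImode dispatch table (index 1 above), and with operands 0, 1, 3
   and 4 in x0, w1, x3 and x4, the emitted sequence is roughly:

	ldrh	w3, [x0,w1,uxtw #1]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the label emitted below
	add	x3, x4, w3, sxth #2	// entries are scaled label differences
	br	x3
   .Lrtx<N>:  */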
5525
5526
5527 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5528 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5529 operator. */
5530
5531 int
5532 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5533 {
5534 if (shift >= 0 && shift <= 3)
5535 {
5536 int size;
5537 for (size = 8; size <= 32; size *= 2)
5538 {
5539 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5540 if (mask == bits << shift)
5541 return size;
5542 }
5543 }
5544 return 0;
5545 }
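
/* For example, aarch64_uxt_size (2, 0x3fc) returns 8 (0xff shifted left by
   2, i.e. a UXTB operand), aarch64_uxt_size (0, 0xffff) returns 16 (UXTH),
   and aarch64_uxt_size (1, 0xff) returns 0 because the mask does not line
   up with the shift.  */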
5546
5547 /* Constant pools are per-function only when PC-relative
5548 literal loads are enabled or we are in the large memory
5549 model. */
5550
5551 static inline bool
5552 aarch64_can_use_per_function_literal_pools_p (void)
5553 {
5554 return (!aarch64_nopcrelative_literal_loads
5555 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5556 }
5557
5558 static bool
5559 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5560 {
5561 /* FIXME: In an ideal world this would work similarly
5562 to the logic in aarch64_select_rtx_section, but that
5563 breaks the gccgo bootstrap. For now we work around
5564 this by returning false here. */
5565 return false;
5566 }
5567
5568 /* Select appropriate section for constants depending
5569 on where we place literal pools. */
5570
5571 static section *
5572 aarch64_select_rtx_section (machine_mode mode,
5573 rtx x,
5574 unsigned HOST_WIDE_INT align)
5575 {
5576 if (aarch64_can_use_per_function_literal_pools_p ())
5577 return function_section (current_function_decl);
5578
5579 return default_elf_select_rtx_section (mode, x, align);
5580 }
5581
5582 /* Costs. */
5583
5584 /* Helper function for rtx cost calculation. Strip a shift expression
5585 from X. Returns the inner operand if successful, or the original
5586 expression on failure. */
5587 static rtx
5588 aarch64_strip_shift (rtx x)
5589 {
5590 rtx op = x;
5591
5592 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5593 we can convert both to ROR during final output. */
5594 if ((GET_CODE (op) == ASHIFT
5595 || GET_CODE (op) == ASHIFTRT
5596 || GET_CODE (op) == LSHIFTRT
5597 || GET_CODE (op) == ROTATERT
5598 || GET_CODE (op) == ROTATE)
5599 && CONST_INT_P (XEXP (op, 1)))
5600 return XEXP (op, 0);
5601
5602 if (GET_CODE (op) == MULT
5603 && CONST_INT_P (XEXP (op, 1))
5604 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5605 return XEXP (op, 0);
5606
5607 return x;
5608 }
5609
5610 /* Helper function for rtx cost calculation. Strip an extend
5611 expression from X. Returns the inner operand if successful, or the
5612 original expression on failure. We deal with a number of possible
5613 canonicalization variations here. */
5614 static rtx
5615 aarch64_strip_extend (rtx x)
5616 {
5617 rtx op = x;
5618
5619 /* Zero and sign extraction of a widened value. */
5620 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5621 && XEXP (op, 2) == const0_rtx
5622 && GET_CODE (XEXP (op, 0)) == MULT
5623 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5624 XEXP (op, 1)))
5625 return XEXP (XEXP (op, 0), 0);
5626
5627 /* It can also be represented (for zero-extend) as an AND with an
5628 immediate. */
5629 if (GET_CODE (op) == AND
5630 && GET_CODE (XEXP (op, 0)) == MULT
5631 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5632 && CONST_INT_P (XEXP (op, 1))
5633 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5634 INTVAL (XEXP (op, 1))) != 0)
5635 return XEXP (XEXP (op, 0), 0);
5636
5637 /* Now handle extended register, as this may also have an optional
5638 left shift by 1..4. */
5639 if (GET_CODE (op) == ASHIFT
5640 && CONST_INT_P (XEXP (op, 1))
5641 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5642 op = XEXP (op, 0);
5643
5644 if (GET_CODE (op) == ZERO_EXTEND
5645 || GET_CODE (op) == SIGN_EXTEND)
5646 op = XEXP (op, 0);
5647
5648 if (op != x)
5649 return op;
5650
5651 return x;
5652 }
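
/* For example, both (and:DI (mult:DI (reg:DI) (const_int 4))
   (const_int 0x3fffffffc)) and (ashift:DI (zero_extend:DI (reg:SI))
   (const_int 2)) strip down to the inner register here.  */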
5653
5654 /* Return true iff CODE is a shift supported in combination
5655 with arithmetic instructions. */
5656
5657 static bool
5658 aarch64_shift_p (enum rtx_code code)
5659 {
5660 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5661 }
5662
5663 /* Helper function for rtx cost calculation. Calculate the cost of
5664 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5665 Return the calculated cost of the expression, recursing manually in to
5666 operands where needed. */
5667
5668 static int
5669 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5670 {
5671 rtx op0, op1;
5672 const struct cpu_cost_table *extra_cost
5673 = aarch64_tune_params.insn_extra_cost;
5674 int cost = 0;
5675 bool compound_p = (outer == PLUS || outer == MINUS);
5676 machine_mode mode = GET_MODE (x);
5677
5678 gcc_checking_assert (code == MULT);
5679
5680 op0 = XEXP (x, 0);
5681 op1 = XEXP (x, 1);
5682
5683 if (VECTOR_MODE_P (mode))
5684 mode = GET_MODE_INNER (mode);
5685
5686 /* Integer multiply/fma. */
5687 if (GET_MODE_CLASS (mode) == MODE_INT)
5688 {
5689 /* The multiply will be canonicalized as a shift, cost it as such. */
5690 if (aarch64_shift_p (GET_CODE (x))
5691 || (CONST_INT_P (op1)
5692 && exact_log2 (INTVAL (op1)) > 0))
5693 {
5694 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5695 || GET_CODE (op0) == SIGN_EXTEND;
5696 if (speed)
5697 {
5698 if (compound_p)
5699 {
5700 if (REG_P (op1))
5701 /* ARITH + shift-by-register. */
5702 cost += extra_cost->alu.arith_shift_reg;
5703 else if (is_extend)
5704 /* ARITH + extended register. We don't have a cost field
5705 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5706 cost += extra_cost->alu.extend_arith;
5707 else
5708 /* ARITH + shift-by-immediate. */
5709 cost += extra_cost->alu.arith_shift;
5710 }
5711 else
5712 /* LSL (immediate). */
5713 cost += extra_cost->alu.shift;
5714
5715 }
5716 /* Strip extends as we will have costed them in the case above. */
5717 if (is_extend)
5718 op0 = aarch64_strip_extend (op0);
5719
5720 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5721
5722 return cost;
5723 }
5724
5725 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5726 compound and let the below cases handle it. After all, MNEG is a
5727 special-case alias of MSUB. */
5728 if (GET_CODE (op0) == NEG)
5729 {
5730 op0 = XEXP (op0, 0);
5731 compound_p = true;
5732 }
5733
5734 /* Integer multiplies or FMAs have zero/sign extending variants. */
5735 if ((GET_CODE (op0) == ZERO_EXTEND
5736 && GET_CODE (op1) == ZERO_EXTEND)
5737 || (GET_CODE (op0) == SIGN_EXTEND
5738 && GET_CODE (op1) == SIGN_EXTEND))
5739 {
5740 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5741 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5742
5743 if (speed)
5744 {
5745 if (compound_p)
5746 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5747 cost += extra_cost->mult[0].extend_add;
5748 else
5749 /* MUL/SMULL/UMULL. */
5750 cost += extra_cost->mult[0].extend;
5751 }
5752
5753 return cost;
5754 }
5755
5756 /* This is either an integer multiply or a MADD. In both cases
5757 we want to recurse and cost the operands. */
5758 cost += rtx_cost (op0, mode, MULT, 0, speed);
5759 cost += rtx_cost (op1, mode, MULT, 1, speed);
5760
5761 if (speed)
5762 {
5763 if (compound_p)
5764 /* MADD/MSUB. */
5765 cost += extra_cost->mult[mode == DImode].add;
5766 else
5767 /* MUL. */
5768 cost += extra_cost->mult[mode == DImode].simple;
5769 }
5770
5771 return cost;
5772 }
5773 else
5774 {
5775 if (speed)
5776 {
5777 /* Floating-point FMA/FMUL can also support negations of the
5778 operands, unless the rounding mode is upward or downward, in
5779 which case FNMUL differs from FMUL with operand negation. */
5780 bool neg0 = GET_CODE (op0) == NEG;
5781 bool neg1 = GET_CODE (op1) == NEG;
5782 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5783 {
5784 if (neg0)
5785 op0 = XEXP (op0, 0);
5786 if (neg1)
5787 op1 = XEXP (op1, 0);
5788 }
5789
5790 if (compound_p)
5791 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5792 cost += extra_cost->fp[mode == DFmode].fma;
5793 else
5794 /* FMUL/FNMUL. */
5795 cost += extra_cost->fp[mode == DFmode].mult;
5796 }
5797
5798 cost += rtx_cost (op0, mode, MULT, 0, speed);
5799 cost += rtx_cost (op1, mode, MULT, 1, speed);
5800 return cost;
5801 }
5802 }
5803
5804 static int
5805 aarch64_address_cost (rtx x,
5806 machine_mode mode,
5807 addr_space_t as ATTRIBUTE_UNUSED,
5808 bool speed)
5809 {
5810 enum rtx_code c = GET_CODE (x);
5811 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5812 struct aarch64_address_info info;
5813 int cost = 0;
5814 info.shift = 0;
5815
5816 if (!aarch64_classify_address (&info, x, mode, c, false))
5817 {
5818 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5819 {
5820 /* This is a CONST or SYMBOL ref which will be split
5821 in a different way depending on the code model in use.
5822 Cost it through the generic infrastructure. */
5823 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5824 /* Divide through by the cost of one instruction to
5825 bring it to the same units as the address costs. */
5826 cost_symbol_ref /= COSTS_N_INSNS (1);
5827 /* The cost is then the cost of preparing the address,
5828 followed by an immediate (possibly 0) offset. */
5829 return cost_symbol_ref + addr_cost->imm_offset;
5830 }
5831 else
5832 {
5833 /* This is most likely a jump table from a case
5834 statement. */
5835 return addr_cost->register_offset;
5836 }
5837 }
5838
5839 switch (info.type)
5840 {
5841 case ADDRESS_LO_SUM:
5842 case ADDRESS_SYMBOLIC:
5843 case ADDRESS_REG_IMM:
5844 cost += addr_cost->imm_offset;
5845 break;
5846
5847 case ADDRESS_REG_WB:
5848 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5849 cost += addr_cost->pre_modify;
5850 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5851 cost += addr_cost->post_modify;
5852 else
5853 gcc_unreachable ();
5854
5855 break;
5856
5857 case ADDRESS_REG_REG:
5858 cost += addr_cost->register_offset;
5859 break;
5860
5861 case ADDRESS_REG_SXTW:
5862 cost += addr_cost->register_sextend;
5863 break;
5864
5865 case ADDRESS_REG_UXTW:
5866 cost += addr_cost->register_zextend;
5867 break;
5868
5869 default:
5870 gcc_unreachable ();
5871 }
5872
5873
5874 if (info.shift > 0)
5875 {
5876 /* For the sake of calculating the cost of the shifted register
5877 component, we can treat same sized modes in the same way. */
5878 switch (GET_MODE_BITSIZE (mode))
5879 {
5880 case 16:
5881 cost += addr_cost->addr_scale_costs.hi;
5882 break;
5883
5884 case 32:
5885 cost += addr_cost->addr_scale_costs.si;
5886 break;
5887
5888 case 64:
5889 cost += addr_cost->addr_scale_costs.di;
5890 break;
5891
5892 /* We can't tell, or this is a 128-bit vector. */
5893 default:
5894 cost += addr_cost->addr_scale_costs.ti;
5895 break;
5896 }
5897 }
5898
5899 return cost;
5900 }
5901
5902 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5903 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
5904 to be well predicted. */
5905
5906 int
5907 aarch64_branch_cost (bool speed_p, bool predictable_p)
5908 {
5909 /* When optimizing for speed, use the cost of unpredictable branches. */
5910 const struct cpu_branch_cost *branch_costs =
5911 aarch64_tune_params.branch_costs;
5912
5913 if (!speed_p || predictable_p)
5914 return branch_costs->predictable;
5915 else
5916 return branch_costs->unpredictable;
5917 }
5918
5919 /* Return true if the RTX X in mode MODE is a zero or sign extract
5920 usable in an ADD or SUB (extended register) instruction. */
5921 static bool
5922 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5923 {
5924 /* Catch add with a sign extract.
5925 This is add_<optab><mode>_multp2. */
5926 if (GET_CODE (x) == SIGN_EXTRACT
5927 || GET_CODE (x) == ZERO_EXTRACT)
5928 {
5929 rtx op0 = XEXP (x, 0);
5930 rtx op1 = XEXP (x, 1);
5931 rtx op2 = XEXP (x, 2);
5932
5933 if (GET_CODE (op0) == MULT
5934 && CONST_INT_P (op1)
5935 && op2 == const0_rtx
5936 && CONST_INT_P (XEXP (op0, 1))
5937 && aarch64_is_extend_from_extract (mode,
5938 XEXP (op0, 1),
5939 op1))
5940 {
5941 return true;
5942 }
5943 }
5944 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5945 No shift. */
5946 else if (GET_CODE (x) == SIGN_EXTEND
5947 || GET_CODE (x) == ZERO_EXTEND)
5948 return REG_P (XEXP (x, 0));
5949
5950 return false;
5951 }
5952
5953 static bool
5954 aarch64_frint_unspec_p (unsigned int u)
5955 {
5956 switch (u)
5957 {
5958 case UNSPEC_FRINTZ:
5959 case UNSPEC_FRINTP:
5960 case UNSPEC_FRINTM:
5961 case UNSPEC_FRINTA:
5962 case UNSPEC_FRINTN:
5963 case UNSPEC_FRINTX:
5964 case UNSPEC_FRINTI:
5965 return true;
5966
5967 default:
5968 return false;
5969 }
5970 }
5971
5972 /* Return true iff X is an rtx that will match an extr instruction
5973 i.e. as described in the *extr<mode>5_insn family of patterns.
5974 OP0 and OP1 will be set to the operands of the shifts involved
5975 on success and will be NULL_RTX otherwise. */
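/* For instance (illustrative only), in DImode the RTX
   (ior:DI (ashift:DI (reg:DI a) (const_int 48))
           (lshiftrt:DI (reg:DI b) (const_int 16)))
   has shift amounts summing to 64 bits and therefore corresponds to an
   EXTR instruction extracting from the concatenation of the two
   registers, so both shifted operands are returned.  */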
5976
5977 static bool
5978 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5979 {
5980 rtx op0, op1;
5981 machine_mode mode = GET_MODE (x);
5982
5983 *res_op0 = NULL_RTX;
5984 *res_op1 = NULL_RTX;
5985
5986 if (GET_CODE (x) != IOR)
5987 return false;
5988
5989 op0 = XEXP (x, 0);
5990 op1 = XEXP (x, 1);
5991
5992 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5993 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5994 {
5995 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5996 if (GET_CODE (op1) == ASHIFT)
5997 std::swap (op0, op1);
5998
5999 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6000 return false;
6001
6002 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6003 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6004
6005 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6006 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6007 {
6008 *res_op0 = XEXP (op0, 0);
6009 *res_op1 = XEXP (op1, 0);
6010 return true;
6011 }
6012 }
6013
6014 return false;
6015 }
6016
6017 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6018 storing it in *COST. Result is true if the total cost of the operation
6019 has now been calculated. */
6020 static bool
6021 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6022 {
6023 rtx inner;
6024 rtx comparator;
6025 enum rtx_code cmpcode;
6026
6027 if (COMPARISON_P (op0))
6028 {
6029 inner = XEXP (op0, 0);
6030 comparator = XEXP (op0, 1);
6031 cmpcode = GET_CODE (op0);
6032 }
6033 else
6034 {
6035 inner = op0;
6036 comparator = const0_rtx;
6037 cmpcode = NE;
6038 }
6039
6040 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6041 {
6042 /* Conditional branch. */
6043 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6044 return true;
6045 else
6046 {
6047 if (cmpcode == NE || cmpcode == EQ)
6048 {
6049 if (comparator == const0_rtx)
6050 {
6051 /* TBZ/TBNZ/CBZ/CBNZ. */
6052 if (GET_CODE (inner) == ZERO_EXTRACT)
6053 /* TBZ/TBNZ. */
6054 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6055 ZERO_EXTRACT, 0, speed);
6056 else
6057 /* CBZ/CBNZ. */
6058 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6059
6060 return true;
6061 }
6062 }
6063 else if (cmpcode == LT || cmpcode == GE)
6064 {
6065 /* TBZ/TBNZ. */
6066 if (comparator == const0_rtx)
6067 return true;
6068 }
6069 }
6070 }
6071 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6072 {
6073 /* CCMP. */
6074 if (GET_CODE (op1) == COMPARE)
6075 {
6076 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6077 if (XEXP (op1, 1) == const0_rtx)
6078 *cost += 1;
6079 if (speed)
6080 {
6081 machine_mode mode = GET_MODE (XEXP (op1, 0));
6082 const struct cpu_cost_table *extra_cost
6083 = aarch64_tune_params.insn_extra_cost;
6084
6085 if (GET_MODE_CLASS (mode) == MODE_INT)
6086 *cost += extra_cost->alu.arith;
6087 else
6088 *cost += extra_cost->fp[mode == DFmode].compare;
6089 }
6090 return true;
6091 }
6092
6093 /* It's a conditional operation based on the status flags,
6094 so it must be some flavor of CSEL. */
6095
6096 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6097 if (GET_CODE (op1) == NEG
6098 || GET_CODE (op1) == NOT
6099 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6100 op1 = XEXP (op1, 0);
6101 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6102 {
6103 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6104 op1 = XEXP (op1, 0);
6105 op2 = XEXP (op2, 0);
6106 }
6107
6108 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6109 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6110 return true;
6111 }
6112
6113 /* We don't know what this is, cost all operands. */
6114 return false;
6115 }
6116
6117 /* Check whether X is a bitfield operation of the form shift + extend that
6118 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6119 operand to which the bitfield operation is applied. Otherwise return
6120 NULL_RTX. */
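/* For example (illustrative only),
   (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3)))
   can be done with a single UBFX of the underlying register, and
   (zero_extend:SI (ashift:HI (reg:HI r) (const_int 3)))
   with a single UBFIZ, so the callers need only cost the shifted
   operand plus one BFX-class instruction.  */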
6121
6122 static rtx
6123 aarch64_extend_bitfield_pattern_p (rtx x)
6124 {
6125 rtx_code outer_code = GET_CODE (x);
6126 machine_mode outer_mode = GET_MODE (x);
6127
6128 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6129 && outer_mode != SImode && outer_mode != DImode)
6130 return NULL_RTX;
6131
6132 rtx inner = XEXP (x, 0);
6133 rtx_code inner_code = GET_CODE (inner);
6134 machine_mode inner_mode = GET_MODE (inner);
6135 rtx op = NULL_RTX;
6136
6137 switch (inner_code)
6138 {
6139 case ASHIFT:
6140 if (CONST_INT_P (XEXP (inner, 1))
6141 && (inner_mode == QImode || inner_mode == HImode))
6142 op = XEXP (inner, 0);
6143 break;
6144 case LSHIFTRT:
6145 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6146 && (inner_mode == QImode || inner_mode == HImode))
6147 op = XEXP (inner, 0);
6148 break;
6149 case ASHIFTRT:
6150 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6151 && (inner_mode == QImode || inner_mode == HImode))
6152 op = XEXP (inner, 0);
6153 break;
6154 default:
6155 break;
6156 }
6157
6158 return op;
6159 }
6160
6161 /* Calculate the cost of calculating X, storing it in *COST. Result
6162 is true if the total cost of the operation has now been calculated. */
6163 static bool
6164 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6165 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6166 {
6167 rtx op0, op1, op2;
6168 const struct cpu_cost_table *extra_cost
6169 = aarch64_tune_params.insn_extra_cost;
6170 int code = GET_CODE (x);
6171
6172 /* By default, assume that everything has equivalent cost to the
6173 cheapest instruction. Any additional costs are applied as a delta
6174 above this default. */
6175 *cost = COSTS_N_INSNS (1);
6176
6177 switch (code)
6178 {
6179 case SET:
6180 /* The cost depends entirely on the operands to SET. */
6181 *cost = 0;
6182 op0 = SET_DEST (x);
6183 op1 = SET_SRC (x);
6184
6185 switch (GET_CODE (op0))
6186 {
6187 case MEM:
6188 if (speed)
6189 {
6190 rtx address = XEXP (op0, 0);
6191 if (VECTOR_MODE_P (mode))
6192 *cost += extra_cost->ldst.storev;
6193 else if (GET_MODE_CLASS (mode) == MODE_INT)
6194 *cost += extra_cost->ldst.store;
6195 else if (mode == SFmode)
6196 *cost += extra_cost->ldst.storef;
6197 else if (mode == DFmode)
6198 *cost += extra_cost->ldst.stored;
6199
6200 *cost +=
6201 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6202 0, speed));
6203 }
6204
6205 *cost += rtx_cost (op1, mode, SET, 1, speed);
6206 return true;
6207
6208 case SUBREG:
6209 if (! REG_P (SUBREG_REG (op0)))
6210 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6211
6212 /* Fall through. */
6213 case REG:
6214 /* The cost is one per vector-register copied. */
6215 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6216 {
6217 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6218 / GET_MODE_SIZE (V4SImode);
6219 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6220 }
6221 /* const0_rtx is in general free, but we will use an
6222 instruction to set a register to 0. */
6223 else if (REG_P (op1) || op1 == const0_rtx)
6224 {
6225 /* The cost is 1 per register copied. */
6226 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6227 / UNITS_PER_WORD;
6228 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6229 }
6230 else
6231 /* Cost is just the cost of the RHS of the set. */
6232 *cost += rtx_cost (op1, mode, SET, 1, speed);
6233 return true;
6234
6235 case ZERO_EXTRACT:
6236 case SIGN_EXTRACT:
6237 /* Bit-field insertion. Strip any redundant widening of
6238 the RHS to meet the width of the target. */
6239 if (GET_CODE (op1) == SUBREG)
6240 op1 = SUBREG_REG (op1);
6241 if ((GET_CODE (op1) == ZERO_EXTEND
6242 || GET_CODE (op1) == SIGN_EXTEND)
6243 && CONST_INT_P (XEXP (op0, 1))
6244 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6245 >= INTVAL (XEXP (op0, 1))))
6246 op1 = XEXP (op1, 0);
6247
6248 if (CONST_INT_P (op1))
6249 {
6250 /* MOV immediate is assumed to always be cheap. */
6251 *cost = COSTS_N_INSNS (1);
6252 }
6253 else
6254 {
6255 /* BFM. */
6256 if (speed)
6257 *cost += extra_cost->alu.bfi;
6258 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6259 }
6260
6261 return true;
6262
6263 default:
6264 /* We can't make sense of this, assume default cost. */
6265 *cost = COSTS_N_INSNS (1);
6266 return false;
6267 }
6268 return false;
6269
6270 case CONST_INT:
6271 /* If an instruction can incorporate a constant within the
6272 instruction, the instruction's expression avoids calling
6273 rtx_cost() on the constant. If rtx_cost() is called on a
6274 constant, then it is usually because the constant must be
6275 moved into a register by one or more instructions.
6276
6277 The exception is constant 0, which can be expressed
6278 as XZR/WZR and is therefore free. The one case where that does not
6279 hold is (set (reg) (const0_rtx)), where we must cost the
6280 move. However, we catch that when we cost the SET, so
6281 we don't need to consider it here. */
6282 if (x == const0_rtx)
6283 *cost = 0;
6284 else
6285 {
6286 /* To an approximation, building any other constant is
6287 proportionally expensive to the number of instructions
6288 required to build that constant. This is true whether we
6289 are compiling for SPEED or otherwise. */
6290 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6291 (NULL_RTX, x, false, mode));
6292 }
6293 return true;
6294
6295 case CONST_DOUBLE:
6296 if (speed)
6297 {
6298 /* mov[df,sf]_aarch64. */
6299 if (aarch64_float_const_representable_p (x))
6300 /* FMOV (scalar immediate). */
6301 *cost += extra_cost->fp[mode == DFmode].fpconst;
6302 else if (!aarch64_float_const_zero_rtx_p (x))
6303 {
6304 /* This will be a load from memory. */
6305 if (mode == DFmode)
6306 *cost += extra_cost->ldst.loadd;
6307 else
6308 *cost += extra_cost->ldst.loadf;
6309 }
6310 else
6311 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6312 or MOV v0.s[0], wzr - neither of which are modeled by the
6313 cost tables. Just use the default cost. */
6314 {
6315 }
6316 }
6317
6318 return true;
6319
6320 case MEM:
6321 if (speed)
6322 {
6323 /* For loads we want the base cost of a load, plus an
6324 approximation for the additional cost of the addressing
6325 mode. */
6326 rtx address = XEXP (x, 0);
6327 if (VECTOR_MODE_P (mode))
6328 *cost += extra_cost->ldst.loadv;
6329 else if (GET_MODE_CLASS (mode) == MODE_INT)
6330 *cost += extra_cost->ldst.load;
6331 else if (mode == SFmode)
6332 *cost += extra_cost->ldst.loadf;
6333 else if (mode == DFmode)
6334 *cost += extra_cost->ldst.loadd;
6335
6336 *cost +=
6337 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6338 0, speed));
6339 }
6340
6341 return true;
6342
6343 case NEG:
6344 op0 = XEXP (x, 0);
6345
6346 if (VECTOR_MODE_P (mode))
6347 {
6348 if (speed)
6349 {
6350 /* FNEG. */
6351 *cost += extra_cost->vect.alu;
6352 }
6353 return false;
6354 }
6355
6356 if (GET_MODE_CLASS (mode) == MODE_INT)
6357 {
6358 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6359 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6360 {
6361 /* CSETM. */
6362 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6363 return true;
6364 }
6365
6366 /* Cost this as SUB wzr, X. */
6367 op0 = CONST0_RTX (mode);
6368 op1 = XEXP (x, 0);
6369 goto cost_minus;
6370 }
6371
6372 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6373 {
6374 /* Support (neg(fma...)) as a single instruction only if
6375 sign of zeros is unimportant. This matches the decision
6376 making in aarch64.md. */
6377 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6378 {
6379 /* FNMADD. */
6380 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6381 return true;
6382 }
6383 if (GET_CODE (op0) == MULT)
6384 {
6385 /* FNMUL. */
6386 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6387 return true;
6388 }
6389 if (speed)
6390 /* FNEG. */
6391 *cost += extra_cost->fp[mode == DFmode].neg;
6392 return false;
6393 }
6394
6395 return false;
6396
6397 case CLRSB:
6398 case CLZ:
6399 if (speed)
6400 {
6401 if (VECTOR_MODE_P (mode))
6402 *cost += extra_cost->vect.alu;
6403 else
6404 *cost += extra_cost->alu.clz;
6405 }
6406
6407 return false;
6408
6409 case COMPARE:
6410 op0 = XEXP (x, 0);
6411 op1 = XEXP (x, 1);
6412
6413 if (op1 == const0_rtx
6414 && GET_CODE (op0) == AND)
6415 {
6416 x = op0;
6417 mode = GET_MODE (op0);
6418 goto cost_logic;
6419 }
6420
6421 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6422 {
6423 /* TODO: A write to the CC flags possibly costs extra, this
6424 needs encoding in the cost tables. */
6425
6426 /* CC_ZESWPmode supports zero extend for free. */
6427 if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6428 op0 = XEXP (op0, 0);
6429
6430 mode = GET_MODE (op0);
6431 /* ANDS. */
6432 if (GET_CODE (op0) == AND)
6433 {
6434 x = op0;
6435 goto cost_logic;
6436 }
6437
6438 if (GET_CODE (op0) == PLUS)
6439 {
6440 /* ADDS (and CMN alias). */
6441 x = op0;
6442 goto cost_plus;
6443 }
6444
6445 if (GET_CODE (op0) == MINUS)
6446 {
6447 /* SUBS. */
6448 x = op0;
6449 goto cost_minus;
6450 }
6451
6452 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6453 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6454 && CONST_INT_P (XEXP (op0, 2)))
6455 {
6456 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6457 Handle it here directly rather than going to cost_logic
6458 since we know the immediate generated for the TST is valid
6459 so we can avoid creating an intermediate rtx for it only
6460 for costing purposes. */
6461 if (speed)
6462 *cost += extra_cost->alu.logical;
6463
6464 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6465 ZERO_EXTRACT, 0, speed);
6466 return true;
6467 }
6468
6469 if (GET_CODE (op1) == NEG)
6470 {
6471 /* CMN. */
6472 if (speed)
6473 *cost += extra_cost->alu.arith;
6474
6475 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6476 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6477 return true;
6478 }
6479
6480 /* CMP.
6481
6482 Compare can freely swap the order of operands, and
6483 canonicalization puts the more complex operation first.
6484 But the integer MINUS logic expects the shift/extend
6485 operation in op1. */
6486 if (! (REG_P (op0)
6487 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6488 {
6489 op0 = XEXP (x, 1);
6490 op1 = XEXP (x, 0);
6491 }
6492 goto cost_minus;
6493 }
6494
6495 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6496 {
6497 /* FCMP. */
6498 if (speed)
6499 *cost += extra_cost->fp[mode == DFmode].compare;
6500
6501 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6502 {
6503 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6504 /* FCMP supports constant 0.0 for no extra cost. */
6505 return true;
6506 }
6507 return false;
6508 }
6509
6510 if (VECTOR_MODE_P (mode))
6511 {
6512 /* Vector compare. */
6513 if (speed)
6514 *cost += extra_cost->vect.alu;
6515
6516 if (aarch64_float_const_zero_rtx_p (op1))
6517 {
6518 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6519 cost. */
6520 return true;
6521 }
6522 return false;
6523 }
6524 return false;
6525
6526 case MINUS:
6527 {
6528 op0 = XEXP (x, 0);
6529 op1 = XEXP (x, 1);
6530
6531 cost_minus:
6532 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6533
6534 /* Detect valid immediates. */
6535 if ((GET_MODE_CLASS (mode) == MODE_INT
6536 || (GET_MODE_CLASS (mode) == MODE_CC
6537 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6538 && CONST_INT_P (op1)
6539 && aarch64_uimm12_shift (INTVAL (op1)))
6540 {
6541 if (speed)
6542 /* SUB(S) (immediate). */
6543 *cost += extra_cost->alu.arith;
6544 return true;
6545 }
6546
6547 /* Look for SUB (extended register). */
6548 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6549 {
6550 if (speed)
6551 *cost += extra_cost->alu.extend_arith;
6552
6553 op1 = aarch64_strip_extend (op1);
6554 *cost += rtx_cost (op1, VOIDmode,
6555 (enum rtx_code) GET_CODE (op1), 0, speed);
6556 return true;
6557 }
6558
6559 rtx new_op1 = aarch64_strip_extend (op1);
6560
6561 /* Cost this as an FMA-alike operation. */
6562 if ((GET_CODE (new_op1) == MULT
6563 || aarch64_shift_p (GET_CODE (new_op1)))
6564 && code != COMPARE)
6565 {
6566 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6567 (enum rtx_code) code,
6568 speed);
6569 return true;
6570 }
6571
6572 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6573
6574 if (speed)
6575 {
6576 if (VECTOR_MODE_P (mode))
6577 {
6578 /* Vector SUB. */
6579 *cost += extra_cost->vect.alu;
6580 }
6581 else if (GET_MODE_CLASS (mode) == MODE_INT)
6582 {
6583 /* SUB(S). */
6584 *cost += extra_cost->alu.arith;
6585 }
6586 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6587 {
6588 /* FSUB. */
6589 *cost += extra_cost->fp[mode == DFmode].addsub;
6590 }
6591 }
6592 return true;
6593 }
6594
6595 case PLUS:
6596 {
6597 rtx new_op0;
6598
6599 op0 = XEXP (x, 0);
6600 op1 = XEXP (x, 1);
6601
6602 cost_plus:
6603 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6604 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6605 {
6606 /* CSINC. */
6607 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6608 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6609 return true;
6610 }
6611
6612 if (GET_MODE_CLASS (mode) == MODE_INT
6613 && CONST_INT_P (op1)
6614 && aarch64_uimm12_shift (INTVAL (op1)))
6615 {
6616 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6617
6618 if (speed)
6619 /* ADD (immediate). */
6620 *cost += extra_cost->alu.arith;
6621 return true;
6622 }
6623
6624 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6625
6626 /* Look for ADD (extended register). */
6627 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6628 {
6629 if (speed)
6630 *cost += extra_cost->alu.extend_arith;
6631
6632 op0 = aarch64_strip_extend (op0);
6633 *cost += rtx_cost (op0, VOIDmode,
6634 (enum rtx_code) GET_CODE (op0), 0, speed);
6635 return true;
6636 }
6637
6638 /* Strip any extend, leave shifts behind as we will
6639 cost them through mult_cost. */
6640 new_op0 = aarch64_strip_extend (op0);
6641
6642 if (GET_CODE (new_op0) == MULT
6643 || aarch64_shift_p (GET_CODE (new_op0)))
6644 {
6645 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6646 speed);
6647 return true;
6648 }
6649
6650 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6651
6652 if (speed)
6653 {
6654 if (VECTOR_MODE_P (mode))
6655 {
6656 /* Vector ADD. */
6657 *cost += extra_cost->vect.alu;
6658 }
6659 else if (GET_MODE_CLASS (mode) == MODE_INT)
6660 {
6661 /* ADD. */
6662 *cost += extra_cost->alu.arith;
6663 }
6664 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6665 {
6666 /* FADD. */
6667 *cost += extra_cost->fp[mode == DFmode].addsub;
6668 }
6669 }
6670 return true;
6671 }
6672
6673 case BSWAP:
6674 *cost = COSTS_N_INSNS (1);
6675
6676 if (speed)
6677 {
6678 if (VECTOR_MODE_P (mode))
6679 *cost += extra_cost->vect.alu;
6680 else
6681 *cost += extra_cost->alu.rev;
6682 }
6683 return false;
6684
6685 case IOR:
6686 if (aarch_rev16_p (x))
6687 {
6688 *cost = COSTS_N_INSNS (1);
6689
6690 if (speed)
6691 {
6692 if (VECTOR_MODE_P (mode))
6693 *cost += extra_cost->vect.alu;
6694 else
6695 *cost += extra_cost->alu.rev;
6696 }
6697 return true;
6698 }
6699
6700 if (aarch64_extr_rtx_p (x, &op0, &op1))
6701 {
6702 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6703 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6704 if (speed)
6705 *cost += extra_cost->alu.shift;
6706
6707 return true;
6708 }
6709 /* Fall through. */
6710 case XOR:
6711 case AND:
6712 cost_logic:
6713 op0 = XEXP (x, 0);
6714 op1 = XEXP (x, 1);
6715
6716 if (VECTOR_MODE_P (mode))
6717 {
6718 if (speed)
6719 *cost += extra_cost->vect.alu;
6720 return true;
6721 }
6722
6723 if (code == AND
6724 && GET_CODE (op0) == MULT
6725 && CONST_INT_P (XEXP (op0, 1))
6726 && CONST_INT_P (op1)
6727 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6728 INTVAL (op1)) != 0)
6729 {
6730 /* This is a UBFM/SBFM. */
6731 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6732 if (speed)
6733 *cost += extra_cost->alu.bfx;
6734 return true;
6735 }
6736
6737 if (GET_MODE_CLASS (mode) == MODE_INT)
6738 {
6739 /* We possibly get the immediate for free, this is not
6740 modelled. */
6741 if (CONST_INT_P (op1)
6742 && aarch64_bitmask_imm (INTVAL (op1), mode))
6743 {
6744 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6745
6746 if (speed)
6747 *cost += extra_cost->alu.logical;
6748
6749 return true;
6750 }
6751 else
6752 {
6753 rtx new_op0 = op0;
6754
6755 /* Handle ORN, EON, or BIC. */
6756 if (GET_CODE (op0) == NOT)
6757 op0 = XEXP (op0, 0);
6758
6759 new_op0 = aarch64_strip_shift (op0);
6760
6761 /* If we had a shift on op0 then this is a logical-shift-
6762 by-register/immediate operation. Otherwise, this is just
6763 a logical operation. */
6764 if (speed)
6765 {
6766 if (new_op0 != op0)
6767 {
6768 /* Shift by immediate. */
6769 if (CONST_INT_P (XEXP (op0, 1)))
6770 *cost += extra_cost->alu.log_shift;
6771 else
6772 *cost += extra_cost->alu.log_shift_reg;
6773 }
6774 else
6775 *cost += extra_cost->alu.logical;
6776 }
6777
6778 /* In both cases we want to cost both operands. */
6779 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6780 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6781
6782 return true;
6783 }
6784 }
6785 return false;
6786
6787 case NOT:
6788 x = XEXP (x, 0);
6789 op0 = aarch64_strip_shift (x);
6790
6791 if (VECTOR_MODE_P (mode))
6792 {
6793 /* Vector NOT. */
6794 *cost += extra_cost->vect.alu;
6795 return false;
6796 }
6797
6798 /* MVN-shifted-reg. */
6799 if (op0 != x)
6800 {
6801 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6802
6803 if (speed)
6804 *cost += extra_cost->alu.log_shift;
6805
6806 return true;
6807 }
6808 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
6809 Handle the second form here, taking care that 'a' in the above can
6810 be a shift. */
6811 else if (GET_CODE (op0) == XOR)
6812 {
6813 rtx newop0 = XEXP (op0, 0);
6814 rtx newop1 = XEXP (op0, 1);
6815 rtx op0_stripped = aarch64_strip_shift (newop0);
6816
6817 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6818 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6819
6820 if (speed)
6821 {
6822 if (op0_stripped != newop0)
6823 *cost += extra_cost->alu.log_shift;
6824 else
6825 *cost += extra_cost->alu.logical;
6826 }
6827
6828 return true;
6829 }
6830 /* MVN. */
6831 if (speed)
6832 *cost += extra_cost->alu.logical;
6833
6834 return false;
6835
6836 case ZERO_EXTEND:
6837
6838 op0 = XEXP (x, 0);
6839 /* If a value is written in SI mode, then zero extended to DI
6840 mode, the operation will in general be free as a write to
6841 a 'w' register implicitly zeroes the upper bits of an 'x'
6842 register. However, if this is
6843
6844 (set (reg) (zero_extend (reg)))
6845
6846 we must cost the explicit register move. */
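/* For example (illustrative only), "add w0, w1, w2" already zeroes
   bits 63:32 of x0, so a following zero_extend:DI of that SImode
   result needs no extra instruction, whereas a bare register-to-register
   zero_extend still needs the MOV costed below.  */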
6847 if (mode == DImode
6848 && GET_MODE (op0) == SImode
6849 && outer == SET)
6850 {
6851 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6852
6853 if (!op_cost && speed)
6854 /* MOV. */
6855 *cost += extra_cost->alu.extend;
6856 else
6857 /* Free, the cost is that of the SI mode operation. */
6858 *cost = op_cost;
6859
6860 return true;
6861 }
6862 else if (MEM_P (op0))
6863 {
6864 /* All loads can zero extend to any size for free. */
6865 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6866 return true;
6867 }
6868
6869 op0 = aarch64_extend_bitfield_pattern_p (x);
6870 if (op0)
6871 {
6872 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6873 if (speed)
6874 *cost += extra_cost->alu.bfx;
6875 return true;
6876 }
6877
6878 if (speed)
6879 {
6880 if (VECTOR_MODE_P (mode))
6881 {
6882 /* UMOV. */
6883 *cost += extra_cost->vect.alu;
6884 }
6885 else
6886 {
6887 /* UXTB/UXTH. */
6888 *cost += extra_cost->alu.extend;
6889 }
6890 }
6891 return false;
6892
6893 case SIGN_EXTEND:
6894 if (MEM_P (XEXP (x, 0)))
6895 {
6896 /* LDRSH. */
6897 if (speed)
6898 {
6899 rtx address = XEXP (XEXP (x, 0), 0);
6900 *cost += extra_cost->ldst.load_sign_extend;
6901
6902 *cost +=
6903 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6904 0, speed));
6905 }
6906 return true;
6907 }
6908
6909 op0 = aarch64_extend_bitfield_pattern_p (x);
6910 if (op0)
6911 {
6912 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6913 if (speed)
6914 *cost += extra_cost->alu.bfx;
6915 return true;
6916 }
6917
6918 if (speed)
6919 {
6920 if (VECTOR_MODE_P (mode))
6921 *cost += extra_cost->vect.alu;
6922 else
6923 *cost += extra_cost->alu.extend;
6924 }
6925 return false;
6926
6927 case ASHIFT:
6928 op0 = XEXP (x, 0);
6929 op1 = XEXP (x, 1);
6930
6931 if (CONST_INT_P (op1))
6932 {
6933 if (speed)
6934 {
6935 if (VECTOR_MODE_P (mode))
6936 {
6937 /* Vector shift (immediate). */
6938 *cost += extra_cost->vect.alu;
6939 }
6940 else
6941 {
6942 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6943 aliases. */
6944 *cost += extra_cost->alu.shift;
6945 }
6946 }
6947
6948 /* We can incorporate zero/sign extend for free. */
6949 if (GET_CODE (op0) == ZERO_EXTEND
6950 || GET_CODE (op0) == SIGN_EXTEND)
6951 op0 = XEXP (op0, 0);
6952
6953 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6954 return true;
6955 }
6956 else
6957 {
6958 if (speed)
6959 {
6960 if (VECTOR_MODE_P (mode))
6961 {
6962 /* Vector shift (register). */
6963 *cost += extra_cost->vect.alu;
6964 }
6965 else
6966 {
6967 /* LSLV. */
6968 *cost += extra_cost->alu.shift_reg;
6969 }
6970 }
6971 return false; /* All arguments need to be in registers. */
6972 }
6973
6974 case ROTATE:
6975 case ROTATERT:
6976 case LSHIFTRT:
6977 case ASHIFTRT:
6978 op0 = XEXP (x, 0);
6979 op1 = XEXP (x, 1);
6980
6981 if (CONST_INT_P (op1))
6982 {
6983 /* ASR (immediate) and friends. */
6984 if (speed)
6985 {
6986 if (VECTOR_MODE_P (mode))
6987 *cost += extra_cost->vect.alu;
6988 else
6989 *cost += extra_cost->alu.shift;
6990 }
6991
6992 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6993 return true;
6994 }
6995 else
6996 {
6997
6998 /* ASR (register) and friends. */
6999 if (speed)
7000 {
7001 if (VECTOR_MODE_P (mode))
7002 *cost += extra_cost->vect.alu;
7003 else
7004 *cost += extra_cost->alu.shift_reg;
7005 }
7006 return false; /* All arguments need to be in registers. */
7007 }
7008
7009 case SYMBOL_REF:
7010
7011 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7012 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7013 {
7014 /* LDR. */
7015 if (speed)
7016 *cost += extra_cost->ldst.load;
7017 }
7018 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7019 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7020 {
7021 /* ADRP, followed by ADD. */
7022 *cost += COSTS_N_INSNS (1);
7023 if (speed)
7024 *cost += 2 * extra_cost->alu.arith;
7025 }
7026 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7027 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7028 {
7029 /* ADR. */
7030 if (speed)
7031 *cost += extra_cost->alu.arith;
7032 }
7033
7034 if (flag_pic)
7035 {
7036 /* One extra load instruction, after accessing the GOT. */
7037 *cost += COSTS_N_INSNS (1);
7038 if (speed)
7039 *cost += extra_cost->ldst.load;
7040 }
7041 return true;
7042
7043 case HIGH:
7044 case LO_SUM:
7045 /* ADRP/ADD (immediate). */
7046 if (speed)
7047 *cost += extra_cost->alu.arith;
7048 return true;
7049
7050 case ZERO_EXTRACT:
7051 case SIGN_EXTRACT:
7052 /* UBFX/SBFX. */
7053 if (speed)
7054 {
7055 if (VECTOR_MODE_P (mode))
7056 *cost += extra_cost->vect.alu;
7057 else
7058 *cost += extra_cost->alu.bfx;
7059 }
7060
7061 /* We can trust that the immediates used will be correct (there
7062 are no by-register forms), so we need only cost op0. */
7063 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7064 return true;
7065
7066 case MULT:
7067 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7068 /* aarch64_rtx_mult_cost always handles recursion to its
7069 operands. */
7070 return true;
7071
7072 case MOD:
7073 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7074 ANDs and a CSNEG. Assume here that CSNEG costs the same as
7075 an unconditional negate. This case should only ever be reached through
7076 the set_smod_pow2_cheap check in expmed.c. */
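/* As an illustration of the expansion described above, for SImode
   "x % 4" the expected sequence is roughly:
   negs  w1, w0
   and   w0, w0, #3
   and   w1, w1, #3
   csneg w0, w0, w1, mi
   hence the baseline of four instructions set below.  */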
7077 if (CONST_INT_P (XEXP (x, 1))
7078 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7079 && (mode == SImode || mode == DImode))
7080 {
7081 /* We expand to 4 instructions. Reset the baseline. */
7082 *cost = COSTS_N_INSNS (4);
7083
7084 if (speed)
7085 *cost += 2 * extra_cost->alu.logical
7086 + 2 * extra_cost->alu.arith;
7087
7088 return true;
7089 }
7090
7091 /* Fall-through. */
7092 case UMOD:
7093 if (speed)
7094 {
7095 if (VECTOR_MODE_P (mode))
7096 *cost += extra_cost->vect.alu;
7097 else if (GET_MODE_CLASS (mode) == MODE_INT)
7098 *cost += (extra_cost->mult[mode == DImode].add
7099 + extra_cost->mult[mode == DImode].idiv);
7100 else if (mode == DFmode)
7101 *cost += (extra_cost->fp[1].mult
7102 + extra_cost->fp[1].div);
7103 else if (mode == SFmode)
7104 *cost += (extra_cost->fp[0].mult
7105 + extra_cost->fp[0].div);
7106 }
7107 return false; /* All arguments need to be in registers. */
7108
7109 case DIV:
7110 case UDIV:
7111 case SQRT:
7112 if (speed)
7113 {
7114 if (VECTOR_MODE_P (mode))
7115 *cost += extra_cost->vect.alu;
7116 else if (GET_MODE_CLASS (mode) == MODE_INT)
7117 /* There is no integer SQRT, so only DIV and UDIV can get
7118 here. */
7119 *cost += extra_cost->mult[mode == DImode].idiv;
7120 else
7121 *cost += extra_cost->fp[mode == DFmode].div;
7122 }
7123 return false; /* All arguments need to be in registers. */
7124
7125 case IF_THEN_ELSE:
7126 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7127 XEXP (x, 2), cost, speed);
7128
7129 case EQ:
7130 case NE:
7131 case GT:
7132 case GTU:
7133 case LT:
7134 case LTU:
7135 case GE:
7136 case GEU:
7137 case LE:
7138 case LEU:
7139
7140 return false; /* All arguments must be in registers. */
7141
7142 case FMA:
7143 op0 = XEXP (x, 0);
7144 op1 = XEXP (x, 1);
7145 op2 = XEXP (x, 2);
7146
7147 if (speed)
7148 {
7149 if (VECTOR_MODE_P (mode))
7150 *cost += extra_cost->vect.alu;
7151 else
7152 *cost += extra_cost->fp[mode == DFmode].fma;
7153 }
7154
7155 /* FMSUB, FNMADD, and FNMSUB are free. */
7156 if (GET_CODE (op0) == NEG)
7157 op0 = XEXP (op0, 0);
7158
7159 if (GET_CODE (op2) == NEG)
7160 op2 = XEXP (op2, 0);
7161
7162 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7163 and the by-element operand as operand 0. */
7164 if (GET_CODE (op1) == NEG)
7165 op1 = XEXP (op1, 0);
7166
7167 /* Catch vector-by-element operations. The by-element operand can
7168 either be (vec_duplicate (vec_select (x))) or just
7169 (vec_select (x)), depending on whether we are multiplying by
7170 a vector or a scalar.
7171
7172 Canonicalization is not very good in these cases, FMA4 will put the
7173 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7174 if (GET_CODE (op0) == VEC_DUPLICATE)
7175 op0 = XEXP (op0, 0);
7176 else if (GET_CODE (op1) == VEC_DUPLICATE)
7177 op1 = XEXP (op1, 0);
7178
7179 if (GET_CODE (op0) == VEC_SELECT)
7180 op0 = XEXP (op0, 0);
7181 else if (GET_CODE (op1) == VEC_SELECT)
7182 op1 = XEXP (op1, 0);
7183
7184 /* If the remaining parameters are not registers,
7185 get the cost to put them into registers. */
7186 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7187 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7188 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7189 return true;
7190
7191 case FLOAT:
7192 case UNSIGNED_FLOAT:
7193 if (speed)
7194 *cost += extra_cost->fp[mode == DFmode].fromint;
7195 return false;
7196
7197 case FLOAT_EXTEND:
7198 if (speed)
7199 {
7200 if (VECTOR_MODE_P (mode))
7201 {
7202 /* Vector widening conversion. */
7203 *cost += extra_cost->vect.alu;
7204 }
7205 else
7206 *cost += extra_cost->fp[mode == DFmode].widen;
7207 }
7208 return false;
7209
7210 case FLOAT_TRUNCATE:
7211 if (speed)
7212 {
7213 if (VECTOR_MODE_P (mode))
7214 {
7215 /* Vector narrowing conversion. */
7216 *cost += extra_cost->vect.alu;
7217 }
7218 else
7219 *cost += extra_cost->fp[mode == DFmode].narrow;
7220 }
7221 return false;
7222
7223 case FIX:
7224 case UNSIGNED_FIX:
7225 x = XEXP (x, 0);
7226 /* Strip the rounding part. They will all be implemented
7227 by the fcvt* family of instructions anyway. */
7228 if (GET_CODE (x) == UNSPEC)
7229 {
7230 unsigned int uns_code = XINT (x, 1);
7231
7232 if (uns_code == UNSPEC_FRINTA
7233 || uns_code == UNSPEC_FRINTM
7234 || uns_code == UNSPEC_FRINTN
7235 || uns_code == UNSPEC_FRINTP
7236 || uns_code == UNSPEC_FRINTZ)
7237 x = XVECEXP (x, 0, 0);
7238 }
7239
7240 if (speed)
7241 {
7242 if (VECTOR_MODE_P (mode))
7243 *cost += extra_cost->vect.alu;
7244 else
7245 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7246 }
7247
7248 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7249 fixed-point fcvt. */
7250 if (GET_CODE (x) == MULT
7251 && ((VECTOR_MODE_P (mode)
7252 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7253 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7254 {
7255 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7256 0, speed);
7257 return true;
7258 }
7259
7260 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7261 return true;
7262
7263 case ABS:
7264 if (VECTOR_MODE_P (mode))
7265 {
7266 /* ABS (vector). */
7267 if (speed)
7268 *cost += extra_cost->vect.alu;
7269 }
7270 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7271 {
7272 op0 = XEXP (x, 0);
7273
7274 /* FABD, which is analogous to FADD. */
7275 if (GET_CODE (op0) == MINUS)
7276 {
7277 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7278 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7279 if (speed)
7280 *cost += extra_cost->fp[mode == DFmode].addsub;
7281
7282 return true;
7283 }
7284 /* Simple FABS is analogous to FNEG. */
7285 if (speed)
7286 *cost += extra_cost->fp[mode == DFmode].neg;
7287 }
7288 else
7289 {
7290 /* Integer ABS will either be split to
7291 two arithmetic instructions, or will be an ABS
7292 (scalar), which we don't model. */
7293 *cost = COSTS_N_INSNS (2);
7294 if (speed)
7295 *cost += 2 * extra_cost->alu.arith;
7296 }
7297 return false;
7298
7299 case SMAX:
7300 case SMIN:
7301 if (speed)
7302 {
7303 if (VECTOR_MODE_P (mode))
7304 *cost += extra_cost->vect.alu;
7305 else
7306 {
7307 /* FMAXNM/FMINNM/FMAX/FMIN.
7308 TODO: This may not be accurate for all implementations, but
7309 we do not model this in the cost tables. */
7310 *cost += extra_cost->fp[mode == DFmode].addsub;
7311 }
7312 }
7313 return false;
7314
7315 case UNSPEC:
7316 /* The floating point round to integer frint* instructions. */
7317 if (aarch64_frint_unspec_p (XINT (x, 1)))
7318 {
7319 if (speed)
7320 *cost += extra_cost->fp[mode == DFmode].roundint;
7321
7322 return false;
7323 }
7324
7325 if (XINT (x, 1) == UNSPEC_RBIT)
7326 {
7327 if (speed)
7328 *cost += extra_cost->alu.rev;
7329
7330 return false;
7331 }
7332 break;
7333
7334 case TRUNCATE:
7335
7336 /* Decompose <su>muldi3_highpart. */
7337 if (/* (truncate:DI */
7338 mode == DImode
7339 /* (lshiftrt:TI */
7340 && GET_MODE (XEXP (x, 0)) == TImode
7341 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7342 /* (mult:TI */
7343 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7344 /* (ANY_EXTEND:TI (reg:DI))
7345 (ANY_EXTEND:TI (reg:DI))) */
7346 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7347 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7348 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7349 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7350 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7351 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7352 /* (const_int 64) */
7353 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7354 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7355 {
7356 /* UMULH/SMULH. */
7357 if (speed)
7358 *cost += extra_cost->mult[mode == DImode].extend;
7359 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7360 mode, MULT, 0, speed);
7361 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7362 mode, MULT, 1, speed);
7363 return true;
7364 }
7365
7366 /* Fall through. */
7367 default:
7368 break;
7369 }
7370
7371 if (dump_file && (dump_flags & TDF_DETAILS))
7372 fprintf (dump_file,
7373 "\nFailed to cost RTX. Assuming default cost.\n");
7374
7375 return true;
7376 }
7377
7378 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7379 calculated for X. This cost is stored in *COST. Returns true
7380 if the total cost of X was calculated. */
7381 static bool
7382 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7383 int param, int *cost, bool speed)
7384 {
7385 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7386
7387 if (dump_file && (dump_flags & TDF_DETAILS))
7388 {
7389 print_rtl_single (dump_file, x);
7390 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7391 speed ? "Hot" : "Cold",
7392 *cost, result ? "final" : "partial");
7393 }
7394
7395 return result;
7396 }
7397
7398 static int
7399 aarch64_register_move_cost (machine_mode mode,
7400 reg_class_t from_i, reg_class_t to_i)
7401 {
7402 enum reg_class from = (enum reg_class) from_i;
7403 enum reg_class to = (enum reg_class) to_i;
7404 const struct cpu_regmove_cost *regmove_cost
7405 = aarch64_tune_params.regmove_cost;
7406
7407 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7408 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7409 to = GENERAL_REGS;
7410
7411 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7412 from = GENERAL_REGS;
7413
7414 /* Moving between GPR and stack cost is the same as GP2GP. */
7415 if ((from == GENERAL_REGS && to == STACK_REG)
7416 || (to == GENERAL_REGS && from == STACK_REG))
7417 return regmove_cost->GP2GP;
7418
7419 /* To/From the stack register, we move via the gprs. */
7420 if (to == STACK_REG || from == STACK_REG)
7421 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7422 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7423
7424 if (GET_MODE_SIZE (mode) == 16)
7425 {
7426 /* 128-bit operations on general registers require 2 instructions. */
7427 if (from == GENERAL_REGS && to == GENERAL_REGS)
7428 return regmove_cost->GP2GP * 2;
7429 else if (from == GENERAL_REGS)
7430 return regmove_cost->GP2FP * 2;
7431 else if (to == GENERAL_REGS)
7432 return regmove_cost->FP2GP * 2;
7433
7434 /* When AdvSIMD instructions are disabled it is not possible to move
7435 a 128-bit value directly between Q registers. This is handled in
7436 secondary reload. A general register is used as a scratch to move
7437 the upper DI value and the lower DI value is moved directly,
7438 hence the cost is the sum of three moves. */
7439 if (! TARGET_SIMD)
7440 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7441
7442 return regmove_cost->FP2FP;
7443 }
7444
7445 if (from == GENERAL_REGS && to == GENERAL_REGS)
7446 return regmove_cost->GP2GP;
7447 else if (from == GENERAL_REGS)
7448 return regmove_cost->GP2FP;
7449 else if (to == GENERAL_REGS)
7450 return regmove_cost->FP2GP;
7451
7452 return regmove_cost->FP2FP;
7453 }
7454
7455 static int
7456 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7457 reg_class_t rclass ATTRIBUTE_UNUSED,
7458 bool in ATTRIBUTE_UNUSED)
7459 {
7460 return aarch64_tune_params.memmov_cost;
7461 }
7462
7463 /* Return true if it is safe and beneficial to use the rsqrt optabs to
7464 optimize 1.0/sqrt. */
7465
7466 static bool
7467 use_rsqrt_p (void)
7468 {
7469 return (!flag_trapping_math
7470 && flag_unsafe_math_optimizations
7471 && ((aarch64_tune_params.extra_tuning_flags
7472 & AARCH64_EXTRA_TUNE_RECIP_SQRT)
7473 || flag_mrecip_low_precision_sqrt));
7474 }
7475
7476 /* Function to decide when to use
7477 reciprocal square root builtins. */
7478
7479 static tree
7480 aarch64_builtin_reciprocal (tree fndecl)
7481 {
7482 if (!use_rsqrt_p ())
7483 return NULL_TREE;
7484 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7485 }
7486
7487 typedef rtx (*rsqrte_type) (rtx, rtx);
7488
7489 /* Select reciprocal square root initial estimate
7490 insn depending on machine mode. */
7491
7492 rsqrte_type
7493 get_rsqrte_type (machine_mode mode)
7494 {
7495 switch (mode)
7496 {
7497 case DFmode: return gen_aarch64_rsqrte_df2;
7498 case SFmode: return gen_aarch64_rsqrte_sf2;
7499 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7500 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7501 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7502 default: gcc_unreachable ();
7503 }
7504 }
7505
7506 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7507
7508 /* Select reciprocal square root Newton-Raphson step
7509 insn depending on machine mode. */
7510
7511 rsqrts_type
7512 get_rsqrts_type (machine_mode mode)
7513 {
7514 switch (mode)
7515 {
7516 case DFmode: return gen_aarch64_rsqrts_df3;
7517 case SFmode: return gen_aarch64_rsqrts_sf3;
7518 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7519 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7520 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7521 default: gcc_unreachable ();
7522 }
7523 }
7524
7525 /* Emit instruction sequence to compute
7526 reciprocal square root. Use two Newton-Raphson steps
7527 for single precision and three for double precision. */
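/* Background note: each step below uses the FRSQRTS instruction, which
   computes (3.0 - a * b) / 2.0, so with x0 = frsqrte (src) a single
   iteration produces
   x1 = x0 * frsqrts (src, x0 * x0)
      = x0 * (3.0 - src * x0 * x0) / 2.0,
   the standard Newton-Raphson refinement of 1/sqrt (src).  */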
7528
7529 void
7530 aarch64_emit_swrsqrt (rtx dst, rtx src)
7531 {
7532 machine_mode mode = GET_MODE (src);
7533 gcc_assert (
7534 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7535 || mode == DFmode || mode == V2DFmode);
7536
7537 rtx xsrc = gen_reg_rtx (mode);
7538 emit_move_insn (xsrc, src);
7539 rtx x0 = gen_reg_rtx (mode);
7540
7541 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7542
7543 bool double_mode = (mode == DFmode || mode == V2DFmode);
7544
7545 int iterations = double_mode ? 3 : 2;
7546
7547 if (flag_mrecip_low_precision_sqrt)
7548 iterations--;
7549
7550 for (int i = 0; i < iterations; ++i)
7551 {
7552 rtx x1 = gen_reg_rtx (mode);
7553 rtx x2 = gen_reg_rtx (mode);
7554 rtx x3 = gen_reg_rtx (mode);
7555 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7556
7557 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7558
7559 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7560 x0 = x1;
7561 }
7562
7563 emit_move_insn (dst, x0);
7564 }
7565
7566 /* Return the number of instructions that can be issued per cycle. */
7567 static int
7568 aarch64_sched_issue_rate (void)
7569 {
7570 return aarch64_tune_params.issue_rate;
7571 }
7572
7573 static int
7574 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7575 {
7576 int issue_rate = aarch64_sched_issue_rate ();
7577
7578 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7579 }
7580
7581
7582 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7583 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7584 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7585
7586 static int
7587 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7588 int ready_index)
7589 {
7590 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7591 }
7592
7593
7594 /* Vectorizer cost model target hooks. */
7595
7596 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7597 static int
7598 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7599 tree vectype,
7600 int misalign ATTRIBUTE_UNUSED)
7601 {
7602 unsigned elements;
7603
7604 switch (type_of_cost)
7605 {
7606 case scalar_stmt:
7607 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7608
7609 case scalar_load:
7610 return aarch64_tune_params.vec_costs->scalar_load_cost;
7611
7612 case scalar_store:
7613 return aarch64_tune_params.vec_costs->scalar_store_cost;
7614
7615 case vector_stmt:
7616 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7617
7618 case vector_load:
7619 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7620
7621 case vector_store:
7622 return aarch64_tune_params.vec_costs->vec_store_cost;
7623
7624 case vec_to_scalar:
7625 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7626
7627 case scalar_to_vec:
7628 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7629
7630 case unaligned_load:
7631 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7632
7633 case unaligned_store:
7634 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7635
7636 case cond_branch_taken:
7637 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7638
7639 case cond_branch_not_taken:
7640 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7641
7642 case vec_perm:
7643 return aarch64_tune_params.vec_costs->vec_permute_cost;
7644
7645 case vec_promote_demote:
7646 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7647
7648 case vec_construct:
7649 elements = TYPE_VECTOR_SUBPARTS (vectype);
7650 return elements / 2 + 1;
7651
7652 default:
7653 gcc_unreachable ();
7654 }
7655 }
7656
7657 /* Implement targetm.vectorize.add_stmt_cost. */
7658 static unsigned
7659 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7660 struct _stmt_vec_info *stmt_info, int misalign,
7661 enum vect_cost_model_location where)
7662 {
7663 unsigned *cost = (unsigned *) data;
7664 unsigned retval = 0;
7665
7666 if (flag_vect_cost_model)
7667 {
7668 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7669 int stmt_cost =
7670 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7671
7672 /* Statements in an inner loop relative to the loop being
7673 vectorized are weighted more heavily. The value here is
7674 arbitrary and could potentially be improved with analysis. */
7675 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7676 count *= 50; /* FIXME */
7677
7678 retval = (unsigned) (count * stmt_cost);
7679 cost[where] += retval;
7680 }
7681
7682 return retval;
7683 }
7684
7685 static void initialize_aarch64_code_model (struct gcc_options *);
7686
7687 /* Enum describing the various ways that the
7688 aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7689 This way their callers can choose what kind of error to give. */
7690
7691 enum aarch64_parse_opt_result
7692 {
7693 AARCH64_PARSE_OK, /* Parsing was successful. */
7694 AARCH64_PARSE_MISSING_ARG, /* Missing argument. */
7695 AARCH64_PARSE_INVALID_FEATURE, /* Invalid feature modifier. */
7696 AARCH64_PARSE_INVALID_ARG /* Invalid arch, tune, cpu arg. */
7697 };
7698
7699 /* Parse the architecture extension string STR and update ISA_FLAGS
7700 with the architecture features turned on or off. Return an
7701 aarch64_parse_opt_result describing the result. */
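/* For example (illustrative only), given STR == "+crc+nocrypto" the loop
   below first sets the flags for the "crc" extension in *ISA_FLAGS and
   then clears the flags for the "crypto" extension; an empty extension
   name yields AARCH64_PARSE_MISSING_ARG and an unrecognised one yields
   AARCH64_PARSE_INVALID_FEATURE.  */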
7702
7703 static enum aarch64_parse_opt_result
7704 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7705 {
7706 /* The extension string is parsed left to right. */
7707 const struct aarch64_option_extension *opt = NULL;
7708
7709 /* Flag to say whether we are adding or removing an extension. */
7710 int adding_ext = -1;
7711
7712 while (str != NULL && *str != 0)
7713 {
7714 char *ext;
7715 size_t len;
7716
7717 str++;
7718 ext = strchr (str, '+');
7719
7720 if (ext != NULL)
7721 len = ext - str;
7722 else
7723 len = strlen (str);
7724
7725 if (len >= 2 && strncmp (str, "no", 2) == 0)
7726 {
7727 adding_ext = 0;
7728 len -= 2;
7729 str += 2;
7730 }
7731 else if (len > 0)
7732 adding_ext = 1;
7733
7734 if (len == 0)
7735 return AARCH64_PARSE_MISSING_ARG;
7736
7737
7738 /* Scan over the extensions table trying to find an exact match. */
7739 for (opt = all_extensions; opt->name != NULL; opt++)
7740 {
7741 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7742 {
7743 /* Add or remove the extension. */
7744 if (adding_ext)
7745 *isa_flags |= opt->flags_on;
7746 else
7747 *isa_flags &= ~(opt->flags_off);
7748 break;
7749 }
7750 }
7751
7752 if (opt->name == NULL)
7753 {
7754 /* Extension not found in list. */
7755 return AARCH64_PARSE_INVALID_FEATURE;
7756 }
7757
7758 str = ext;
7759 }
7760
7761 return AARCH64_PARSE_OK;
7762 }
7763
7764 /* Parse the TO_PARSE string and put the architecture struct that it
7765 selects into RES and the architectural features into ISA_FLAGS.
7766 Return an aarch64_parse_opt_result describing the parse result.
7767 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7768
7769 static enum aarch64_parse_opt_result
7770 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7771 unsigned long *isa_flags)
7772 {
7773 char *ext;
7774 const struct processor *arch;
7775 char *str = (char *) alloca (strlen (to_parse) + 1);
7776 size_t len;
7777
7778 strcpy (str, to_parse);
7779
7780 ext = strchr (str, '+');
7781
7782 if (ext != NULL)
7783 len = ext - str;
7784 else
7785 len = strlen (str);
7786
7787 if (len == 0)
7788 return AARCH64_PARSE_MISSING_ARG;
7789
7790
7791 /* Loop through the list of supported ARCHes to find a match. */
7792 for (arch = all_architectures; arch->name != NULL; arch++)
7793 {
7794 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7795 {
7796 unsigned long isa_temp = arch->flags;
7797
7798 if (ext != NULL)
7799 {
7800 /* TO_PARSE string contains at least one extension. */
7801 enum aarch64_parse_opt_result ext_res
7802 = aarch64_parse_extension (ext, &isa_temp);
7803
7804 if (ext_res != AARCH64_PARSE_OK)
7805 return ext_res;
7806 }
7807 /* Extension parsing was successful. Confirm the result
7808 arch and ISA flags. */
7809 *res = arch;
7810 *isa_flags = isa_temp;
7811 return AARCH64_PARSE_OK;
7812 }
7813 }
7814
7815 /* ARCH name not found in list. */
7816 return AARCH64_PARSE_INVALID_ARG;
7817 }
7818
7819 /* Parse the TO_PARSE string and put the CPU it selects into RES and
7820 its architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
7821 describing the parse result. If there is an error parsing, RES and
7822 ISA_FLAGS are left unchanged. */
7823
7824 static enum aarch64_parse_opt_result
7825 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7826 unsigned long *isa_flags)
7827 {
7828 char *ext;
7829 const struct processor *cpu;
7830 char *str = (char *) alloca (strlen (to_parse) + 1);
7831 size_t len;
7832
7833 strcpy (str, to_parse);
7834
7835 ext = strchr (str, '+');
7836
7837 if (ext != NULL)
7838 len = ext - str;
7839 else
7840 len = strlen (str);
7841
7842 if (len == 0)
7843 return AARCH64_PARSE_MISSING_ARG;
7844
7845
7846 /* Loop through the list of supported CPUs to find a match. */
7847 for (cpu = all_cores; cpu->name != NULL; cpu++)
7848 {
7849 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7850 {
7851 unsigned long isa_temp = cpu->flags;
7852
7853
7854 if (ext != NULL)
7855 {
7856 /* TO_PARSE string contains at least one extension. */
7857 enum aarch64_parse_opt_result ext_res
7858 = aarch64_parse_extension (ext, &isa_temp);
7859
7860 if (ext_res != AARCH64_PARSE_OK)
7861 return ext_res;
7862 }
7863 /* Extension parsing was successful. Confirm the result
7864 cpu and ISA flags. */
7865 *res = cpu;
7866 *isa_flags = isa_temp;
7867 return AARCH64_PARSE_OK;
7868 }
7869 }
7870
7871 /* CPU name not found in list. */
7872 return AARCH64_PARSE_INVALID_ARG;
7873 }
7874
7875 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7876 Return an aarch64_parse_opt_result describing the parse result.
7877 If the parsing fails, RES does not change. */
7878
7879 static enum aarch64_parse_opt_result
7880 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7881 {
7882 const struct processor *cpu;
7883 char *str = (char *) alloca (strlen (to_parse) + 1);
7884
7885 strcpy (str, to_parse);
7886
7887 /* Loop through the list of supported CPUs to find a match. */
7888 for (cpu = all_cores; cpu->name != NULL; cpu++)
7889 {
7890 if (strcmp (cpu->name, str) == 0)
7891 {
7892 *res = cpu;
7893 return AARCH64_PARSE_OK;
7894 }
7895 }
7896
7897 /* CPU name not found in list. */
7898 return AARCH64_PARSE_INVALID_ARG;
7899 }
7900
7901 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7902 described in FLAG. If it is, return the index bit for that fusion type.
7903 If not, error (printing OPTION_NAME) and return zero. */
7904
7905 static unsigned int
7906 aarch64_parse_one_option_token (const char *token,
7907 size_t length,
7908 const struct aarch64_flag_desc *flag,
7909 const char *option_name)
7910 {
7911 for (; flag->name != NULL; flag++)
7912 {
7913 if (length == strlen (flag->name)
7914 && !strncmp (flag->name, token, length))
7915 return flag->flag;
7916 }
7917
7918 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7919 return 0;
7920 }
7921
7922 /* Parse OPTION which is a comma-separated list of flags to enable.
7923 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7924 default state we inherit from the CPU tuning structures. OPTION_NAME
7925 gives the top-level option we are parsing in the -moverride string,
7926 for use in error messages. */
7927
7928 static unsigned int
7929 aarch64_parse_boolean_options (const char *option,
7930 const struct aarch64_flag_desc *flags,
7931 unsigned int initial_state,
7932 const char *option_name)
7933 {
7934 const char separator = '.';
7935 const char* specs = option;
7936 const char* ntoken = option;
7937 unsigned int found_flags = initial_state;
7938
7939 while ((ntoken = strchr (specs, separator)))
7940 {
7941 size_t token_length = ntoken - specs;
7942 unsigned token_ops = aarch64_parse_one_option_token (specs,
7943 token_length,
7944 flags,
7945 option_name);
7946 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7947 in the token stream, reset the supported operations. So:
7948
7949 adrp+add.cmp+branch.none.adrp+add
7950
7951 would have the result of turning on only adrp+add fusion. */
7952 if (!token_ops)
7953 found_flags = 0;
7954
7955 found_flags |= token_ops;
7956 specs = ++ntoken;
7957 }
7958
7959 /* The string ended with a trailing separator; diagnose it as ill-formed. */
7960 if (!(*specs))
7961 {
7962 error ("%s string ill-formed\n", option_name);
7963 return 0;
7964 }
7965
7966 /* We still have one more token to parse. */
7967 size_t token_length = strlen (specs);
7968 unsigned token_ops = aarch64_parse_one_option_token (specs,
7969 token_length,
7970 flags,
7971 option_name);
7972 if (!token_ops)
7973 found_flags = 0;
7974
7975 found_flags |= token_ops;
7976 return found_flags;
7977 }
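
/* Worked example, using the string from the comment above,
   "adrp+add.cmp+branch.none.adrp+add", and assuming "none" maps to the
   zero flag while the other names map to their fusion bits:

     "adrp+add"   -> found_flags |= <adrp+add bit>
     "cmp+branch" -> found_flags |= <cmp+branch bit>
     "none"       -> token_ops == 0, so found_flags is reset to 0
     "adrp+add"   -> found_flags |= <adrp+add bit>

   leaving only adrp+add fusion enabled, as the comment above describes. */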
7978
7979 /* Support for overriding instruction fusion. */
7980
7981 static void
7982 aarch64_parse_fuse_string (const char *fuse_string,
7983 struct tune_params *tune)
7984 {
7985 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7986 aarch64_fusible_pairs,
7987 tune->fusible_ops,
7988 "fuse=");
7989 }
7990
7991 /* Support for overriding other tuning flags. */
7992
7993 static void
7994 aarch64_parse_tune_string (const char *tune_string,
7995 struct tune_params *tune)
7996 {
7997 tune->extra_tuning_flags
7998 = aarch64_parse_boolean_options (tune_string,
7999 aarch64_tuning_flags,
8000 tune->extra_tuning_flags,
8001 "tune=");
8002 }
8003
8004 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8005 we understand. If it is, extract the option string and hand off to
8006 the appropriate function. */
8007
8008 void
8009 aarch64_parse_one_override_token (const char* token,
8010 size_t length,
8011 struct tune_params *tune)
8012 {
8013 const struct aarch64_tuning_override_function *fn
8014 = aarch64_tuning_override_functions;
8015
8016 const char *option_part = strchr (token, '=');
8017 if (!option_part)
8018 {
8019 error ("tuning string missing in option (%s)", token);
8020 return;
8021 }
8022
8023 /* Get the length of the option name. */
8024 length = option_part - token;
8025 /* Skip the '=' to get to the option string. */
8026 option_part++;
8027
8028 for (; fn->name != NULL; fn++)
8029 {
8030 if (!strncmp (fn->name, token, length))
8031 {
8032 fn->parse_override (option_part, tune);
8033 return;
8034 }
8035 }
8036
8037 error ("unknown tuning option (%s)",token);
8038 return;
8039 }
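
/* For example, a TOKEN of "fuse=adrp+add" is split at the '=' into the
   option name "fuse" and the option string "adrp+add"; the name is looked
   up in aarch64_tuning_override_functions (where it presumably maps to
   aarch64_parse_fuse_string above) and the option string is handed to that
   parser together with TUNE. The flag name is illustrative only. */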
8040
8041 /* Validate and clamp the requested TLS size for the selected code model. */
8042
8043 static void
8044 initialize_aarch64_tls_size (struct gcc_options *opts)
8045 {
8046 if (aarch64_tls_size == 0)
8047 aarch64_tls_size = 24;
8048
8049 switch (opts->x_aarch64_cmodel_var)
8050 {
8051 case AARCH64_CMODEL_TINY:
8052 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8053 needs two instructions to address, so we clamp the size to 24. */
8054 if (aarch64_tls_size > 24)
8055 aarch64_tls_size = 24;
8056 break;
8057 case AARCH64_CMODEL_SMALL:
8058 /* The maximum TLS size allowed under small is 4G. */
8059 if (aarch64_tls_size > 32)
8060 aarch64_tls_size = 32;
8061 break;
8062 case AARCH64_CMODEL_LARGE:
8063 /* The maximum TLS size allowed under large is 16E.
8064 FIXME: 16E should be 64 bits; we only support a 48-bit offset now. */
8065 if (aarch64_tls_size > 48)
8066 aarch64_tls_size = 48;
8067 break;
8068 default:
8069 gcc_unreachable ();
8070 }
8071
8072 return;
8073 }
8074
8075 /* Parse STRING looking for options in the format:
8076 string :: option:string
8077 option :: name=substring
8078 name :: {a-z}
8079 substring :: defined by option. */
8080
8081 static void
8082 aarch64_parse_override_string (const char* input_string,
8083 struct tune_params* tune)
8084 {
8085 const char separator = ':';
8086 size_t string_length = strlen (input_string) + 1;
8087 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8088 char *string = string_root;
8089 strncpy (string, input_string, string_length);
8090 string[string_length - 1] = '\0';
8091
8092 char* ntoken = string;
8093
8094 while ((ntoken = strchr (string, separator)))
8095 {
8096 size_t token_length = ntoken - string;
8097 /* Make this substring look like a string. */
8098 *ntoken = '\0';
8099 aarch64_parse_one_override_token (string, token_length, tune);
8100 string = ++ntoken;
8101 }
8102
8103 /* One last option to parse. */
8104 aarch64_parse_one_override_token (string, strlen (string), tune);
8105 free (string_root);
8106 }
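
/* Putting the pieces together, an -moverride string follows the grammar in
   the comment above, e.g. (flag names purely illustrative):

       -moverride=tune=some_flag.another_flag:fuse=adrp+add

   ':' separates top-level options, '=' separates an option name from its
   value, and '.' separates the boolean flags inside a value, as handled by
   aarch64_parse_boolean_options above. */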
8107
8108
8109 static void
8110 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8111 {
8112 if (opts->x_flag_omit_frame_pointer)
8113 opts->x_flag_omit_leaf_frame_pointer = false;
8114 else if (opts->x_flag_omit_leaf_frame_pointer)
8115 opts->x_flag_omit_frame_pointer = true;
8116
8117 /* If not optimizing for size, set the default
8118 alignment to what the target wants. */
8119 if (!opts->x_optimize_size)
8120 {
8121 if (opts->x_align_loops <= 0)
8122 opts->x_align_loops = aarch64_tune_params.loop_align;
8123 if (opts->x_align_jumps <= 0)
8124 opts->x_align_jumps = aarch64_tune_params.jump_align;
8125 if (opts->x_align_functions <= 0)
8126 opts->x_align_functions = aarch64_tune_params.function_align;
8127 }
8128
8129 /* If nopcrelative_literal_loads is set on the command line, this
8130 implies that the user asked for PC relative literal loads. */
8131 if (opts->x_nopcrelative_literal_loads == 1)
8132 aarch64_nopcrelative_literal_loads = false;
8133
8134 /* If it is not set on the command line, we default to no
8135 pc relative literal loads. */
8136 if (opts->x_nopcrelative_literal_loads == 2)
8137 aarch64_nopcrelative_literal_loads = true;
8138
8139 /* In the tiny memory model it makes no sense
8140 to disallow non PC relative literal pool loads
8141 as many other things will break anyway. */
8142 if (opts->x_nopcrelative_literal_loads
8143 && (aarch64_cmodel == AARCH64_CMODEL_TINY
8144 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
8145 aarch64_nopcrelative_literal_loads = false;
8146 }
8147
8148 /* 'Unpack' the internal tuning structs and update the options
8149 in OPTS. The caller must have set up selected_tune and selected_arch
8150 as all the other target-specific codegen decisions are
8151 derived from them. */
8152
8153 void
8154 aarch64_override_options_internal (struct gcc_options *opts)
8155 {
8156 aarch64_tune_flags = selected_tune->flags;
8157 aarch64_tune = selected_tune->sched_core;
8158 /* Make a copy of the tuning parameters attached to the core, which
8159 we may later overwrite. */
8160 aarch64_tune_params = *(selected_tune->tune);
8161 aarch64_architecture_version = selected_arch->architecture_version;
8162
8163 if (opts->x_aarch64_override_tune_string)
8164 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8165 &aarch64_tune_params);
8166
8167 /* This target defaults to strict volatile bitfields. */
8168 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8169 opts->x_flag_strict_volatile_bitfields = 1;
8170
8171 initialize_aarch64_code_model (opts);
8172 initialize_aarch64_tls_size (opts);
8173
8174 int queue_depth = 0;
8175 switch (aarch64_tune_params.autoprefetcher_model)
8176 {
8177 case tune_params::AUTOPREFETCHER_OFF:
8178 queue_depth = -1;
8179 break;
8180 case tune_params::AUTOPREFETCHER_WEAK:
8181 queue_depth = 0;
8182 break;
8183 case tune_params::AUTOPREFETCHER_STRONG:
8184 queue_depth = max_insn_queue_index + 1;
8185 break;
8186 default:
8187 gcc_unreachable ();
8188 }
8189
8190 /* We don't mind passing in global_options_set here as we don't use
8191 the *options_set structs anyway. */
8192 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8193 queue_depth,
8194 opts->x_param_values,
8195 global_options_set.x_param_values);
8196
8197 /* Set the L1 cache line size. */
8198 if (selected_cpu->tune->cache_line_size != 0)
8199 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8200 selected_cpu->tune->cache_line_size,
8201 opts->x_param_values,
8202 global_options_set.x_param_values);
8203
8204 aarch64_override_options_after_change_1 (opts);
8205 }
8206
8207 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8208 specified in STR and throw errors if appropriate. Put the results,
8209 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8210 valid. */
8211
8212 static bool
8213 aarch64_validate_mcpu (const char *str, const struct processor **res,
8214 unsigned long *isa_flags)
8215 {
8216 enum aarch64_parse_opt_result parse_res
8217 = aarch64_parse_cpu (str, res, isa_flags);
8218
8219 if (parse_res == AARCH64_PARSE_OK)
8220 return true;
8221
8222 switch (parse_res)
8223 {
8224 case AARCH64_PARSE_MISSING_ARG:
8225 error ("missing cpu name in -mcpu=%qs", str);
8226 break;
8227 case AARCH64_PARSE_INVALID_ARG:
8228 error ("unknown value %qs for -mcpu", str);
8229 break;
8230 case AARCH64_PARSE_INVALID_FEATURE:
8231 error ("invalid feature modifier in -mcpu=%qs", str);
8232 break;
8233 default:
8234 gcc_unreachable ();
8235 }
8236
8237 return false;
8238 }
8239
8240 /* Validate a command-line -march option. Parse the arch and extensions
8241 (if any) specified in STR and throw errors if appropriate. Put the
8242 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8243 option is valid. */
8244
8245 static bool
8246 aarch64_validate_march (const char *str, const struct processor **res,
8247 unsigned long *isa_flags)
8248 {
8249 enum aarch64_parse_opt_result parse_res
8250 = aarch64_parse_arch (str, res, isa_flags);
8251
8252 if (parse_res == AARCH64_PARSE_OK)
8253 return true;
8254
8255 switch (parse_res)
8256 {
8257 case AARCH64_PARSE_MISSING_ARG:
8258 error ("missing arch name in -march=%qs", str);
8259 break;
8260 case AARCH64_PARSE_INVALID_ARG:
8261 error ("unknown value %qs for -march", str);
8262 break;
8263 case AARCH64_PARSE_INVALID_FEATURE:
8264 error ("invalid feature modifier in -march=%qs", str);
8265 break;
8266 default:
8267 gcc_unreachable ();
8268 }
8269
8270 return false;
8271 }
8272
8273 /* Validate a command-line -mtune option. Parse the cpu
8274 specified in STR and throw errors if appropriate. Put the
8275 result, if it is valid, in RES. Return whether the option is
8276 valid. */
8277
8278 static bool
8279 aarch64_validate_mtune (const char *str, const struct processor **res)
8280 {
8281 enum aarch64_parse_opt_result parse_res
8282 = aarch64_parse_tune (str, res);
8283
8284 if (parse_res == AARCH64_PARSE_OK)
8285 return true;
8286
8287 switch (parse_res)
8288 {
8289 case AARCH64_PARSE_MISSING_ARG:
8290 error ("missing cpu name in -mtune=%qs", str);
8291 break;
8292 case AARCH64_PARSE_INVALID_ARG:
8293 error ("unknown value %qs for -mtune", str);
8294 break;
8295 default:
8296 gcc_unreachable ();
8297 }
8298 return false;
8299 }
8300
8301 /* Return the CPU corresponding to the enum CPU.
8302 If it doesn't specify a cpu, return the default. */
8303
8304 static const struct processor *
8305 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8306 {
8307 if (cpu != aarch64_none)
8308 return &all_cores[cpu];
8309
8310 /* The & 0x3f is to extract the bottom 6 bits that encode the
8311 default cpu as selected by the --with-cpu GCC configure option
8312 in config.gcc.
8313 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8314 flags mechanism should be reworked to make it more sane. */
8315 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8316 }
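
/* As the comment above notes, the bottom 6 bits of TARGET_CPU_DEFAULT
   encode the configure-time CPU; the remaining bits carry that CPU's
   default ISA flags, which is why aarch64_override_options below seeds
   aarch64_isa_flags with TARGET_CPU_DEFAULT >> 6 when no -mcpu or -march
   is given. */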
8317
8318 /* Return the architecture corresponding to the enum ARCH.
8319 If it doesn't specify a valid architecture, return the default. */
8320
8321 static const struct processor *
8322 aarch64_get_arch (enum aarch64_arch arch)
8323 {
8324 if (arch != aarch64_no_arch)
8325 return &all_architectures[arch];
8326
8327 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8328
8329 return &all_architectures[cpu->arch];
8330 }
8331
8332 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
8333 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
8334 tuning structs. In particular it must set selected_tune and
8335 aarch64_isa_flags that define the available ISA features and tuning
8336 decisions. It must also set selected_arch as this will be used to
8337 output the .arch asm tags for each function. */
8338
8339 static void
8340 aarch64_override_options (void)
8341 {
8342 unsigned long cpu_isa = 0;
8343 unsigned long arch_isa = 0;
8344 aarch64_isa_flags = 0;
8345
8346 bool valid_cpu = true;
8347 bool valid_tune = true;
8348 bool valid_arch = true;
8349
8350 selected_cpu = NULL;
8351 selected_arch = NULL;
8352 selected_tune = NULL;
8353
8354 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8355 If either of -march or -mtune is given, they override their
8356 respective component of -mcpu. */
8357 if (aarch64_cpu_string)
8358 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8359 &cpu_isa);
8360
8361 if (aarch64_arch_string)
8362 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8363 &arch_isa);
8364
8365 if (aarch64_tune_string)
8366 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8367
8368 /* If the user did not specify a processor, choose the default
8369 one for them. This will be the CPU set during configuration using
8370 --with-cpu, otherwise it is "generic". */
8371 if (!selected_cpu)
8372 {
8373 if (selected_arch)
8374 {
8375 selected_cpu = &all_cores[selected_arch->ident];
8376 aarch64_isa_flags = arch_isa;
8377 explicit_arch = selected_arch->arch;
8378 }
8379 else
8380 {
8381 /* Get default configure-time CPU. */
8382 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8383 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8384 }
8385
8386 if (selected_tune)
8387 explicit_tune_core = selected_tune->ident;
8388 }
8389 /* If both -mcpu and -march are specified, check that they are architecturally
8390 compatible, warn if they're not and prefer the -march ISA flags. */
8391 else if (selected_arch)
8392 {
8393 if (selected_arch->arch != selected_cpu->arch)
8394 {
8395 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8396 all_architectures[selected_cpu->arch].name,
8397 selected_arch->name);
8398 }
8399 aarch64_isa_flags = arch_isa;
8400 explicit_arch = selected_arch->arch;
8401 explicit_tune_core = selected_tune ? selected_tune->ident
8402 : selected_cpu->ident;
8403 }
8404 else
8405 {
8406 /* -mcpu but no -march. */
8407 aarch64_isa_flags = cpu_isa;
8408 explicit_tune_core = selected_tune ? selected_tune->ident
8409 : selected_cpu->ident;
8410 gcc_assert (selected_cpu);
8411 selected_arch = &all_architectures[selected_cpu->arch];
8412 explicit_arch = selected_arch->arch;
8413 }
8414
8415 /* Set the arch as well, as we will need it when outputting
8416 the .arch directive in assembly. */
8417 if (!selected_arch)
8418 {
8419 gcc_assert (selected_cpu);
8420 selected_arch = &all_architectures[selected_cpu->arch];
8421 }
8422
8423 if (!selected_tune)
8424 selected_tune = selected_cpu;
8425
8426 #ifndef HAVE_AS_MABI_OPTION
8427 /* The compiler may have been configured with 2.23.* binutils, which does
8428 not have support for ILP32. */
8429 if (TARGET_ILP32)
8430 error ("Assembler does not support -mabi=ilp32");
8431 #endif
8432
8433 /* Make sure we properly set up the explicit options. */
8434 if ((aarch64_cpu_string && valid_cpu)
8435 || (aarch64_tune_string && valid_tune))
8436 gcc_assert (explicit_tune_core != aarch64_none);
8437
8438 if ((aarch64_cpu_string && valid_cpu)
8439 || (aarch64_arch_string && valid_arch))
8440 gcc_assert (explicit_arch != aarch64_no_arch);
8441
8442 aarch64_override_options_internal (&global_options);
8443
8444 /* Save these options as the default ones in case we push and pop them later
8445 while processing functions with potential target attributes. */
8446 target_option_default_node = target_option_current_node
8447 = build_target_option_node (&global_options);
8448
8449 aarch64_register_fma_steering ();
8450
8451 }
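
/* Summarising the precedence implemented above (an illustrative sketch,
   with X, Y, Z as placeholder names):

     -mcpu=X            -> arch, tuning and ISA flags all come from X.
     -mcpu=X -march=Y   -> ISA flags and arch come from Y, tuning from X;
                           a warning is issued if X's architecture differs
                           from Y.
     -mcpu=X -mtune=Z   -> ISA flags and arch come from X, tuning from Z.
     (nothing)          -> the configure-time --with-cpu default, or
                           "generic" if none was given. */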
8452
8453 /* Implement targetm.override_options_after_change. */
8454
8455 static void
8456 aarch64_override_options_after_change (void)
8457 {
8458 aarch64_override_options_after_change_1 (&global_options);
8459 }
8460
8461 static struct machine_function *
8462 aarch64_init_machine_status (void)
8463 {
8464 struct machine_function *machine;
8465 machine = ggc_cleared_alloc<machine_function> ();
8466 return machine;
8467 }
8468
8469 void
8470 aarch64_init_expanders (void)
8471 {
8472 init_machine_status = aarch64_init_machine_status;
8473 }
8474
8475 /* Set up the global code model (aarch64_cmodel), taking -fpic/-fPIC into account. */
8476 static void
8477 initialize_aarch64_code_model (struct gcc_options *opts)
8478 {
8479 if (opts->x_flag_pic)
8480 {
8481 switch (opts->x_aarch64_cmodel_var)
8482 {
8483 case AARCH64_CMODEL_TINY:
8484 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8485 break;
8486 case AARCH64_CMODEL_SMALL:
8487 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8488 aarch64_cmodel = (flag_pic == 2
8489 ? AARCH64_CMODEL_SMALL_PIC
8490 : AARCH64_CMODEL_SMALL_SPIC);
8491 #else
8492 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8493 #endif
8494 break;
8495 case AARCH64_CMODEL_LARGE:
8496 sorry ("code model %qs with -f%s", "large",
8497 opts->x_flag_pic > 1 ? "PIC" : "pic");
8498 break;
8499 default:
8500 gcc_unreachable ();
8501 }
8502 }
8503 else
8504 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8505 }
8506
8507 /* Implement TARGET_OPTION_SAVE. */
8508
8509 static void
8510 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8511 {
8512 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8513 }
8514
8515 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8516 using the information saved in PTR. */
8517
8518 static void
8519 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8520 {
8521 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8522 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8523 opts->x_explicit_arch = ptr->x_explicit_arch;
8524 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8525 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8526
8527 aarch64_override_options_internal (opts);
8528 }
8529
8530 /* Implement TARGET_OPTION_PRINT. */
8531
8532 static void
8533 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8534 {
8535 const struct processor *cpu
8536 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8537 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8538 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8539 std::string extension
8540 = aarch64_get_extension_string_for_isa_flags (isa_flags);
8541
8542 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8543 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8544 arch->name, extension.c_str ());
8545 }
8546
8547 static GTY(()) tree aarch64_previous_fndecl;
8548
8549 void
8550 aarch64_reset_previous_fndecl (void)
8551 {
8552 aarch64_previous_fndecl = NULL;
8553 }
8554
8555 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8556 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8557 of the function, if such exists. This function may be called multiple
8558 times on a single function so use aarch64_previous_fndecl to avoid
8559 setting up identical state. */
8560
8561 static void
8562 aarch64_set_current_function (tree fndecl)
8563 {
8564 tree old_tree = (aarch64_previous_fndecl
8565 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8566 : NULL_TREE);
8567
8568 tree new_tree = (fndecl
8569 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
8570 : NULL_TREE);
8571
8572
8573 if (fndecl && fndecl != aarch64_previous_fndecl)
8574 {
8575 aarch64_previous_fndecl = fndecl;
8576 if (old_tree == new_tree)
8577 ;
8578
8579 else if (new_tree && new_tree != target_option_default_node)
8580 {
8581 cl_target_option_restore (&global_options,
8582 TREE_TARGET_OPTION (new_tree));
8583 if (TREE_TARGET_GLOBALS (new_tree))
8584 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8585 else
8586 TREE_TARGET_GLOBALS (new_tree)
8587 = save_target_globals_default_opts ();
8588 }
8589
8590 else if (old_tree && old_tree != target_option_default_node)
8591 {
8592 new_tree = target_option_current_node;
8593 cl_target_option_restore (&global_options,
8594 TREE_TARGET_OPTION (new_tree));
8595 if (TREE_TARGET_GLOBALS (new_tree))
8596 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8597 else if (new_tree == target_option_default_node)
8598 restore_target_globals (&default_target_globals);
8599 else
8600 TREE_TARGET_GLOBALS (new_tree)
8601 = save_target_globals_default_opts ();
8602 }
8603 }
8604
8605 if (!fndecl)
8606 return;
8607
8608 /* If we turned on SIMD make sure that any vector parameters are re-laid out
8609 so that they use proper vector modes. */
8610 if (TARGET_SIMD)
8611 {
8612 tree parms = DECL_ARGUMENTS (fndecl);
8613 for (; parms && parms != void_list_node; parms = TREE_CHAIN (parms))
8614 {
8615 if (TREE_CODE (parms) == PARM_DECL
8616 && VECTOR_TYPE_P (TREE_TYPE (parms))
8617 && DECL_MODE (parms) != TYPE_MODE (TREE_TYPE (parms)))
8618 relayout_decl (parms);
8619 }
8620 }
8621 }
8622
8623 /* Enum describing the various ways we can handle attributes.
8624 In many cases we can reuse the generic option handling machinery. */
8625
8626 enum aarch64_attr_opt_type
8627 {
8628 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8629 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8630 aarch64_attr_enum, /* Attribute sets an enum variable. */
8631 aarch64_attr_custom /* Attribute requires a custom handling function. */
8632 };
8633
8634 /* All the information needed to handle a target attribute.
8635 NAME is the name of the attribute.
8636 ATTR_TYPE specifies the type of behavior of the attribute as described
8637 in the definition of enum aarch64_attr_opt_type.
8638 ALLOW_NEG is true if the attribute supports a "no-" form.
8639 HANDLER is the function that takes the attribute string and whether
8640 it is a pragma or attribute and handles the option. It is needed only
8641 when the ATTR_TYPE is aarch64_attr_custom.
8642 OPT_NUM is the enum specifying the option that the attribute modifies.
8643 This is needed for attributes that mirror the behavior of a command-line
8644 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8645 aarch64_attr_enum. */
8646
8647 struct aarch64_attribute_info
8648 {
8649 const char *name;
8650 enum aarch64_attr_opt_type attr_type;
8651 bool allow_neg;
8652 bool (*handler) (const char *, const char *);
8653 enum opt_code opt_num;
8654 };
8655
8656 /* Handle the ARCH_STR argument to the arch= target attribute.
8657 PRAGMA_OR_ATTR is used in potential error messages. */
8658
8659 static bool
8660 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8661 {
8662 const struct processor *tmp_arch = NULL;
8663 enum aarch64_parse_opt_result parse_res
8664 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8665
8666 if (parse_res == AARCH64_PARSE_OK)
8667 {
8668 gcc_assert (tmp_arch);
8669 selected_arch = tmp_arch;
8670 explicit_arch = selected_arch->arch;
8671 return true;
8672 }
8673
8674 switch (parse_res)
8675 {
8676 case AARCH64_PARSE_MISSING_ARG:
8677 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8678 break;
8679 case AARCH64_PARSE_INVALID_ARG:
8680 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8681 break;
8682 case AARCH64_PARSE_INVALID_FEATURE:
8683 error ("invalid feature modifier %qs for 'arch' target %s",
8684 str, pragma_or_attr);
8685 break;
8686 default:
8687 gcc_unreachable ();
8688 }
8689
8690 return false;
8691 }
8692
8693 /* Handle the argument CPU_STR to the cpu= target attribute.
8694 PRAGMA_OR_ATTR is used in potential error messages. */
8695
8696 static bool
8697 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8698 {
8699 const struct processor *tmp_cpu = NULL;
8700 enum aarch64_parse_opt_result parse_res
8701 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8702
8703 if (parse_res == AARCH64_PARSE_OK)
8704 {
8705 gcc_assert (tmp_cpu);
8706 selected_tune = tmp_cpu;
8707 explicit_tune_core = selected_tune->ident;
8708
8709 selected_arch = &all_architectures[tmp_cpu->arch];
8710 explicit_arch = selected_arch->arch;
8711 return true;
8712 }
8713
8714 switch (parse_res)
8715 {
8716 case AARCH64_PARSE_MISSING_ARG:
8717 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8718 break;
8719 case AARCH64_PARSE_INVALID_ARG:
8720 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8721 break;
8722 case AARCH64_PARSE_INVALID_FEATURE:
8723 error ("invalid feature modifier %qs for 'cpu' target %s",
8724 str, pragma_or_attr);
8725 break;
8726 default:
8727 gcc_unreachable ();
8728 }
8729
8730 return false;
8731 }
8732
8733 /* Handle the argument STR to the tune= target attribute.
8734 PRAGMA_OR_ATTR is used in potential error messages. */
8735
8736 static bool
8737 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8738 {
8739 const struct processor *tmp_tune = NULL;
8740 enum aarch64_parse_opt_result parse_res
8741 = aarch64_parse_tune (str, &tmp_tune);
8742
8743 if (parse_res == AARCH64_PARSE_OK)
8744 {
8745 gcc_assert (tmp_tune);
8746 selected_tune = tmp_tune;
8747 explicit_tune_core = selected_tune->ident;
8748 return true;
8749 }
8750
8751 switch (parse_res)
8752 {
8753 case AARCH64_PARSE_INVALID_ARG:
8754 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8755 break;
8756 default:
8757 gcc_unreachable ();
8758 }
8759
8760 return false;
8761 }
8762
8763 /* Parse an architecture extensions target attribute string specified in STR.
8764 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8765 if successful. Update aarch64_isa_flags to reflect the ISA features
8766 modified.
8767 PRAGMA_OR_ATTR is used in potential error messages. */
8768
8769 static bool
8770 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8771 {
8772 enum aarch64_parse_opt_result parse_res;
8773 unsigned long isa_flags = aarch64_isa_flags;
8774
8775 /* We allow "+nothing" in the beginning to clear out all architectural
8776 features if the user wants to handpick specific features. */
8777 if (strncmp ("+nothing", str, 8) == 0)
8778 {
8779 isa_flags = 0;
8780 str += 8;
8781 }
8782
8783 parse_res = aarch64_parse_extension (str, &isa_flags);
8784
8785 if (parse_res == AARCH64_PARSE_OK)
8786 {
8787 aarch64_isa_flags = isa_flags;
8788 return true;
8789 }
8790
8791 switch (parse_res)
8792 {
8793 case AARCH64_PARSE_MISSING_ARG:
8794 error ("missing feature modifier in target %s %qs",
8795 pragma_or_attr, str);
8796 break;
8797
8798 case AARCH64_PARSE_INVALID_FEATURE:
8799 error ("invalid feature modifier in target %s %qs",
8800 pragma_or_attr, str);
8801 break;
8802
8803 default:
8804 gcc_unreachable ();
8805 }
8806
8807 return false;
8808 }
8809
8810 /* The target attributes that we support. On top of these we also support just
8811 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8812 handled explicitly in aarch64_process_one_target_attr. */
8813
8814 static const struct aarch64_attribute_info aarch64_attributes[] =
8815 {
8816 { "general-regs-only", aarch64_attr_mask, false, NULL,
8817 OPT_mgeneral_regs_only },
8818 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8819 OPT_mfix_cortex_a53_835769 },
8820 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8821 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8822 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8823 OPT_momit_leaf_frame_pointer },
8824 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8825 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8826 OPT_march_ },
8827 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8828 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8829 OPT_mtune_ },
8830 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8831 };
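
/* Example uses of the entries above (illustrative only):

     __attribute__ ((target ("general-regs-only")))
     __attribute__ ((target ("no-fix-cortex-a53-835769")))   - allow_neg form
     __attribute__ ((target ("cmodel=small")))               - enum, needs an argument
     __attribute__ ((target ("arch=armv8-a+crc")))           - custom handler

   plus the bare ISA-extension form ("+crc", "+nothing+fp", ...) mentioned
   in the comment above the table. */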
8832
8833 /* Parse ARG_STR which contains the definition of one target attribute.
8834 Show appropriate errors if any or return true if the attribute is valid.
8835 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8836 we're processing a target attribute or pragma. */
8837
8838 static bool
8839 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8840 {
8841 bool invert = false;
8842
8843 size_t len = strlen (arg_str);
8844
8845 if (len == 0)
8846 {
8847 error ("malformed target %s", pragma_or_attr);
8848 return false;
8849 }
8850
8851 char *str_to_check = (char *) alloca (len + 1);
8852 strcpy (str_to_check, arg_str);
8853
8854 /* Skip leading whitespace. */
8855 while (*str_to_check == ' ' || *str_to_check == '\t')
8856 str_to_check++;
8857
8858 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8859 It is easier to detect and handle it explicitly here rather than going
8860 through the machinery for the rest of the target attributes in this
8861 function. */
8862 if (*str_to_check == '+')
8863 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8864
8865 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8866 {
8867 invert = true;
8868 str_to_check += 3;
8869 }
8870 char *arg = strchr (str_to_check, '=');
8871
8872 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8873 and point ARG to "foo". */
8874 if (arg)
8875 {
8876 *arg = '\0';
8877 arg++;
8878 }
8879 const struct aarch64_attribute_info *p_attr;
8880 bool found = false;
8881 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8882 {
8883 /* If the names don't match up, or the user has given an argument
8884 to an attribute that doesn't accept one, or didn't give an argument
8885 to an attribute that expects one, fail to match. */
8886 if (strcmp (str_to_check, p_attr->name) != 0)
8887 continue;
8888
8889 found = true;
8890 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8891 || p_attr->attr_type == aarch64_attr_enum;
8892
8893 if (attr_need_arg_p ^ (arg != NULL))
8894 {
8895 error ("target %s %qs does not accept an argument",
8896 pragma_or_attr, str_to_check);
8897 return false;
8898 }
8899
8900 /* If the name matches but the attribute does not allow "no-" versions
8901 then we can't match. */
8902 if (invert && !p_attr->allow_neg)
8903 {
8904 error ("target %s %qs does not allow a negated form",
8905 pragma_or_attr, str_to_check);
8906 return false;
8907 }
8908
8909 switch (p_attr->attr_type)
8910 {
8911 /* Has a custom handler registered.
8912 For example, cpu=, arch=, tune=. */
8913 case aarch64_attr_custom:
8914 gcc_assert (p_attr->handler);
8915 if (!p_attr->handler (arg, pragma_or_attr))
8916 return false;
8917 break;
8918
8919 /* Either set or unset a boolean option. */
8920 case aarch64_attr_bool:
8921 {
8922 struct cl_decoded_option decoded;
8923
8924 generate_option (p_attr->opt_num, NULL, !invert,
8925 CL_TARGET, &decoded);
8926 aarch64_handle_option (&global_options, &global_options_set,
8927 &decoded, input_location);
8928 break;
8929 }
8930 /* Set or unset a bit in the target_flags. aarch64_handle_option
8931 should know what mask to apply given the option number. */
8932 case aarch64_attr_mask:
8933 {
8934 struct cl_decoded_option decoded;
8935 /* We only need to specify the option number.
8936 aarch64_handle_option will know which mask to apply. */
8937 decoded.opt_index = p_attr->opt_num;
8938 decoded.value = !invert;
8939 aarch64_handle_option (&global_options, &global_options_set,
8940 &decoded, input_location);
8941 break;
8942 }
8943 /* Use the option setting machinery to set an option to an enum. */
8944 case aarch64_attr_enum:
8945 {
8946 gcc_assert (arg);
8947 bool valid;
8948 int value;
8949 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8950 &value, CL_TARGET);
8951 if (valid)
8952 {
8953 set_option (&global_options, NULL, p_attr->opt_num, value,
8954 NULL, DK_UNSPECIFIED, input_location,
8955 global_dc);
8956 }
8957 else
8958 {
8959 error ("target %s %s=%s is not valid",
8960 pragma_or_attr, str_to_check, arg);
8961 }
8962 break;
8963 }
8964 default:
8965 gcc_unreachable ();
8966 }
8967 }
8968
8969 /* If we reached here we either have found an attribute and validated
8970 it or didn't match any. If we matched an attribute but its arguments
8971 were malformed we will have returned false already. */
8972 return found;
8973 }
8974
8975 /* Count how many times the character C appears in
8976 NULL-terminated string STR. */
8977
8978 static unsigned int
8979 num_occurences_in_str (char c, char *str)
8980 {
8981 unsigned int res = 0;
8982 while (*str != '\0')
8983 {
8984 if (*str == c)
8985 res++;
8986
8987 str++;
8988 }
8989
8990 return res;
8991 }
8992
8993 /* Parse the tree in ARGS that contains the target attribute information
8994 and update the global target options space. PRAGMA_OR_ATTR is a string
8995 to be used in error messages, specifying whether this is processing
8996 a target attribute or a target pragma. */
8997
8998 bool
8999 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9000 {
9001 if (TREE_CODE (args) == TREE_LIST)
9002 {
9003 do
9004 {
9005 tree head = TREE_VALUE (args);
9006 if (head)
9007 {
9008 if (!aarch64_process_target_attr (head, pragma_or_attr))
9009 return false;
9010 }
9011 args = TREE_CHAIN (args);
9012 } while (args);
9013
9014 return true;
9015 }
9016 /* We expect to find a string to parse. */
9017 gcc_assert (TREE_CODE (args) == STRING_CST);
9018
9019 size_t len = strlen (TREE_STRING_POINTER (args));
9020 char *str_to_check = (char *) alloca (len + 1);
9021 strcpy (str_to_check, TREE_STRING_POINTER (args));
9022
9023 if (len == 0)
9024 {
9025 error ("malformed target %s value", pragma_or_attr);
9026 return false;
9027 }
9028
9029 /* Used to catch empty entries between commas, i.e.
9030 attribute ((target ("attr1,,attr2"))). */
9031 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9032
9033 /* Handle multiple target attributes separated by ','. */
9034 char *token = strtok (str_to_check, ",");
9035
9036 unsigned int num_attrs = 0;
9037 while (token)
9038 {
9039 num_attrs++;
9040 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9041 {
9042 error ("target %s %qs is invalid", pragma_or_attr, token);
9043 return false;
9044 }
9045
9046 token = strtok (NULL, ",");
9047 }
9048
9049 if (num_attrs != num_commas + 1)
9050 {
9051 error ("malformed target %s list %qs",
9052 pragma_or_attr, TREE_STRING_POINTER (args));
9053 return false;
9054 }
9055
9056 return true;
9057 }
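
/* Example (attribute names illustrative): for
   __attribute__ ((target ("arch=armv8-a,strict-align"))) the string is
   split at ',' into "arch=armv8-a" and "strict-align", each handled by
   aarch64_process_one_target_attr above. A string like "attr1,,attr2"
   yields two tokens from strtok but contains two commas, so the
   num_attrs != num_commas + 1 check rejects it as a malformed list. */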
9058
9059 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9060 process attribute ((target ("..."))). */
9061
9062 static bool
9063 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9064 {
9065 struct cl_target_option cur_target;
9066 bool ret;
9067 tree old_optimize;
9068 tree new_target, new_optimize;
9069 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9070
9071 /* If what we're processing is the current pragma string then the
9072 target option node is already stored in target_option_current_node
9073 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9074 having to re-parse the string. This is especially useful to keep
9075 arm_neon.h compile times down since that header contains a lot
9076 of intrinsics enclosed in pragmas. */
9077 if (!existing_target && args == current_target_pragma)
9078 {
9079 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9080 return true;
9081 }
9082 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9083
9084 old_optimize = build_optimization_node (&global_options);
9085 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9086
9087 /* If the function changed the optimization levels as well as setting
9088 target options, start with the optimizations specified. */
9089 if (func_optimize && func_optimize != old_optimize)
9090 cl_optimization_restore (&global_options,
9091 TREE_OPTIMIZATION (func_optimize));
9092
9093 /* Save the current target options to restore at the end. */
9094 cl_target_option_save (&cur_target, &global_options);
9095
9096 /* If fndecl already has some target attributes applied to it, unpack
9097 them so that we add this attribute on top of them, rather than
9098 overwriting them. */
9099 if (existing_target)
9100 {
9101 struct cl_target_option *existing_options
9102 = TREE_TARGET_OPTION (existing_target);
9103
9104 if (existing_options)
9105 cl_target_option_restore (&global_options, existing_options);
9106 }
9107 else
9108 cl_target_option_restore (&global_options,
9109 TREE_TARGET_OPTION (target_option_current_node));
9110
9111
9112 ret = aarch64_process_target_attr (args, "attribute");
9113
9114 /* Set up any additional state. */
9115 if (ret)
9116 {
9117 aarch64_override_options_internal (&global_options);
9118 /* Initialize SIMD builtins if we haven't already.
9119 Set current_target_pragma to NULL for the duration so that
9120 the builtin initialization code doesn't try to tag the functions
9121 being built with the attributes specified by any current pragma, thus
9122 going into an infinite recursion. */
9123 if (TARGET_SIMD)
9124 {
9125 tree saved_current_target_pragma = current_target_pragma;
9126 current_target_pragma = NULL;
9127 aarch64_init_simd_builtins ();
9128 current_target_pragma = saved_current_target_pragma;
9129 }
9130 new_target = build_target_option_node (&global_options);
9131 }
9132 else
9133 new_target = NULL;
9134
9135 new_optimize = build_optimization_node (&global_options);
9136
9137 if (fndecl && ret)
9138 {
9139 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9140
9141 if (old_optimize != new_optimize)
9142 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9143 }
9144
9145 cl_target_option_restore (&global_options, &cur_target);
9146
9147 if (old_optimize != new_optimize)
9148 cl_optimization_restore (&global_options,
9149 TREE_OPTIMIZATION (old_optimize));
9150 return ret;
9151 }
9152
9153 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9154 tri-bool options (yes, no, don't care) and the default value is
9155 DEF, determine whether to reject inlining. */
9156
9157 static bool
9158 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9159 int dont_care, int def)
9160 {
9161 /* If the callee doesn't care, always allow inlining. */
9162 if (callee == dont_care)
9163 return true;
9164
9165 /* If the caller doesn't care, always allow inlining. */
9166 if (caller == dont_care)
9167 return true;
9168
9169 /* Otherwise, allow inlining if either the callee and caller values
9170 agree, or if the callee is using the default value. */
9171 return (callee == caller || callee == def);
9172 }
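
/* Truth sketch of the helper above (1 = allow inlining):

     callee == DONT_CARE   -> 1
     caller == DONT_CARE   -> 1
     callee == caller      -> 1
     callee == DEF         -> 1
     otherwise             -> 0

   i.e. inlining is rejected only when both values are explicit, they
   differ, and the callee's value is not the default. */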
9173
9174 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9175 to inline CALLEE into CALLER based on target-specific info.
9176 Make sure that the caller and callee have compatible architectural
9177 features. Then go through the other possible target attributes
9178 and see if they can block inlining. Try not to reject always_inline
9179 callees unless they are incompatible architecturally. */
9180
9181 static bool
9182 aarch64_can_inline_p (tree caller, tree callee)
9183 {
9184 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9185 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9186
9187 /* If callee has no option attributes, then it is ok to inline. */
9188 if (!callee_tree)
9189 return true;
9190
9191 struct cl_target_option *caller_opts
9192 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9193 : target_option_default_node);
9194
9195 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9196
9197
9198 /* Callee's ISA flags should be a subset of the caller's. */
9199 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9200 != callee_opts->x_aarch64_isa_flags)
9201 return false;
9202
9203 /* Allow non-strict-aligned functions to be inlined into strict-aligned
9204 callers, but not the other way around. */
9205 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9206 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9207 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9208 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9209 return false;
9210
9211 bool always_inline = lookup_attribute ("always_inline",
9212 DECL_ATTRIBUTES (callee));
9213
9214 /* If the architectural features match up and the callee is always_inline
9215 then the other attributes don't matter. */
9216 if (always_inline)
9217 return true;
9218
9219 if (caller_opts->x_aarch64_cmodel_var
9220 != callee_opts->x_aarch64_cmodel_var)
9221 return false;
9222
9223 if (caller_opts->x_aarch64_tls_dialect
9224 != callee_opts->x_aarch64_tls_dialect)
9225 return false;
9226
9227 /* Honour explicit requests to work around errata. */
9228 if (!aarch64_tribools_ok_for_inlining_p (
9229 caller_opts->x_aarch64_fix_a53_err835769,
9230 callee_opts->x_aarch64_fix_a53_err835769,
9231 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9232 return false;
9233
9234 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9235 caller and callee and they don't match up, reject inlining. */
9236 if (!aarch64_tribools_ok_for_inlining_p (
9237 caller_opts->x_flag_omit_leaf_frame_pointer,
9238 callee_opts->x_flag_omit_leaf_frame_pointer,
9239 2, 1))
9240 return false;
9241
9242 /* If the callee has specific tuning overrides, respect them. */
9243 if (callee_opts->x_aarch64_override_tune_string != NULL
9244 && caller_opts->x_aarch64_override_tune_string == NULL)
9245 return false;
9246
9247 /* If the user specified tuning override strings for the
9248 caller and callee and they don't match up, reject inlining.
9249 We just do a string compare here, we don't analyze the meaning
9250 of the string, as it would be too costly for little gain. */
9251 if (callee_opts->x_aarch64_override_tune_string
9252 && caller_opts->x_aarch64_override_tune_string
9253 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9254 caller_opts->x_aarch64_override_tune_string) != 0))
9255 return false;
9256
9257 return true;
9258 }
9259
9260 /* Return true if SYMBOL_REF X binds locally. */
9261
9262 static bool
9263 aarch64_symbol_binds_local_p (const_rtx x)
9264 {
9265 return (SYMBOL_REF_DECL (x)
9266 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9267 : SYMBOL_REF_LOCAL_P (x));
9268 }
9269
9270 /* Return true if SYMBOL_REF X is thread-local. */
9271 static bool
9272 aarch64_tls_symbol_p (rtx x)
9273 {
9274 if (! TARGET_HAVE_TLS)
9275 return false;
9276
9277 if (GET_CODE (x) != SYMBOL_REF)
9278 return false;
9279
9280 return SYMBOL_REF_TLS_MODEL (x) != 0;
9281 }
9282
9283 /* Classify a TLS symbol into one of the TLS kinds. */
9284 enum aarch64_symbol_type
9285 aarch64_classify_tls_symbol (rtx x)
9286 {
9287 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9288
9289 switch (tls_kind)
9290 {
9291 case TLS_MODEL_GLOBAL_DYNAMIC:
9292 case TLS_MODEL_LOCAL_DYNAMIC:
9293 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9294
9295 case TLS_MODEL_INITIAL_EXEC:
9296 switch (aarch64_cmodel)
9297 {
9298 case AARCH64_CMODEL_TINY:
9299 case AARCH64_CMODEL_TINY_PIC:
9300 return SYMBOL_TINY_TLSIE;
9301 default:
9302 return SYMBOL_SMALL_TLSIE;
9303 }
9304
9305 case TLS_MODEL_LOCAL_EXEC:
9306 if (aarch64_tls_size == 12)
9307 return SYMBOL_TLSLE12;
9308 else if (aarch64_tls_size == 24)
9309 return SYMBOL_TLSLE24;
9310 else if (aarch64_tls_size == 32)
9311 return SYMBOL_TLSLE32;
9312 else if (aarch64_tls_size == 48)
9313 return SYMBOL_TLSLE48;
9314 else
9315 gcc_unreachable ();
9316
9317 case TLS_MODEL_EMULATED:
9318 case TLS_MODEL_NONE:
9319 return SYMBOL_FORCE_TO_MEM;
9320
9321 default:
9322 gcc_unreachable ();
9323 }
9324 }
9325
9326 /* Return the method that should be used to access SYMBOL_REF or
9327 LABEL_REF X. */
9328
9329 enum aarch64_symbol_type
9330 aarch64_classify_symbol (rtx x, rtx offset)
9331 {
9332 if (GET_CODE (x) == LABEL_REF)
9333 {
9334 switch (aarch64_cmodel)
9335 {
9336 case AARCH64_CMODEL_LARGE:
9337 return SYMBOL_FORCE_TO_MEM;
9338
9339 case AARCH64_CMODEL_TINY_PIC:
9340 case AARCH64_CMODEL_TINY:
9341 return SYMBOL_TINY_ABSOLUTE;
9342
9343 case AARCH64_CMODEL_SMALL_SPIC:
9344 case AARCH64_CMODEL_SMALL_PIC:
9345 case AARCH64_CMODEL_SMALL:
9346 return SYMBOL_SMALL_ABSOLUTE;
9347
9348 default:
9349 gcc_unreachable ();
9350 }
9351 }
9352
9353 if (GET_CODE (x) == SYMBOL_REF)
9354 {
9355 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
9356 {
9357 /* This is fine even in PIC code, as the constant
9358 pool reference is always PC relative and within
9359 the same translation unit. */
9360 if (nopcrelative_literal_loads
9361 && CONSTANT_POOL_ADDRESS_P (x))
9362 return SYMBOL_SMALL_ABSOLUTE;
9363 else
9364 return SYMBOL_FORCE_TO_MEM;
9365 }
9366
9367 if (aarch64_tls_symbol_p (x))
9368 return aarch64_classify_tls_symbol (x);
9369
9370 switch (aarch64_cmodel)
9371 {
9372 case AARCH64_CMODEL_TINY:
9373 /* When we retrieve a symbol + offset address, we have to make sure
9374 the offset does not cause overflow of the final address. But
9375 we have no way of knowing the address of the symbol at compile time,
9376 so we can't accurately say if the distance between the PC and
9377 symbol + offset is outside the addressable range of +/-1M in the
9378 TINY code model. So we rely on images not being greater than
9379 1M, cap the offset at 1M, and anything beyond that will have to
9380 be loaded using an alternative mechanism. */
9381 if (SYMBOL_REF_WEAK (x)
9382 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9383 return SYMBOL_FORCE_TO_MEM;
9384 return SYMBOL_TINY_ABSOLUTE;
9385
9386 case AARCH64_CMODEL_SMALL:
9387 /* Same reasoning as the tiny code model, but the offset cap here is
9388 4G. */
9389 if (SYMBOL_REF_WEAK (x)
9390 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9391 HOST_WIDE_INT_C (4294967264)))
9392 return SYMBOL_FORCE_TO_MEM;
9393 return SYMBOL_SMALL_ABSOLUTE;
9394
9395 case AARCH64_CMODEL_TINY_PIC:
9396 if (!aarch64_symbol_binds_local_p (x))
9397 return SYMBOL_TINY_GOT;
9398 return SYMBOL_TINY_ABSOLUTE;
9399
9400 case AARCH64_CMODEL_SMALL_SPIC:
9401 case AARCH64_CMODEL_SMALL_PIC:
9402 if (!aarch64_symbol_binds_local_p (x))
9403 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9404 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9405 return SYMBOL_SMALL_ABSOLUTE;
9406
9407 default:
9408 gcc_unreachable ();
9409 }
9410 }
9411
9412 /* By default push everything into the constant pool. */
9413 return SYMBOL_FORCE_TO_MEM;
9414 }
9415
9416 bool
9417 aarch64_constant_address_p (rtx x)
9418 {
9419 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9420 }
9421
9422 bool
9423 aarch64_legitimate_pic_operand_p (rtx x)
9424 {
9425 if (GET_CODE (x) == SYMBOL_REF
9426 || (GET_CODE (x) == CONST
9427 && GET_CODE (XEXP (x, 0)) == PLUS
9428 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9429 return false;
9430
9431 return true;
9432 }
9433
9434 /* Return true if X holds either a valid quarter-precision immediate
9435 or a floating-point +0.0 constant. */
9436 static bool
9437 aarch64_valid_floating_const (machine_mode mode, rtx x)
9438 {
9439 if (!CONST_DOUBLE_P (x))
9440 return false;
9441
9442 if (aarch64_float_const_zero_rtx_p (x))
9443 return true;
9444
9445 /* Apart from 0.0 (handled above), we only handle SFmode and DFmode constants. */
9446 if (!(mode == SFmode || mode == DFmode))
9447 return false;
9448
9449 return aarch64_float_const_representable_p (x);
9450 }
9451
9452 static bool
9453 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9454 {
9455 /* Do not allow vector struct mode constants. We could support
9456 0 and -1 easily, but they need support in aarch64-simd.md. */
9457 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9458 return false;
9459
9460 /* This could probably go away because
9461 we now decompose CONST_INTs according to expand_mov_immediate. */
9462 if ((GET_CODE (x) == CONST_VECTOR
9463 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9464 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9465 return !targetm.cannot_force_const_mem (mode, x);
9466
9467 if (GET_CODE (x) == HIGH
9468 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9469 return true;
9470
9471 return aarch64_constant_address_p (x);
9472 }
9473
9474 rtx
9475 aarch64_load_tp (rtx target)
9476 {
9477 if (!target
9478 || GET_MODE (target) != Pmode
9479 || !register_operand (target, Pmode))
9480 target = gen_reg_rtx (Pmode);
9481
9482 /* Can return in any reg. */
9483 emit_insn (gen_aarch64_load_tp_hard (target));
9484 return target;
9485 }
9486
9487 /* On AAPCS systems, this is the "struct __va_list". */
9488 static GTY(()) tree va_list_type;
9489
9490 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9491 Return the type to use as __builtin_va_list.
9492
9493 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9494
9495 struct __va_list
9496 {
9497 void *__stack;
9498 void *__gr_top;
9499 void *__vr_top;
9500 int __gr_offs;
9501 int __vr_offs;
9502 }; */
9503
9504 static tree
9505 aarch64_build_builtin_va_list (void)
9506 {
9507 tree va_list_name;
9508 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9509
9510 /* Create the type. */
9511 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9512 /* Give it the required name. */
9513 va_list_name = build_decl (BUILTINS_LOCATION,
9514 TYPE_DECL,
9515 get_identifier ("__va_list"),
9516 va_list_type);
9517 DECL_ARTIFICIAL (va_list_name) = 1;
9518 TYPE_NAME (va_list_type) = va_list_name;
9519 TYPE_STUB_DECL (va_list_type) = va_list_name;
9520
9521 /* Create the fields. */
9522 f_stack = build_decl (BUILTINS_LOCATION,
9523 FIELD_DECL, get_identifier ("__stack"),
9524 ptr_type_node);
9525 f_grtop = build_decl (BUILTINS_LOCATION,
9526 FIELD_DECL, get_identifier ("__gr_top"),
9527 ptr_type_node);
9528 f_vrtop = build_decl (BUILTINS_LOCATION,
9529 FIELD_DECL, get_identifier ("__vr_top"),
9530 ptr_type_node);
9531 f_groff = build_decl (BUILTINS_LOCATION,
9532 FIELD_DECL, get_identifier ("__gr_offs"),
9533 integer_type_node);
9534 f_vroff = build_decl (BUILTINS_LOCATION,
9535 FIELD_DECL, get_identifier ("__vr_offs"),
9536 integer_type_node);
9537
9538 DECL_ARTIFICIAL (f_stack) = 1;
9539 DECL_ARTIFICIAL (f_grtop) = 1;
9540 DECL_ARTIFICIAL (f_vrtop) = 1;
9541 DECL_ARTIFICIAL (f_groff) = 1;
9542 DECL_ARTIFICIAL (f_vroff) = 1;
9543
9544 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9545 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9546 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9547 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9548 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9549
9550 TYPE_FIELDS (va_list_type) = f_stack;
9551 DECL_CHAIN (f_stack) = f_grtop;
9552 DECL_CHAIN (f_grtop) = f_vrtop;
9553 DECL_CHAIN (f_vrtop) = f_groff;
9554 DECL_CHAIN (f_groff) = f_vroff;
9555
9556 /* Compute its layout. */
9557 layout_type (va_list_type);
9558
9559 return va_list_type;
9560 }
9561
9562 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9563 static void
9564 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9565 {
9566 const CUMULATIVE_ARGS *cum;
9567 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9568 tree stack, grtop, vrtop, groff, vroff;
9569 tree t;
9570 int gr_save_area_size;
9571 int vr_save_area_size;
9572 int vr_offset;
9573
9574 cum = &crtl->args.info;
9575 gr_save_area_size
9576 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
9577 vr_save_area_size
9578 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
9579
9580 if (!TARGET_FLOAT)
9581 {
9582 gcc_assert (cum->aapcs_nvrn == 0);
9583 vr_save_area_size = 0;
9584 }
9585
9586 f_stack = TYPE_FIELDS (va_list_type_node);
9587 f_grtop = DECL_CHAIN (f_stack);
9588 f_vrtop = DECL_CHAIN (f_grtop);
9589 f_groff = DECL_CHAIN (f_vrtop);
9590 f_vroff = DECL_CHAIN (f_groff);
9591
9592 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9593 NULL_TREE);
9594 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9595 NULL_TREE);
9596 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9597 NULL_TREE);
9598 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9599 NULL_TREE);
9600 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9601 NULL_TREE);
9602
9603 /* Emit code to initialize STACK, which points to the next varargs stack
9604 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9605 by named arguments. STACK is 8-byte aligned. */
9606 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9607 if (cum->aapcs_stack_size > 0)
9608 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9609 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9610 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9611
9612 /* Emit code to initialize GRTOP, the top of the GR save area.
9613 virtual_incoming_args_rtx should have been 16-byte aligned. */
9614 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9615 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9616 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9617
9618 /* Emit code to initialize VRTOP, the top of the VR save area.
9619 This address is gr_save_area_bytes below GRTOP, rounded
9620 down to the next 16-byte boundary. */
9621 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9622 vr_offset = ROUND_UP (gr_save_area_size,
9623 STACK_BOUNDARY / BITS_PER_UNIT);
9624
9625 if (vr_offset)
9626 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9627 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9628 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9629
9630 /* Emit code to initialize GROFF, the offset from GRTOP of the
9631 next GPR argument. */
9632 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9633 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9634 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9635
9636 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9637 of the next VR argument. */
9638 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9639 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9640 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9641 }
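
/* In summary, the code above initializes the va_list fields as:

     __stack   = virtual incoming args + aapcs_stack_size * UNITS_PER_WORD
     __gr_top  = virtual incoming args (16-byte aligned)
     __vr_top  = __gr_top - ROUND_UP (gr_save_area_size,
                                      STACK_BOUNDARY / BITS_PER_UNIT)
     __gr_offs = -gr_save_area_size
     __vr_offs = -vr_save_area_size

   The va_arg expansion below then steps __gr_offs/__vr_offs towards zero
   as arguments are consumed from the corresponding save areas. */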
9642
9643 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9644
9645 static tree
9646 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9647 gimple_seq *post_p ATTRIBUTE_UNUSED)
9648 {
9649 tree addr;
9650 bool indirect_p;
9651 bool is_ha; /* is HFA or HVA. */
9652 bool dw_align; /* double-word align. */
9653 machine_mode ag_mode = VOIDmode;
9654 int nregs;
9655 machine_mode mode;
9656
9657 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9658 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9659 HOST_WIDE_INT size, rsize, adjust, align;
9660 tree t, u, cond1, cond2;
9661
9662 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9663 if (indirect_p)
9664 type = build_pointer_type (type);
9665
9666 mode = TYPE_MODE (type);
9667
9668 f_stack = TYPE_FIELDS (va_list_type_node);
9669 f_grtop = DECL_CHAIN (f_stack);
9670 f_vrtop = DECL_CHAIN (f_grtop);
9671 f_groff = DECL_CHAIN (f_vrtop);
9672 f_vroff = DECL_CHAIN (f_groff);
9673
9674 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9675 f_stack, NULL_TREE);
9676 size = int_size_in_bytes (type);
9677 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9678
9679 dw_align = false;
9680 adjust = 0;
9681 if (aarch64_vfp_is_call_or_return_candidate (mode,
9682 type,
9683 &ag_mode,
9684 &nregs,
9685 &is_ha))
9686 {
9687 /* TYPE passed in fp/simd registers. */
9688 if (!TARGET_FLOAT)
9689 aarch64_err_no_fpadvsimd (mode, "varargs");
9690
9691 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9692 unshare_expr (valist), f_vrtop, NULL_TREE);
9693 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9694 unshare_expr (valist), f_vroff, NULL_TREE);
9695
9696 rsize = nregs * UNITS_PER_VREG;
9697
9698 if (is_ha)
9699 {
9700 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9701 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9702 }
9703 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9704 && size < UNITS_PER_VREG)
9705 {
9706 adjust = UNITS_PER_VREG - size;
9707 }
9708 }
9709 else
9710 {
9711 /* TYPE passed in general registers. */
9712 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9713 unshare_expr (valist), f_grtop, NULL_TREE);
9714 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9715 unshare_expr (valist), f_groff, NULL_TREE);
9716 rsize = ROUND_UP (size, UNITS_PER_WORD);
9717 nregs = rsize / UNITS_PER_WORD;
9718
9719 if (align > 8)
9720 dw_align = true;
9721
9722 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9723 && size < UNITS_PER_WORD)
9724 {
9725 adjust = UNITS_PER_WORD - size;
9726 }
9727 }
9728
9729 /* Get a local temporary for the field value. */
9730 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9731
9732 /* Emit code to branch if off >= 0. */
9733 t = build2 (GE_EXPR, boolean_type_node, off,
9734 build_int_cst (TREE_TYPE (off), 0));
9735 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9736
9737 if (dw_align)
9738 {
9739 /* Emit: offs = (offs + 15) & -16. */
9740 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9741 build_int_cst (TREE_TYPE (off), 15));
9742 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9743 build_int_cst (TREE_TYPE (off), -16));
9744 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9745 }
9746 else
9747 roundup = NULL;
9748
9749 /* Update ap.__[g|v]r_offs */
9750 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9751 build_int_cst (TREE_TYPE (off), rsize));
9752 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9753
9754 /* String up. */
9755 if (roundup)
9756 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9757
9758 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9759 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9760 build_int_cst (TREE_TYPE (f_off), 0));
9761 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9762
9763 /* String up: make sure the assignment happens before the use. */
9764 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9765 COND_EXPR_ELSE (cond1) = t;
9766
9767 /* Prepare the trees handling the argument that is passed on the stack;
9768 the top-level node will be stored in ON_STACK. */
9769 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9770 if (align > 8)
9771 {
9772 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9773 t = fold_convert (intDI_type_node, arg);
9774 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9775 build_int_cst (TREE_TYPE (t), 15));
9776 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9777 build_int_cst (TREE_TYPE (t), -16));
9778 t = fold_convert (TREE_TYPE (arg), t);
9779 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9780 }
9781 else
9782 roundup = NULL;
9783 /* Advance ap.__stack */
9784 t = fold_convert (intDI_type_node, arg);
9785 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9786 build_int_cst (TREE_TYPE (t), size + 7));
9787 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9788 build_int_cst (TREE_TYPE (t), -8));
9789 t = fold_convert (TREE_TYPE (arg), t);
9790 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9791 /* String up roundup and advance. */
9792 if (roundup)
9793 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9794 /* String up with arg */
9795 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9796 /* Big-endianness related address adjustment. */
9797 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9798 && size < UNITS_PER_WORD)
9799 {
9800 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9801 size_int (UNITS_PER_WORD - size));
9802 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9803 }
9804
9805 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9806 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9807
9808 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9809 t = off;
9810 if (adjust)
9811 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9812 build_int_cst (TREE_TYPE (off), adjust));
9813
9814 t = fold_convert (sizetype, t);
9815 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9816
9817 if (is_ha)
9818 {
9819 /* type ha; // treat as "struct {ftype field[n];}"
9820 ... [computing offs]
9821 for (i = 0; i <nregs; ++i, offs += 16)
9822 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9823 return ha; */
9824 int i;
9825 tree tmp_ha, field_t, field_ptr_t;
9826
9827 /* Declare a local variable. */
9828 tmp_ha = create_tmp_var_raw (type, "ha");
9829 gimple_add_tmp_var (tmp_ha);
9830
9831 /* Establish the base type. */
9832 switch (ag_mode)
9833 {
9834 case SFmode:
9835 field_t = float_type_node;
9836 field_ptr_t = float_ptr_type_node;
9837 break;
9838 case DFmode:
9839 field_t = double_type_node;
9840 field_ptr_t = double_ptr_type_node;
9841 break;
9842 case TFmode:
9843 field_t = long_double_type_node;
9844 field_ptr_t = long_double_ptr_type_node;
9845 break;
9846 /* Half-precision and quad-precision floats are not fully supported yet.
9847 Enable the following code once that support is complete; we need to find
9848 the correct type node for __fp16 *. */
9849 #if 0
9850 case HFmode:
9851 field_t = float_type_node;
9852 field_ptr_t = float_ptr_type_node;
9853 break;
9854 #endif
9855 case V2SImode:
9856 case V4SImode:
9857 {
9858 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9859 field_t = build_vector_type_for_mode (innertype, ag_mode);
9860 field_ptr_t = build_pointer_type (field_t);
9861 }
9862 break;
9863 default:
9864 gcc_assert (0);
9865 }
9866
9867 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9868 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9869 addr = t;
9870 t = fold_convert (field_ptr_t, addr);
9871 t = build2 (MODIFY_EXPR, field_t,
9872 build1 (INDIRECT_REF, field_t, tmp_ha),
9873 build1 (INDIRECT_REF, field_t, t));
9874
9875 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9876 for (i = 1; i < nregs; ++i)
9877 {
9878 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9879 u = fold_convert (field_ptr_t, addr);
9880 u = build2 (MODIFY_EXPR, field_t,
9881 build2 (MEM_REF, field_t, tmp_ha,
9882 build_int_cst (field_ptr_t,
9883 (i *
9884 int_size_in_bytes (field_t)))),
9885 build1 (INDIRECT_REF, field_t, u));
9886 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9887 }
9888
9889 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9890 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9891 }
9892
9893 COND_EXPR_ELSE (cond2) = t;
9894 addr = fold_convert (build_pointer_type (type), cond1);
9895 addr = build_va_arg_indirect_ref (addr);
9896
9897 if (indirect_p)
9898 addr = build_va_arg_indirect_ref (addr);
9899
9900 return addr;
9901 }
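
/* A rough pseudo-C sketch of the tree built above for a small integer
   argument (illustrative only; the double-word alignment and big-endian
   adjustments are omitted):

       offs = ap->__gr_offs;
       if (offs >= 0)                      the GR save area was exhausted
         goto on_stack;
       ap->__gr_offs = offs + rsize;       claim the register slot(s)
       if (ap->__gr_offs > 0)              this argument did not fit
         goto on_stack;
       return *(type *) (ap->__gr_top + offs);

     on_stack:
       arg = ap->__stack;
       ap->__stack = (void *) (((intptr_t) arg + size + 7) & -8);
       return *(type *) arg;

   Arguments in FP/SIMD registers follow the same shape using __vr_top and
   __vr_offs.  */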
9902
9903 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9904
9905 static void
9906 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9907 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9908 int no_rtl)
9909 {
9910 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9911 CUMULATIVE_ARGS local_cum;
9912 int gr_saved, vr_saved;
9913
9914 /* The caller has advanced CUM up to, but not beyond, the last named
9915 argument. Advance a local copy of CUM past the last "real" named
9916 argument, to find out how many registers are left over. */
9917 local_cum = *cum;
9918 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
9919
9920 /* Find out how many registers we need to save. */
9921 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9922 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9923
9924 if (!TARGET_FLOAT)
9925 {
9926 gcc_assert (local_cum.aapcs_nvrn == 0);
9927 vr_saved = 0;
9928 }
9929
9930 if (!no_rtl)
9931 {
9932 if (gr_saved > 0)
9933 {
9934 rtx ptr, mem;
9935
9936 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9937 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9938 - gr_saved * UNITS_PER_WORD);
9939 mem = gen_frame_mem (BLKmode, ptr);
9940 set_mem_alias_set (mem, get_varargs_alias_set ());
9941
9942 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9943 mem, gr_saved);
9944 }
9945 if (vr_saved > 0)
9946 {
9947 /* We can't use move_block_from_reg, because it will use
9948 the wrong mode, storing D regs only. */
9949 machine_mode mode = TImode;
9950 int off, i;
9951
9952 /* Set OFF to the offset from virtual_incoming_args_rtx of
9953 the first vector register. The VR save area lies below
9954 the GR one, and is aligned to 16 bytes. */
9955 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
9956 STACK_BOUNDARY / BITS_PER_UNIT);
9957 off -= vr_saved * UNITS_PER_VREG;
9958
9959 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
9960 {
9961 rtx ptr, mem;
9962
9963 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9964 mem = gen_frame_mem (mode, ptr);
9965 set_mem_alias_set (mem, get_varargs_alias_set ());
9966 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
9967 off += UNITS_PER_VREG;
9968 }
9969 }
9970 }
9971
9972 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9973 any complication of having crtl->args.pretend_args_size changed. */
9974 cfun->machine->frame.saved_varargs_size
9975 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
9976 STACK_BOUNDARY / BITS_PER_UNIT)
9977 + vr_saved * UNITS_PER_VREG);
9978 }
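
/* Illustrative layout of the save areas established above (informal sketch):
   the GR save area occupies the gr_saved * 8 bytes immediately below
   virtual_incoming_args_rtx, and the VR save area occupies the next
   vr_saved * 16 bytes below that, with the GR/VR boundary aligned to
   16 bytes.  saved_varargs_size records the combined size so that the
   prologue can allocate the space.  */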
9979
9980 static void
9981 aarch64_conditional_register_usage (void)
9982 {
9983 int i;
9984 if (!TARGET_FLOAT)
9985 {
9986 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9987 {
9988 fixed_regs[i] = 1;
9989 call_used_regs[i] = 1;
9990 }
9991 }
9992 }
9993
9994 /* Walk down the type tree of TYPE counting consecutive base elements.
9995 If *MODEP is VOIDmode, then set it to the first valid floating point
9996 type. If a non-floating point type is found, or if a floating point
9997 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9998 otherwise return the count in the sub-tree. */
9999 static int
10000 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10001 {
10002 machine_mode mode;
10003 HOST_WIDE_INT size;
10004
10005 switch (TREE_CODE (type))
10006 {
10007 case REAL_TYPE:
10008 mode = TYPE_MODE (type);
10009 if (mode != DFmode && mode != SFmode && mode != TFmode)
10010 return -1;
10011
10012 if (*modep == VOIDmode)
10013 *modep = mode;
10014
10015 if (*modep == mode)
10016 return 1;
10017
10018 break;
10019
10020 case COMPLEX_TYPE:
10021 mode = TYPE_MODE (TREE_TYPE (type));
10022 if (mode != DFmode && mode != SFmode && mode != TFmode)
10023 return -1;
10024
10025 if (*modep == VOIDmode)
10026 *modep = mode;
10027
10028 if (*modep == mode)
10029 return 2;
10030
10031 break;
10032
10033 case VECTOR_TYPE:
10034 /* Use V2SImode and V4SImode as representatives of all 64-bit
10035 and 128-bit vector types. */
10036 size = int_size_in_bytes (type);
10037 switch (size)
10038 {
10039 case 8:
10040 mode = V2SImode;
10041 break;
10042 case 16:
10043 mode = V4SImode;
10044 break;
10045 default:
10046 return -1;
10047 }
10048
10049 if (*modep == VOIDmode)
10050 *modep = mode;
10051
10052 /* Vector modes are considered to be opaque: two vectors are
10053 equivalent for the purposes of being homogeneous aggregates
10054 if they are the same size. */
10055 if (*modep == mode)
10056 return 1;
10057
10058 break;
10059
10060 case ARRAY_TYPE:
10061 {
10062 int count;
10063 tree index = TYPE_DOMAIN (type);
10064
10065 /* Can't handle incomplete types nor sizes that are not
10066 fixed. */
10067 if (!COMPLETE_TYPE_P (type)
10068 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10069 return -1;
10070
10071 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10072 if (count == -1
10073 || !index
10074 || !TYPE_MAX_VALUE (index)
10075 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10076 || !TYPE_MIN_VALUE (index)
10077 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10078 || count < 0)
10079 return -1;
10080
10081 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10082 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10083
10084 /* There must be no padding. */
10085 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10086 return -1;
10087
10088 return count;
10089 }
10090
10091 case RECORD_TYPE:
10092 {
10093 int count = 0;
10094 int sub_count;
10095 tree field;
10096
10097 /* Can't handle incomplete types nor sizes that are not
10098 fixed. */
10099 if (!COMPLETE_TYPE_P (type)
10100 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10101 return -1;
10102
10103 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10104 {
10105 if (TREE_CODE (field) != FIELD_DECL)
10106 continue;
10107
10108 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10109 if (sub_count < 0)
10110 return -1;
10111 count += sub_count;
10112 }
10113
10114 /* There must be no padding. */
10115 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10116 return -1;
10117
10118 return count;
10119 }
10120
10121 case UNION_TYPE:
10122 case QUAL_UNION_TYPE:
10123 {
10124 /* These aren't very interesting except in a degenerate case. */
10125 int count = 0;
10126 int sub_count;
10127 tree field;
10128
10129 /* Can't handle incomplete types nor sizes that are not
10130 fixed. */
10131 if (!COMPLETE_TYPE_P (type)
10132 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10133 return -1;
10134
10135 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10136 {
10137 if (TREE_CODE (field) != FIELD_DECL)
10138 continue;
10139
10140 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10141 if (sub_count < 0)
10142 return -1;
10143 count = count > sub_count ? count : sub_count;
10144 }
10145
10146 /* There must be no padding. */
10147 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10148 return -1;
10149
10150 return count;
10151 }
10152
10153 default:
10154 break;
10155 }
10156
10157 return -1;
10158 }
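
/* A few informal examples of the classification above (illustrative only):

       struct { float x, y, z; }          ->  3, *modep == SFmode
       struct { double r; double i[3]; }  ->  4, *modep == DFmode
       _Complex double                    ->  2, *modep == DFmode
       struct { float32x4_t a, b; }       ->  2, *modep == V4SImode
       struct { float f; double d; }      -> -1 (mixed base types)
       struct { float f; int i; }         -> -1 (non-FP member)  */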
10159
10160 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10161 type as described in AAPCS64 \S 4.1.2.
10162
10163 See the comment above aarch64_composite_type_p for the notes on MODE. */
10164
10165 static bool
10166 aarch64_short_vector_p (const_tree type,
10167 machine_mode mode)
10168 {
10169 HOST_WIDE_INT size = -1;
10170
10171 if (type && TREE_CODE (type) == VECTOR_TYPE)
10172 size = int_size_in_bytes (type);
10173 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10174 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10175 size = GET_MODE_SIZE (mode);
10176
10177 return (size == 8 || size == 16);
10178 }
10179
10180 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10181 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10182 array types. The C99 floating-point complex types are also considered
10183 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10184 types, which are GCC extensions and out of the scope of AAPCS64, are
10185 treated as composite types here as well.
10186
10187 Note that MODE itself is not sufficient in determining whether a type
10188 is such a composite type or not. This is because
10189 stor-layout.c:compute_record_mode may have already changed the MODE
10190 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10191 structure with only one field may have its MODE set to the mode of the
10192 field. Also an integer mode whose size matches the size of the
10193 RECORD_TYPE type may be used to substitute the original mode
10194 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10195 solely relied on. */
10196
10197 static bool
10198 aarch64_composite_type_p (const_tree type,
10199 machine_mode mode)
10200 {
10201 if (aarch64_short_vector_p (type, mode))
10202 return false;
10203
10204 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10205 return true;
10206
10207 if (mode == BLKmode
10208 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10209 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10210 return true;
10211
10212 return false;
10213 }
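
/* For example (illustrative): int32x4_t is a short vector and therefore not
   composite; struct { double x, y; } and _Complex float are composite; a
   plain double is neither.  */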
10214
10215 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10216 shall be passed or returned in simd/fp register(s) (providing these
10217 parameter passing registers are available).
10218
10219 Upon successful return, *COUNT returns the number of needed registers,
10220 *BASE_MODE returns the mode of the individual register and when IS_HA
10221 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10222 floating-point aggregate or a homogeneous short-vector aggregate. */
10223
10224 static bool
10225 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10226 const_tree type,
10227 machine_mode *base_mode,
10228 int *count,
10229 bool *is_ha)
10230 {
10231 machine_mode new_mode = VOIDmode;
10232 bool composite_p = aarch64_composite_type_p (type, mode);
10233
10234 if (is_ha != NULL) *is_ha = false;
10235
10236 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10237 || aarch64_short_vector_p (type, mode))
10238 {
10239 *count = 1;
10240 new_mode = mode;
10241 }
10242 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10243 {
10244 if (is_ha != NULL) *is_ha = true;
10245 *count = 2;
10246 new_mode = GET_MODE_INNER (mode);
10247 }
10248 else if (type && composite_p)
10249 {
10250 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10251
10252 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10253 {
10254 if (is_ha != NULL) *is_ha = true;
10255 *count = ag_count;
10256 }
10257 else
10258 return false;
10259 }
10260 else
10261 return false;
10262
10263 *base_mode = new_mode;
10264 return true;
10265 }
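
/* Some informal examples of the outcome (illustrative only):

       double                       -> true,  *count == 1, *base_mode == DFmode
       _Complex float               -> true,  *count == 2, *base_mode == SFmode
       struct { double d[4]; }      -> true,  *count == 4, *is_ha == true
       struct { float f[5]; }       -> false (more than HA_MAX_NUM_FLDS fields)
       struct { double d; int i; }  -> false (not homogeneous)  */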
10266
10267 /* Implement TARGET_STRUCT_VALUE_RTX. */
10268
10269 static rtx
10270 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10271 int incoming ATTRIBUTE_UNUSED)
10272 {
10273 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10274 }
10275
10276 /* Implements target hook vector_mode_supported_p. */
10277 static bool
10278 aarch64_vector_mode_supported_p (machine_mode mode)
10279 {
10280 if (TARGET_SIMD
10281 && (mode == V4SImode || mode == V8HImode
10282 || mode == V16QImode || mode == V2DImode
10283 || mode == V2SImode || mode == V4HImode
10284 || mode == V8QImode || mode == V2SFmode
10285 || mode == V4SFmode || mode == V2DFmode
10286 || mode == V4HFmode || mode == V8HFmode
10287 || mode == V1DFmode))
10288 return true;
10289
10290 return false;
10291 }
10292
10293 /* Return appropriate SIMD container
10294 for MODE within a vector of WIDTH bits. */
10295 static machine_mode
10296 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10297 {
10298 gcc_assert (width == 64 || width == 128);
10299 if (TARGET_SIMD)
10300 {
10301 if (width == 128)
10302 switch (mode)
10303 {
10304 case DFmode:
10305 return V2DFmode;
10306 case SFmode:
10307 return V4SFmode;
10308 case SImode:
10309 return V4SImode;
10310 case HImode:
10311 return V8HImode;
10312 case QImode:
10313 return V16QImode;
10314 case DImode:
10315 return V2DImode;
10316 default:
10317 break;
10318 }
10319 else
10320 switch (mode)
10321 {
10322 case SFmode:
10323 return V2SFmode;
10324 case SImode:
10325 return V2SImode;
10326 case HImode:
10327 return V4HImode;
10328 case QImode:
10329 return V8QImode;
10330 default:
10331 break;
10332 }
10333 }
10334 return word_mode;
10335 }
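
/* E.g. aarch64_simd_container_mode (SFmode, 128) is V4SFmode and
   aarch64_simd_container_mode (HImode, 64) is V4HImode; modes without a
   matching container (or !TARGET_SIMD) fall back to word_mode.  */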
10336
10337 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10338 static machine_mode
10339 aarch64_preferred_simd_mode (machine_mode mode)
10340 {
10341 return aarch64_simd_container_mode (mode, 128);
10342 }
10343
10344 /* Return the bitmask of possible vector sizes for the vectorizer
10345 to iterate over. */
10346 static unsigned int
10347 aarch64_autovectorize_vector_sizes (void)
10348 {
10349 return (16 | 8);
10350 }
10351
10352 /* Implement TARGET_MANGLE_TYPE. */
10353
10354 static const char *
10355 aarch64_mangle_type (const_tree type)
10356 {
10357 /* The AArch64 ABI documents say that "__va_list" has to be
10358 mangled as if it is in the "std" namespace. */
10359 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10360 return "St9__va_list";
10361
10362 /* Half-precision float. */
10363 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10364 return "Dh";
10365
10366 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10367 builtin types. */
10368 if (TYPE_NAME (type) != NULL)
10369 return aarch64_mangle_builtin_type (type);
10370
10371 /* Use the default mangling. */
10372 return NULL;
10373 }
10374
10375
10376 /* Return true if the rtx_insn contains a MEM RTX somewhere
10377 in it. */
10378
10379 static bool
10380 has_memory_op (rtx_insn *mem_insn)
10381 {
10382 subrtx_iterator::array_type array;
10383 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10384 if (MEM_P (*iter))
10385 return true;
10386
10387 return false;
10388 }
10389
10390 /* Find the first rtx_insn before insn that will generate an assembly
10391 instruction. */
10392
10393 static rtx_insn *
10394 aarch64_prev_real_insn (rtx_insn *insn)
10395 {
10396 if (!insn)
10397 return NULL;
10398
10399 do
10400 {
10401 insn = prev_real_insn (insn);
10402 }
10403 while (insn && recog_memoized (insn) < 0);
10404
10405 return insn;
10406 }
10407
10408 static bool
10409 is_madd_op (enum attr_type t1)
10410 {
10411 unsigned int i;
10412 /* A number of these may be AArch32 only. */
10413 enum attr_type mlatypes[] = {
10414 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10415 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10416 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10417 };
10418
10419 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10420 {
10421 if (t1 == mlatypes[i])
10422 return true;
10423 }
10424
10425 return false;
10426 }
10427
10428 /* Check if there is a register dependency between a load and the insn
10429 for which we hold recog_data. */
10430
10431 static bool
10432 dep_between_memop_and_curr (rtx memop)
10433 {
10434 rtx load_reg;
10435 int opno;
10436
10437 gcc_assert (GET_CODE (memop) == SET);
10438
10439 if (!REG_P (SET_DEST (memop)))
10440 return false;
10441
10442 load_reg = SET_DEST (memop);
10443 for (opno = 1; opno < recog_data.n_operands; opno++)
10444 {
10445 rtx operand = recog_data.operand[opno];
10446 if (REG_P (operand)
10447 && reg_overlap_mentioned_p (load_reg, operand))
10448 return true;
10449
10450 }
10451 return false;
10452 }
10453
10454
10455 /* When working around the Cortex-A53 erratum 835769,
10456 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10457 instruction and has a preceding memory instruction such that a NOP
10458 should be inserted between them. */
10459
10460 bool
10461 aarch64_madd_needs_nop (rtx_insn* insn)
10462 {
10463 enum attr_type attr_type;
10464 rtx_insn *prev;
10465 rtx body;
10466
10467 if (!TARGET_FIX_ERR_A53_835769)
10468 return false;
10469
10470 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10471 return false;
10472
10473 attr_type = get_attr_type (insn);
10474 if (!is_madd_op (attr_type))
10475 return false;
10476
10477 prev = aarch64_prev_real_insn (insn);
10478 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10479 Restore recog state to INSN to avoid state corruption. */
10480 extract_constrain_insn_cached (insn);
10481
10482 if (!prev || !has_memory_op (prev))
10483 return false;
10484
10485 body = single_set (prev);
10486
10487 /* If the previous insn is a memory op and there is no dependency between
10488 it and the DImode madd, emit a NOP between them. If body is NULL then we
10489 have a complex memory operation, probably a load/store pair.
10490 Be conservative for now and emit a NOP. */
10491 if (GET_MODE (recog_data.operand[0]) == DImode
10492 && (!body || !dep_between_memop_and_curr (body)))
10493 return true;
10494
10495 return false;
10496
10497 }
10498
10499
10500 /* Implement FINAL_PRESCAN_INSN. */
10501
10502 void
10503 aarch64_final_prescan_insn (rtx_insn *insn)
10504 {
10505 if (aarch64_madd_needs_nop (insn))
10506 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10507 }
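
/* Illustrative output when the workaround triggers (a sketch; the register
   choices are arbitrary):

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x0

   The inserted nop separates the memory access from the 64-bit
   multiply-accumulate sequence described by Cortex-A53 erratum 835769.  */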
10508
10509
10510 /* Return the equivalent letter for size. */
10511 static char
10512 sizetochar (int size)
10513 {
10514 switch (size)
10515 {
10516 case 64: return 'd';
10517 case 32: return 's';
10518 case 16: return 'h';
10519 case 8 : return 'b';
10520 default: gcc_unreachable ();
10521 }
10522 }
10523
10524 /* Return true iff x is a uniform vector of floating-point
10525 constants, and the constant can be represented in
10526 quarter-precision form. Note that, as aarch64_float_const_representable_p
10527 rejects both +0.0 and -0.0, we will also reject them here. */
10528 static bool
10529 aarch64_vect_float_const_representable_p (rtx x)
10530 {
10531 rtx elt;
10532 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10533 && const_vec_duplicate_p (x, &elt)
10534 && aarch64_float_const_representable_p (elt));
10535 }
10536
10537 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise; if INFO is nonnull and OP is valid, describe the immediate in *INFO. */
10538 bool
10539 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10540 struct simd_immediate_info *info)
10541 {
10542 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10543 matches = 1; \
10544 for (i = 0; i < idx; i += (STRIDE)) \
10545 if (!(TEST)) \
10546 matches = 0; \
10547 if (matches) \
10548 { \
10549 immtype = (CLASS); \
10550 elsize = (ELSIZE); \
10551 eshift = (SHIFT); \
10552 emvn = (NEG); \
10553 break; \
10554 }
10555
10556 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10557 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10558 unsigned char bytes[16];
10559 int immtype = -1, matches;
10560 unsigned int invmask = inverse ? 0xff : 0;
10561 int eshift, emvn;
10562
10563 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10564 {
10565 if (! (aarch64_simd_imm_zero_p (op, mode)
10566 || aarch64_vect_float_const_representable_p (op)))
10567 return false;
10568
10569 if (info)
10570 {
10571 info->value = CONST_VECTOR_ELT (op, 0);
10572 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10573 info->mvn = false;
10574 info->shift = 0;
10575 }
10576
10577 return true;
10578 }
10579
10580 /* Splat vector constant out into a byte vector. */
10581 for (i = 0; i < n_elts; i++)
10582 {
10583 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10584 it must be laid out in the vector register in reverse order. */
10585 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10586 unsigned HOST_WIDE_INT elpart;
10587
10588 gcc_assert (CONST_INT_P (el));
10589 elpart = INTVAL (el);
10590
10591 for (unsigned int byte = 0; byte < innersize; byte++)
10592 {
10593 bytes[idx++] = (elpart & 0xff) ^ invmask;
10594 elpart >>= BITS_PER_UNIT;
10595 }
10596
10597 }
10598
10599 /* Sanity check. */
10600 gcc_assert (idx == GET_MODE_SIZE (mode));
10601
10602 do
10603 {
10604 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10605 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10606
10607 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10608 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10609
10610 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10611 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10612
10613 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10614 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10615
10616 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10617
10618 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10619
10620 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10621 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10622
10623 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10624 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10625
10626 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10627 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10628
10629 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10630 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10631
10632 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10633
10634 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10635
10636 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10637 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10638
10639 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10640 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10641
10642 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10643 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10644
10645 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10646 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10647
10648 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10649
10650 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10651 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10652 }
10653 while (0);
10654
10655 if (immtype == -1)
10656 return false;
10657
10658 if (info)
10659 {
10660 info->element_width = elsize;
10661 info->mvn = emvn != 0;
10662 info->shift = eshift;
10663
10664 unsigned HOST_WIDE_INT imm = 0;
10665
10666 if (immtype >= 12 && immtype <= 15)
10667 info->msl = true;
10668
10669 /* Un-invert bytes of recognized vector, if necessary. */
10670 if (invmask != 0)
10671 for (i = 0; i < idx; i++)
10672 bytes[i] ^= invmask;
10673
10674 if (immtype == 17)
10675 {
10676 /* FIXME: Broken on 32-bit H_W_I hosts. */
10677 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10678
10679 for (i = 0; i < 8; i++)
10680 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10681 << (i * BITS_PER_UNIT);
10682
10683
10684 info->value = GEN_INT (imm);
10685 }
10686 else
10687 {
10688 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10689 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10690
10691 /* Construct 'abcdefgh' because the assembler cannot handle
10692 generic constants. */
10693 if (info->mvn)
10694 imm = ~imm;
10695 imm = (imm >> info->shift) & 0xff;
10696 info->value = GEN_INT (imm);
10697 }
10698 }
10699
10700 return true;
10701 #undef CHECK
10702 }
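
/* Two informal examples (illustrative): a V4SImode vector of four copies of
   0x00ab0000 matches the shifted-byte pattern with element width 32, shift 16
   and value 0xab, so it is a valid MOVI-style immediate; four copies of
   0x12345678 match none of the patterns and are rejected.  */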
10703
10704 /* Check if immediate shift constants are within range. */
10705 bool
10706 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10707 {
10708 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10709 if (left)
10710 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10711 else
10712 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10713 }
10714
10715 /* Return true if X is a uniform vector where all elements
10716 are either the floating-point constant 0.0 or the
10717 integer constant 0. */
10718 bool
10719 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10720 {
10721 return x == CONST0_RTX (mode);
10722 }
10723
10724
10725 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10726 operation of width WIDTH at bit position POS. */
10727
10728 rtx
10729 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10730 {
10731 gcc_assert (CONST_INT_P (width));
10732 gcc_assert (CONST_INT_P (pos));
10733
10734 unsigned HOST_WIDE_INT mask
10735 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10736 return GEN_INT (mask << UINTVAL (pos));
10737 }
10738
10739 bool
10740 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10741 {
10742 HOST_WIDE_INT imm = INTVAL (x);
10743 int i;
10744
10745 for (i = 0; i < 8; i++)
10746 {
10747 unsigned int byte = imm & 0xff;
10748 if (byte != 0xff && byte != 0)
10749 return false;
10750 imm >>= 8;
10751 }
10752
10753 return true;
10754 }
10755
10756 bool
10757 aarch64_mov_operand_p (rtx x, machine_mode mode)
10758 {
10759 if (GET_CODE (x) == HIGH
10760 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10761 return true;
10762
10763 if (CONST_INT_P (x))
10764 return true;
10765
10766 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10767 return true;
10768
10769 return aarch64_classify_symbolic_expression (x)
10770 == SYMBOL_TINY_ABSOLUTE;
10771 }
10772
10773 /* Return a const_int vector of VAL. */
10774 rtx
10775 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10776 {
10777 int nunits = GET_MODE_NUNITS (mode);
10778 rtvec v = rtvec_alloc (nunits);
10779 int i;
10780
10781 for (i = 0; i < nunits; i++)
10782 RTVEC_ELT (v, i) = GEN_INT (val);
10783
10784 return gen_rtx_CONST_VECTOR (mode, v);
10785 }
10786
10787 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10788
10789 bool
10790 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10791 {
10792 machine_mode vmode;
10793
10794 gcc_assert (!VECTOR_MODE_P (mode));
10795 vmode = aarch64_preferred_simd_mode (mode);
10796 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10797 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10798 }
10799
10800 /* Construct and return a PARALLEL RTX vector with elements numbering the
10801 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10802 the vector - from the perspective of the architecture. This does not
10803 line up with GCC's perspective on lane numbers, so we end up with
10804 different masks depending on our target endian-ness. The diagram
10805 below may help. We must draw the distinction when building masks
10806 which select one half of the vector. An instruction selecting
10807 architectural low-lanes for a big-endian target must be described using
10808 a mask selecting GCC high-lanes.
10809
10810 Big-Endian Little-Endian
10811
10812 GCC 0 1 2 3 3 2 1 0
10813 | x | x | x | x | | x | x | x | x |
10814 Architecture 3 2 1 0 3 2 1 0
10815
10816 Low Mask: { 2, 3 } { 0, 1 }
10817 High Mask: { 0, 1 } { 2, 3 }
10818 */
10819
10820 rtx
10821 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10822 {
10823 int nunits = GET_MODE_NUNITS (mode);
10824 rtvec v = rtvec_alloc (nunits / 2);
10825 int high_base = nunits / 2;
10826 int low_base = 0;
10827 int base;
10828 rtx t1;
10829 int i;
10830
10831 if (BYTES_BIG_ENDIAN)
10832 base = high ? low_base : high_base;
10833 else
10834 base = high ? high_base : low_base;
10835
10836 for (i = 0; i < nunits / 2; i++)
10837 RTVEC_ELT (v, i) = GEN_INT (base + i);
10838
10839 t1 = gen_rtx_PARALLEL (mode, v);
10840 return t1;
10841 }
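
/* For instance (illustrative), for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   "High Mask" row of the diagram above.  */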
10842
10843 /* Check OP for validity as a PARALLEL RTX vector with elements
10844 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10845 from the perspective of the architecture. See the diagram above
10846 aarch64_simd_vect_par_cnst_half for more details. */
10847
10848 bool
10849 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10850 bool high)
10851 {
10852 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10853 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10854 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10855 int i = 0;
10856
10857 if (!VECTOR_MODE_P (mode))
10858 return false;
10859
10860 if (count_op != count_ideal)
10861 return false;
10862
10863 for (i = 0; i < count_ideal; i++)
10864 {
10865 rtx elt_op = XVECEXP (op, 0, i);
10866 rtx elt_ideal = XVECEXP (ideal, 0, i);
10867
10868 if (!CONST_INT_P (elt_op)
10869 || INTVAL (elt_ideal) != INTVAL (elt_op))
10870 return false;
10871 }
10872 return true;
10873 }
10874
10875 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10876 HIGH (exclusive). */
10877 void
10878 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10879 const_tree exp)
10880 {
10881 HOST_WIDE_INT lane;
10882 gcc_assert (CONST_INT_P (operand));
10883 lane = INTVAL (operand);
10884
10885 if (lane < low || lane >= high)
10886 {
10887 if (exp)
10888 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10889 else
10890 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10891 }
10892 }
10893
10894 /* Return TRUE if OP is a valid vector addressing mode. */
10895 bool
10896 aarch64_simd_mem_operand_p (rtx op)
10897 {
10898 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10899 || REG_P (XEXP (op, 0)));
10900 }
10901
10902 /* Emit a register copy from operand to operand, taking care not to
10903 early-clobber source registers in the process.
10904
10905 COUNT is the number of components into which the copy needs to be
10906 decomposed. */
10907 void
10908 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10909 unsigned int count)
10910 {
10911 unsigned int i;
10912 int rdest = REGNO (operands[0]);
10913 int rsrc = REGNO (operands[1]);
10914
10915 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10916 || rdest < rsrc)
10917 for (i = 0; i < count; i++)
10918 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10919 gen_rtx_REG (mode, rsrc + i));
10920 else
10921 for (i = 0; i < count; i++)
10922 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10923 gen_rtx_REG (mode, rsrc + count - i - 1));
10924 }
10925
10926 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10927 one of VSTRUCT modes: OI, CI or XI. */
10928 int
10929 aarch64_simd_attr_length_move (rtx_insn *insn)
10930 {
10931 machine_mode mode;
10932
10933 extract_insn_cached (insn);
10934
10935 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10936 {
10937 mode = GET_MODE (recog_data.operand[0]);
10938 switch (mode)
10939 {
10940 case OImode:
10941 return 8;
10942 case CImode:
10943 return 12;
10944 case XImode:
10945 return 16;
10946 default:
10947 gcc_unreachable ();
10948 }
10949 }
10950 return 4;
10951 }
10952
10953 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10954 one of VSTRUCT modes: OI, CI, or XI. */
10955 int
10956 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10957 {
10958 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10959 }
10960
10961 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10962 alignment of a vector to 128 bits. */
10963 static HOST_WIDE_INT
10964 aarch64_simd_vector_alignment (const_tree type)
10965 {
10966 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10967 return MIN (align, 128);
10968 }
10969
10970 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10971 static bool
10972 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10973 {
10974 if (is_packed)
10975 return false;
10976
10977 /* We guarantee alignment for vectors up to 128-bits. */
10978 if (tree_int_cst_compare (TYPE_SIZE (type),
10979 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10980 return false;
10981
10982 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10983 return true;
10984 }
10985
10986 /* If VALS is a vector constant that can be loaded into a register
10987 using DUP, generate instructions to do so and return an RTX to
10988 assign to the register. Otherwise return NULL_RTX. */
10989 static rtx
10990 aarch64_simd_dup_constant (rtx vals)
10991 {
10992 machine_mode mode = GET_MODE (vals);
10993 machine_mode inner_mode = GET_MODE_INNER (mode);
10994 rtx x;
10995
10996 if (!const_vec_duplicate_p (vals, &x))
10997 return NULL_RTX;
10998
10999 /* We can load this constant by using DUP and a constant in a
11000 single ARM register. This will be cheaper than a vector
11001 load. */
11002 x = copy_to_mode_reg (inner_mode, x);
11003 return gen_rtx_VEC_DUPLICATE (mode, x);
11004 }
11005
11006
11007 /* Generate code to load VALS, which is a PARALLEL containing only
11008 constants (for vec_init) or CONST_VECTOR, efficiently into a
11009 register. Returns an RTX to copy into the register, or NULL_RTX
11010 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11011 static rtx
11012 aarch64_simd_make_constant (rtx vals)
11013 {
11014 machine_mode mode = GET_MODE (vals);
11015 rtx const_dup;
11016 rtx const_vec = NULL_RTX;
11017 int n_elts = GET_MODE_NUNITS (mode);
11018 int n_const = 0;
11019 int i;
11020
11021 if (GET_CODE (vals) == CONST_VECTOR)
11022 const_vec = vals;
11023 else if (GET_CODE (vals) == PARALLEL)
11024 {
11025 /* A CONST_VECTOR must contain only CONST_INTs and
11026 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11027 Only store valid constants in a CONST_VECTOR. */
11028 for (i = 0; i < n_elts; ++i)
11029 {
11030 rtx x = XVECEXP (vals, 0, i);
11031 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11032 n_const++;
11033 }
11034 if (n_const == n_elts)
11035 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11036 }
11037 else
11038 gcc_unreachable ();
11039
11040 if (const_vec != NULL_RTX
11041 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11042 /* Load using MOVI/MVNI. */
11043 return const_vec;
11044 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11045 /* Loaded using DUP. */
11046 return const_dup;
11047 else if (const_vec != NULL_RTX)
11048 /* Load from constant pool. We can not take advantage of single-cycle
11049 LD1 because we need a PC-relative addressing mode. */
11050 return const_vec;
11051 else
11052 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11053 We can not construct an initializer. */
11054 return NULL_RTX;
11055 }
11056
11057 /* Expand a vector initialisation sequence, such that TARGET is
11058 initialised to contain VALS. */
11059
11060 void
11061 aarch64_expand_vector_init (rtx target, rtx vals)
11062 {
11063 machine_mode mode = GET_MODE (target);
11064 machine_mode inner_mode = GET_MODE_INNER (mode);
11065 /* The number of vector elements. */
11066 int n_elts = GET_MODE_NUNITS (mode);
11067 /* The number of vector elements which are not constant. */
11068 int n_var = 0;
11069 rtx any_const = NULL_RTX;
11070 /* The first element of vals. */
11071 rtx v0 = XVECEXP (vals, 0, 0);
11072 bool all_same = true;
11073
11074 /* Count the number of variable elements to initialise. */
11075 for (int i = 0; i < n_elts; ++i)
11076 {
11077 rtx x = XVECEXP (vals, 0, i);
11078 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11079 ++n_var;
11080 else
11081 any_const = x;
11082
11083 all_same &= rtx_equal_p (x, v0);
11084 }
11085
11086 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11087 how best to handle this. */
11088 if (n_var == 0)
11089 {
11090 rtx constant = aarch64_simd_make_constant (vals);
11091 if (constant != NULL_RTX)
11092 {
11093 emit_move_insn (target, constant);
11094 return;
11095 }
11096 }
11097
11098 /* Splat a single non-constant element if we can. */
11099 if (all_same)
11100 {
11101 rtx x = copy_to_mode_reg (inner_mode, v0);
11102 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11103 return;
11104 }
11105
11106 /* Initialise a vector which is part-variable. We want to first try
11107 to build those lanes which are constant in the most efficient way we
11108 can. */
11109 if (n_var != n_elts)
11110 {
11111 rtx copy = copy_rtx (vals);
11112
11113 /* Load constant part of vector. We really don't care what goes into the
11114 parts we will overwrite, but we're more likely to be able to load the
11115 constant efficiently if it has fewer, larger, repeating parts
11116 (see aarch64_simd_valid_immediate). */
11117 for (int i = 0; i < n_elts; i++)
11118 {
11119 rtx x = XVECEXP (vals, 0, i);
11120 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11121 continue;
11122 rtx subst = any_const;
11123 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11124 {
11125 /* Look in the copied vector, as more elements are const. */
11126 rtx test = XVECEXP (copy, 0, i ^ bit);
11127 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11128 {
11129 subst = test;
11130 break;
11131 }
11132 }
11133 XVECEXP (copy, 0, i) = subst;
11134 }
11135 aarch64_expand_vector_init (target, copy);
11136 }
11137
11138 /* Insert the variable lanes directly. */
11139
11140 enum insn_code icode = optab_handler (vec_set_optab, mode);
11141 gcc_assert (icode != CODE_FOR_nothing);
11142
11143 for (int i = 0; i < n_elts; i++)
11144 {
11145 rtx x = XVECEXP (vals, 0, i);
11146 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11147 continue;
11148 x = copy_to_mode_reg (inner_mode, x);
11149 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11150 }
11151 }
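
/* An informal example of the partly-variable case (illustrative): for a
   V4SImode initializer { x, 1, 2, 3 } with X in a register, the code above
   first loads the constant vector { 2, 1, 2, 3 } (lane 0 borrows a constant
   from another lane to keep the constant part regular) and then overwrites
   lane 0 with X through the vec_set pattern.  */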
11152
11153 static unsigned HOST_WIDE_INT
11154 aarch64_shift_truncation_mask (machine_mode mode)
11155 {
11156 return
11157 (!SHIFT_COUNT_TRUNCATED
11158 || aarch64_vector_mode_supported_p (mode)
11159 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11160 }
11161
11162 /* Select a format to encode pointers in exception handling data. */
11163 int
11164 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11165 {
11166 int type;
11167 switch (aarch64_cmodel)
11168 {
11169 case AARCH64_CMODEL_TINY:
11170 case AARCH64_CMODEL_TINY_PIC:
11171 case AARCH64_CMODEL_SMALL:
11172 case AARCH64_CMODEL_SMALL_PIC:
11173 case AARCH64_CMODEL_SMALL_SPIC:
11174 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11175 for everything. */
11176 type = DW_EH_PE_sdata4;
11177 break;
11178 default:
11179 /* No assumptions here. 8-byte relocs required. */
11180 type = DW_EH_PE_sdata8;
11181 break;
11182 }
11183 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11184 }
11185
11186 /* The last .arch and .tune assembly strings that we printed. */
11187 static std::string aarch64_last_printed_arch_string;
11188 static std::string aarch64_last_printed_tune_string;
11189
11190 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11191 by the function fndecl. */
11192
11193 void
11194 aarch64_declare_function_name (FILE *stream, const char* name,
11195 tree fndecl)
11196 {
11197 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11198
11199 struct cl_target_option *targ_options;
11200 if (target_parts)
11201 targ_options = TREE_TARGET_OPTION (target_parts);
11202 else
11203 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11204 gcc_assert (targ_options);
11205
11206 const struct processor *this_arch
11207 = aarch64_get_arch (targ_options->x_explicit_arch);
11208
11209 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11210 std::string extension
11211 = aarch64_get_extension_string_for_isa_flags (isa_flags);
11212 /* Only update the assembler .arch string if it is distinct from the last
11213 such string we printed. */
11214 std::string to_print = this_arch->name + extension;
11215 if (to_print != aarch64_last_printed_arch_string)
11216 {
11217 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11218 aarch64_last_printed_arch_string = to_print;
11219 }
11220
11221 /* Print the cpu name we're tuning for in the comments; it might be
11222 useful to readers of the generated asm. Do it only when it changes
11223 from function to function and verbose assembly is requested. */
11224 const struct processor *this_tune
11225 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11226
11227 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11228 {
11229 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11230 this_tune->name);
11231 aarch64_last_printed_tune_string = this_tune->name;
11232 }
11233
11234 /* Don't forget the type directive for ELF. */
11235 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11236 ASM_OUTPUT_LABEL (stream, name);
11237 }
11238
11239 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11240
11241 static void
11242 aarch64_start_file (void)
11243 {
11244 struct cl_target_option *default_options
11245 = TREE_TARGET_OPTION (target_option_default_node);
11246
11247 const struct processor *default_arch
11248 = aarch64_get_arch (default_options->x_explicit_arch);
11249 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11250 std::string extension
11251 = aarch64_get_extension_string_for_isa_flags (default_isa_flags);
11252
11253 aarch64_last_printed_arch_string = default_arch->name + extension;
11254 aarch64_last_printed_tune_string = "";
11255 asm_fprintf (asm_out_file, "\t.arch %s\n",
11256 aarch64_last_printed_arch_string.c_str ());
11257
11258 default_file_start ();
11259 }
11260
11261 /* Emit load exclusive. */
11262
11263 static void
11264 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11265 rtx mem, rtx model_rtx)
11266 {
11267 rtx (*gen) (rtx, rtx, rtx);
11268
11269 switch (mode)
11270 {
11271 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11272 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11273 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11274 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11275 default:
11276 gcc_unreachable ();
11277 }
11278
11279 emit_insn (gen (rval, mem, model_rtx));
11280 }
11281
11282 /* Emit store exclusive. */
11283
11284 static void
11285 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11286 rtx rval, rtx mem, rtx model_rtx)
11287 {
11288 rtx (*gen) (rtx, rtx, rtx, rtx);
11289
11290 switch (mode)
11291 {
11292 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11293 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11294 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11295 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11296 default:
11297 gcc_unreachable ();
11298 }
11299
11300 emit_insn (gen (bval, rval, mem, model_rtx));
11301 }
11302
11303 /* Mark the previous jump instruction as unlikely. */
11304
11305 static void
11306 aarch64_emit_unlikely_jump (rtx insn)
11307 {
11308 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11309
11310 insn = emit_jump_insn (insn);
11311 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11312 }
11313
11314 /* Expand a compare and swap pattern. */
11315
11316 void
11317 aarch64_expand_compare_and_swap (rtx operands[])
11318 {
11319 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11320 machine_mode mode, cmp_mode;
11321 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11322 int idx;
11323 gen_cas_fn gen;
11324 const gen_cas_fn split_cas[] =
11325 {
11326 gen_aarch64_compare_and_swapqi,
11327 gen_aarch64_compare_and_swaphi,
11328 gen_aarch64_compare_and_swapsi,
11329 gen_aarch64_compare_and_swapdi
11330 };
11331 const gen_cas_fn atomic_cas[] =
11332 {
11333 gen_aarch64_compare_and_swapqi_lse,
11334 gen_aarch64_compare_and_swaphi_lse,
11335 gen_aarch64_compare_and_swapsi_lse,
11336 gen_aarch64_compare_and_swapdi_lse
11337 };
11338
11339 bval = operands[0];
11340 rval = operands[1];
11341 mem = operands[2];
11342 oldval = operands[3];
11343 newval = operands[4];
11344 is_weak = operands[5];
11345 mod_s = operands[6];
11346 mod_f = operands[7];
11347 mode = GET_MODE (mem);
11348 cmp_mode = mode;
11349
11350 /* Normally the succ memory model must be stronger than fail, but in the
11351 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11352 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11353
11354 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11355 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11356 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11357
11358 switch (mode)
11359 {
11360 case QImode:
11361 case HImode:
11362 /* For short modes, we're going to perform the comparison in SImode,
11363 so do the zero-extension now. */
11364 cmp_mode = SImode;
11365 rval = gen_reg_rtx (SImode);
11366 oldval = convert_modes (SImode, mode, oldval, true);
11367 /* Fall through. */
11368
11369 case SImode:
11370 case DImode:
11371 /* Force the value into a register if needed. */
11372 if (!aarch64_plus_operand (oldval, mode))
11373 oldval = force_reg (cmp_mode, oldval);
11374 break;
11375
11376 default:
11377 gcc_unreachable ();
11378 }
11379
11380 switch (mode)
11381 {
11382 case QImode: idx = 0; break;
11383 case HImode: idx = 1; break;
11384 case SImode: idx = 2; break;
11385 case DImode: idx = 3; break;
11386 default:
11387 gcc_unreachable ();
11388 }
11389 if (TARGET_LSE)
11390 gen = atomic_cas[idx];
11391 else
11392 gen = split_cas[idx];
11393
11394 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11395
11396 if (mode == QImode || mode == HImode)
11397 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11398
11399 x = gen_rtx_REG (CCmode, CC_REGNUM);
11400 x = gen_rtx_EQ (SImode, x, const0_rtx);
11401 emit_insn (gen_rtx_SET (bval, x));
11402 }
11403
11404 /* Test whether the target supports using an atomic load-operate instruction.
11405 CODE is the operation and AFTER is TRUE if the data in memory after the
11406 operation should be returned and FALSE if the data before the operation
11407 should be returned. Returns FALSE if the operation isn't supported by the
11408 architecture. */
11409
11410 bool
11411 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11412 {
11413 if (!TARGET_LSE)
11414 return false;
11415
11416 switch (code)
11417 {
11418 case SET:
11419 case AND:
11420 case IOR:
11421 case XOR:
11422 case MINUS:
11423 case PLUS:
11424 return true;
11425 default:
11426 return false;
11427 }
11428 }
11429
11430 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11431 sequence implementing an atomic operation. */
11432
11433 static void
11434 aarch64_emit_post_barrier (enum memmodel model)
11435 {
11436 const enum memmodel base_model = memmodel_base (model);
11437
11438 if (is_mm_sync (model)
11439 && (base_model == MEMMODEL_ACQUIRE
11440 || base_model == MEMMODEL_ACQ_REL
11441 || base_model == MEMMODEL_SEQ_CST))
11442 {
11443 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11444 }
11445 }
11446
11447 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11448 for the data in memory. EXPECTED is the value expected to be in memory.
11449 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11450 is the memory ordering to use. */
11451
11452 void
11453 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11454 rtx expected, rtx desired,
11455 rtx model)
11456 {
11457 rtx (*gen) (rtx, rtx, rtx, rtx);
11458 machine_mode mode;
11459
11460 mode = GET_MODE (mem);
11461
11462 switch (mode)
11463 {
11464 case QImode: gen = gen_aarch64_atomic_casqi; break;
11465 case HImode: gen = gen_aarch64_atomic_cashi; break;
11466 case SImode: gen = gen_aarch64_atomic_cassi; break;
11467 case DImode: gen = gen_aarch64_atomic_casdi; break;
11468 default:
11469 gcc_unreachable ();
11470 }
11471
11472 /* Move the expected value into the CAS destination register. */
11473 emit_insn (gen_rtx_SET (rval, expected));
11474
11475 /* Emit the CAS. */
11476 emit_insn (gen (rval, mem, desired, model));
11477
11478 /* Compare the expected value with the value loaded by the CAS, to establish
11479 whether the swap was made. */
11480 aarch64_gen_compare_reg (EQ, rval, expected);
11481 }
11482
11483 /* Split a compare and swap pattern. */
11484
11485 void
11486 aarch64_split_compare_and_swap (rtx operands[])
11487 {
11488 rtx rval, mem, oldval, newval, scratch;
11489 machine_mode mode;
11490 bool is_weak;
11491 rtx_code_label *label1, *label2;
11492 rtx x, cond;
11493 enum memmodel model;
11494 rtx model_rtx;
11495
11496 rval = operands[0];
11497 mem = operands[1];
11498 oldval = operands[2];
11499 newval = operands[3];
11500 is_weak = (operands[4] != const0_rtx);
11501 model_rtx = operands[5];
11502 scratch = operands[7];
11503 mode = GET_MODE (mem);
11504 model = memmodel_from_int (INTVAL (model_rtx));
11505
11506 label1 = NULL;
11507 if (!is_weak)
11508 {
11509 label1 = gen_label_rtx ();
11510 emit_label (label1);
11511 }
11512 label2 = gen_label_rtx ();
11513
11514 /* The initial load can be relaxed for a __sync operation since a final
11515 barrier will be emitted to stop code hoisting. */
11516 if (is_mm_sync (model))
11517 aarch64_emit_load_exclusive (mode, rval, mem,
11518 GEN_INT (MEMMODEL_RELAXED));
11519 else
11520 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11521
11522 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11523 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11524 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11525 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11526 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11527
11528 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11529
11530 if (!is_weak)
11531 {
11532 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11533 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11534 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11535 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11536 }
11537 else
11538 {
11539 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11540 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11541 emit_insn (gen_rtx_SET (cond, x));
11542 }
11543
11544 emit_label (label2);
11545
11546 /* Emit any final barrier needed for a __sync operation. */
11547 if (is_mm_sync (model))
11548 aarch64_emit_post_barrier (model);
11549 }
11550
11551 /* Emit a BIC instruction. */
11552
11553 static void
11554 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11555 {
11556 rtx shift_rtx = GEN_INT (shift);
11557 rtx (*gen) (rtx, rtx, rtx, rtx);
11558
11559 switch (mode)
11560 {
11561 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11562 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11563 default:
11564 gcc_unreachable ();
11565 }
11566
11567 emit_insn (gen (dst, s2, shift_rtx, s1));
11568 }
11569
11570 /* Emit an atomic swap. */
11571
11572 static void
11573 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11574 rtx mem, rtx model)
11575 {
11576 rtx (*gen) (rtx, rtx, rtx, rtx);
11577
11578 switch (mode)
11579 {
11580 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11581 case HImode: gen = gen_aarch64_atomic_swphi; break;
11582 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11583 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11584 default:
11585 gcc_unreachable ();
11586 }
11587
11588 emit_insn (gen (dst, mem, value, model));
11589 }
11590
11591 /* Operations supported by aarch64_emit_atomic_load_op. */
11592
11593 enum aarch64_atomic_load_op_code
11594 {
11595 AARCH64_LDOP_PLUS, /* A + B */
11596 AARCH64_LDOP_XOR, /* A ^ B */
11597 AARCH64_LDOP_OR, /* A | B */
11598 AARCH64_LDOP_BIC /* A & ~B */
11599 };
11600
11601 /* Emit an atomic load-operate. */
11602
11603 static void
11604 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11605 machine_mode mode, rtx dst, rtx src,
11606 rtx mem, rtx model)
11607 {
11608 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11609 const aarch64_atomic_load_op_fn plus[] =
11610 {
11611 gen_aarch64_atomic_loadaddqi,
11612 gen_aarch64_atomic_loadaddhi,
11613 gen_aarch64_atomic_loadaddsi,
11614 gen_aarch64_atomic_loadadddi
11615 };
11616 const aarch64_atomic_load_op_fn eor[] =
11617 {
11618 gen_aarch64_atomic_loadeorqi,
11619 gen_aarch64_atomic_loadeorhi,
11620 gen_aarch64_atomic_loadeorsi,
11621 gen_aarch64_atomic_loadeordi
11622 };
11623 const aarch64_atomic_load_op_fn ior[] =
11624 {
11625 gen_aarch64_atomic_loadsetqi,
11626 gen_aarch64_atomic_loadsethi,
11627 gen_aarch64_atomic_loadsetsi,
11628 gen_aarch64_atomic_loadsetdi
11629 };
11630 const aarch64_atomic_load_op_fn bic[] =
11631 {
11632 gen_aarch64_atomic_loadclrqi,
11633 gen_aarch64_atomic_loadclrhi,
11634 gen_aarch64_atomic_loadclrsi,
11635 gen_aarch64_atomic_loadclrdi
11636 };
11637 aarch64_atomic_load_op_fn gen;
11638 int idx = 0;
11639
11640 switch (mode)
11641 {
11642 case QImode: idx = 0; break;
11643 case HImode: idx = 1; break;
11644 case SImode: idx = 2; break;
11645 case DImode: idx = 3; break;
11646 default:
11647 gcc_unreachable ();
11648 }
11649
11650 switch (code)
11651 {
11652 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11653 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11654 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11655 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11656 default:
11657 gcc_unreachable ();
11658 }
11659
11660 emit_insn (gen (dst, mem, src, model));
11661 }
11662
11663 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11664 location to store the data read from memory. OUT_RESULT is the location to
11665 store the result of the operation. MEM is the memory location to read and
11666 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11667 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11668 be NULL. */
11669
11670 void
11671 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11672 rtx mem, rtx value, rtx model_rtx)
11673 {
11674 machine_mode mode = GET_MODE (mem);
11675 machine_mode wmode = (mode == DImode ? DImode : SImode);
11676 const bool short_mode = (mode < SImode);
11677 aarch64_atomic_load_op_code ldop_code;
11678 rtx src;
11679 rtx x;
11680
11681 if (out_data)
11682 out_data = gen_lowpart (mode, out_data);
11683
11684 if (out_result)
11685 out_result = gen_lowpart (mode, out_result);
11686
11687 /* Make sure the value is in a register, putting it into a destination
11688 register if it needs to be manipulated. */
11689 if (!register_operand (value, mode)
11690 || code == AND || code == MINUS)
11691 {
11692 src = out_result ? out_result : out_data;
11693 emit_move_insn (src, gen_lowpart (mode, value));
11694 }
11695 else
11696 src = value;
11697 gcc_assert (register_operand (src, mode));
11698
11699 /* Preprocess the data for the operation as necessary. If the operation is
11700 a SET then emit a swap instruction and finish. */
11701 switch (code)
11702 {
11703 case SET:
11704 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11705 return;
11706
11707 case MINUS:
11708 /* Negate the value and treat it as a PLUS. */
11709 {
11710 rtx neg_src;
11711
11712 /* Resize the value if necessary. */
11713 if (short_mode)
11714 src = gen_lowpart (wmode, src);
11715
11716 neg_src = gen_rtx_NEG (wmode, src);
11717 emit_insn (gen_rtx_SET (src, neg_src));
11718
11719 if (short_mode)
11720 src = gen_lowpart (mode, src);
11721 }
11722 /* Fall-through. */
11723 case PLUS:
11724 ldop_code = AARCH64_LDOP_PLUS;
11725 break;
11726
11727 case IOR:
11728 ldop_code = AARCH64_LDOP_OR;
11729 break;
11730
11731 case XOR:
11732 ldop_code = AARCH64_LDOP_XOR;
11733 break;
11734
11735 case AND:
11736 {
11737 rtx not_src;
11738
11739 /* Resize the value if necessary. */
11740 if (short_mode)
11741 src = gen_lowpart (wmode, src);
11742
11743 not_src = gen_rtx_NOT (wmode, src);
11744 emit_insn (gen_rtx_SET (src, not_src));
11745
11746 if (short_mode)
11747 src = gen_lowpart (mode, src);
11748 }
11749 ldop_code = AARCH64_LDOP_BIC;
11750 break;
11751
11752 default:
11753 /* The operation can't be done with atomic instructions. */
11754 gcc_unreachable ();
11755 }
11756
11757 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11758
11759 /* If necessary, calculate the data in memory after the update by redoing the
11760 operation from values in registers. */
11761 if (!out_result)
11762 return;
11763
11764 if (short_mode)
11765 {
11766 src = gen_lowpart (wmode, src);
11767 out_data = gen_lowpart (wmode, out_data);
11768 out_result = gen_lowpart (wmode, out_result);
11769 }
11770
11771 x = NULL_RTX;
11772
11773 switch (code)
11774 {
11775 case MINUS:
11776 case PLUS:
11777 x = gen_rtx_PLUS (wmode, out_data, src);
11778 break;
11779 case IOR:
11780 x = gen_rtx_IOR (wmode, out_data, src);
11781 break;
11782 case XOR:
11783 x = gen_rtx_XOR (wmode, out_data, src);
11784 break;
11785 case AND:
11786 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11787 return;
11788 default:
11789 gcc_unreachable ();
11790 }
11791
11792 emit_set_insn (out_result, x);
11793
11794 return;
11795 }
11796
11797 /* Split an atomic operation. */
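/* Illustrative sketch (not part of the original source): splitting an SImode
   atomic fetch-and-add without LSE produces, roughly,

   .Lretry:
       ldxr   w_old, [x_mem]           // load variant depends on the model
       add    w_new, w_old, w_val
       stxr   w_tmp, w_new, [x_mem]
       cbnz   w_tmp, .Lretry

   with OLD_OUT and NEW_OUT receiving the values before and after the
   update respectively.  */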
11798
11799 void
11800 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11801 rtx value, rtx model_rtx, rtx cond)
11802 {
11803 machine_mode mode = GET_MODE (mem);
11804 machine_mode wmode = (mode == DImode ? DImode : SImode);
11805 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11806 const bool is_sync = is_mm_sync (model);
11807 rtx_code_label *label;
11808 rtx x;
11809
11810 /* Split the atomic operation into a sequence. */
11811 label = gen_label_rtx ();
11812 emit_label (label);
11813
11814 if (new_out)
11815 new_out = gen_lowpart (wmode, new_out);
11816 if (old_out)
11817 old_out = gen_lowpart (wmode, old_out);
11818 else
11819 old_out = new_out;
11820 value = simplify_gen_subreg (wmode, value, mode, 0);
11821
11822 /* The initial load can be relaxed for a __sync operation since a final
11823 barrier will be emitted to stop code hoisting. */
11824 if (is_sync)
11825 aarch64_emit_load_exclusive (mode, old_out, mem,
11826 GEN_INT (MEMMODEL_RELAXED));
11827 else
11828 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11829
11830 switch (code)
11831 {
11832 case SET:
11833 new_out = value;
11834 break;
11835
11836 case NOT:
11837 x = gen_rtx_AND (wmode, old_out, value);
11838 emit_insn (gen_rtx_SET (new_out, x));
11839 x = gen_rtx_NOT (wmode, new_out);
11840 emit_insn (gen_rtx_SET (new_out, x));
11841 break;
11842
11843 case MINUS:
11844 if (CONST_INT_P (value))
11845 {
11846 value = GEN_INT (-INTVAL (value));
11847 code = PLUS;
11848 }
11849 /* Fall through. */
11850
11851 default:
11852 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11853 emit_insn (gen_rtx_SET (new_out, x));
11854 break;
11855 }
11856
11857 aarch64_emit_store_exclusive (mode, cond, mem,
11858 gen_lowpart (mode, new_out), model_rtx);
11859
11860 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11861 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11862 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11863 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11864
11865 /* Emit any final barrier needed for a __sync operation. */
11866 if (is_sync)
11867 aarch64_emit_post_barrier (model);
11868 }
11869
11870 static void
11871 aarch64_init_libfuncs (void)
11872 {
11873 /* Half-precision float operations. The compiler handles all operations
11874 with NULL libfuncs by converting to SFmode. */
11875
11876 /* Conversions. */
11877 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11878 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11879
11880 /* Arithmetic. */
11881 set_optab_libfunc (add_optab, HFmode, NULL);
11882 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11883 set_optab_libfunc (smul_optab, HFmode, NULL);
11884 set_optab_libfunc (neg_optab, HFmode, NULL);
11885 set_optab_libfunc (sub_optab, HFmode, NULL);
11886
11887 /* Comparisons. */
11888 set_optab_libfunc (eq_optab, HFmode, NULL);
11889 set_optab_libfunc (ne_optab, HFmode, NULL);
11890 set_optab_libfunc (lt_optab, HFmode, NULL);
11891 set_optab_libfunc (le_optab, HFmode, NULL);
11892 set_optab_libfunc (ge_optab, HFmode, NULL);
11893 set_optab_libfunc (gt_optab, HFmode, NULL);
11894 set_optab_libfunc (unord_optab, HFmode, NULL);
11895 }
11896
11897 /* Target hook for c_mode_for_suffix. */
11898 static machine_mode
11899 aarch64_c_mode_for_suffix (char suffix)
11900 {
11901 if (suffix == 'q')
11902 return TFmode;
11903
11904 return VOIDmode;
11905 }
11906
11907 /* We can only represent floating point constants which will fit in
11908 "quarter-precision" values. These values are characterised by
11909 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
11910 by:
11911
11912 (-1)^s * (n/16) * 2^r
11913
11914 Where:
11915 's' is the sign bit.
11916 'n' is an integer in the range 16 <= n <= 31.
11917 'r' is an integer in the range -3 <= r <= 4. */
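/* For example (illustrative, not part of the original source):
   1.0 = (16/16) * 2^0 and 0.125 = (16/16) * 2^-3 are representable, and the
   largest representable value is (31/16) * 2^4 = 31.0.  0.0 has no encoding
   of this form, which is why it is rejected below.  */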
11918
11919 /* Return true iff X can be represented as a quarter-precision
11920 floating point immediate operand. Note, we cannot represent 0.0. */
11921 bool
11922 aarch64_float_const_representable_p (rtx x)
11923 {
11924 /* This represents our current view of how many bits
11925 make up the mantissa. */
11926 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11927 int exponent;
11928 unsigned HOST_WIDE_INT mantissa, mask;
11929 REAL_VALUE_TYPE r, m;
11930 bool fail;
11931
11932 if (!CONST_DOUBLE_P (x))
11933 return false;
11934
11935 /* We don't support HFmode constants yet. */
11936 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11937 return false;
11938
11939 r = *CONST_DOUBLE_REAL_VALUE (x);
11940
11941 /* We cannot represent infinities, NaNs or +/-zero. We won't
11942 know if we have +zero until we analyse the mantissa, but we
11943 can reject the other invalid values. */
11944 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11945 || REAL_VALUE_MINUS_ZERO (r))
11946 return false;
11947
11948 /* Extract exponent. */
11949 r = real_value_abs (&r);
11950 exponent = REAL_EXP (&r);
11951
11952 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11953 highest (sign) bit, with a fixed binary point at bit point_pos.
11954 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
11955 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11956 bits for the mantissa, this can fail (low bits will be lost). */
11957 real_ldexp (&m, &r, point_pos - exponent);
11958 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11959
11960 /* If the low part of the mantissa has bits set we cannot represent
11961 the value. */
11962 if (w.elt (0) != 0)
11963 return false;
11964 /* We have rejected the lower HOST_WIDE_INT, so update our
11965 understanding of how many bits lie in the mantissa and
11966 look only at the high HOST_WIDE_INT. */
11967 mantissa = w.elt (1);
11968 point_pos -= HOST_BITS_PER_WIDE_INT;
11969
11970 /* We can only represent values with a mantissa of the form 1.xxxx. */
11971 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11972 if ((mantissa & mask) != 0)
11973 return false;
11974
11975 /* Having filtered unrepresentable values, we may now remove all
11976 but the highest 5 bits. */
11977 mantissa >>= point_pos - 5;
11978
11979 /* We cannot represent the value 0.0, so reject it. This is handled
11980 elsewhere. */
11981 if (mantissa == 0)
11982 return false;
11983
11984 /* Then, as bit 4 is always set, we can mask it off, leaving
11985 the mantissa in the range [0, 15]. */
11986 mantissa &= ~(1 << 4);
11987 gcc_assert (mantissa <= 15);
11988
11989 /* GCC internally does not use IEEE754-like encoding (where normalized
11990 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
11991 Our mantissa values are shifted 4 places to the left relative to
11992 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11993 by 5 places to correct for GCC's representation. */
11994 exponent = 5 - exponent;
11995
11996 return (exponent >= 0 && exponent <= 7);
11997 }
11998
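/* Illustrative example (not part of the original source): for a V4SImode
   constant with the value 0x0000ab00 in every lane,
   aarch64_output_simd_mov_immediate returns the template
   "movi\t%0.4s, 0xab, lsl 8".  */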
11999 char*
12000 aarch64_output_simd_mov_immediate (rtx const_vector,
12001 machine_mode mode,
12002 unsigned width)
12003 {
12004 bool is_valid;
12005 static char templ[40];
12006 const char *mnemonic;
12007 const char *shift_op;
12008 unsigned int lane_count = 0;
12009 char element_char;
12010
12011 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12012
12013 /* This will return true to show that const_vector is legal for use as
12014 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
12015 also update INFO to show how the immediate should be generated. */
12016 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12017 gcc_assert (is_valid);
12018
12019 element_char = sizetochar (info.element_width);
12020 lane_count = width / info.element_width;
12021
12022 mode = GET_MODE_INNER (mode);
12023 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12024 {
12025 gcc_assert (info.shift == 0 && ! info.mvn);
12026 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12027 move immediate path. */
12028 if (aarch64_float_const_zero_rtx_p (info.value))
12029 info.value = GEN_INT (0);
12030 else
12031 {
12032 #define buf_size 20
12033 char float_buf[buf_size] = {'\0'};
12034 real_to_decimal_for_mode (float_buf,
12035 CONST_DOUBLE_REAL_VALUE (info.value),
12036 buf_size, buf_size, 1, mode);
12037 #undef buf_size
12038
12039 if (lane_count == 1)
12040 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12041 else
12042 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12043 lane_count, element_char, float_buf);
12044 return templ;
12045 }
12046 }
12047
12048 mnemonic = info.mvn ? "mvni" : "movi";
12049 shift_op = info.msl ? "msl" : "lsl";
12050
12051 gcc_assert (CONST_INT_P (info.value));
12052 if (lane_count == 1)
12053 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12054 mnemonic, UINTVAL (info.value));
12055 else if (info.shift)
12056 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12057 ", %s %d", mnemonic, lane_count, element_char,
12058 UINTVAL (info.value), shift_op, info.shift);
12059 else
12060 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12061 mnemonic, lane_count, element_char, UINTVAL (info.value));
12062 return templ;
12063 }
12064
12065 char*
12066 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12067 machine_mode mode)
12068 {
12069 machine_mode vmode;
12070
12071 gcc_assert (!VECTOR_MODE_P (mode));
12072 vmode = aarch64_simd_container_mode (mode, 64);
12073 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12074 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12075 }
12076
12077 /* Split operands into moves from op[1] + op[2] into op[0]. */
12078
12079 void
12080 aarch64_split_combinev16qi (rtx operands[3])
12081 {
12082 unsigned int dest = REGNO (operands[0]);
12083 unsigned int src1 = REGNO (operands[1]);
12084 unsigned int src2 = REGNO (operands[2]);
12085 machine_mode halfmode = GET_MODE (operands[1]);
12086 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12087 rtx destlo, desthi;
12088
12089 gcc_assert (halfmode == V16QImode);
12090
12091 if (src1 == dest && src2 == dest + halfregs)
12092 {
12093 /* No-op move. Can't split to nothing; emit something. */
12094 emit_note (NOTE_INSN_DELETED);
12095 return;
12096 }
12097
12098 /* Preserve register attributes for variable tracking. */
12099 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12100 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12101 GET_MODE_SIZE (halfmode));
12102
12103 /* Special case of reversed high/low parts. */
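/* The three XORs below exchange the two halves in place (the classic
   XOR-swap idiom), avoiding the need for a scratch register.  */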
12104 if (reg_overlap_mentioned_p (operands[2], destlo)
12105 && reg_overlap_mentioned_p (operands[1], desthi))
12106 {
12107 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12108 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12109 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12110 }
12111 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12112 {
12113 /* Try to avoid unnecessary moves if part of the result
12114 is in the right place already. */
12115 if (src1 != dest)
12116 emit_move_insn (destlo, operands[1]);
12117 if (src2 != dest + halfregs)
12118 emit_move_insn (desthi, operands[2]);
12119 }
12120 else
12121 {
12122 if (src2 != dest + halfregs)
12123 emit_move_insn (desthi, operands[2]);
12124 if (src1 != dest)
12125 emit_move_insn (destlo, operands[1]);
12126 }
12127 }
12128
12129 /* vec_perm support. */
12130
12131 #define MAX_VECT_LEN 16
12132
12133 struct expand_vec_perm_d
12134 {
12135 rtx target, op0, op1;
12136 unsigned char perm[MAX_VECT_LEN];
12137 machine_mode vmode;
12138 unsigned char nelt;
12139 bool one_vector_p;
12140 bool testing_p;
12141 };
12142
12143 /* Generate a variable permutation. */
12144
12145 static void
12146 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12147 {
12148 machine_mode vmode = GET_MODE (target);
12149 bool one_vector_p = rtx_equal_p (op0, op1);
12150
12151 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12152 gcc_checking_assert (GET_MODE (op0) == vmode);
12153 gcc_checking_assert (GET_MODE (op1) == vmode);
12154 gcc_checking_assert (GET_MODE (sel) == vmode);
12155 gcc_checking_assert (TARGET_SIMD);
12156
12157 if (one_vector_p)
12158 {
12159 if (vmode == V8QImode)
12160 {
12161 /* Expand the argument to a V16QI mode by duplicating it. */
12162 rtx pair = gen_reg_rtx (V16QImode);
12163 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12164 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12165 }
12166 else
12167 {
12168 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12169 }
12170 }
12171 else
12172 {
12173 rtx pair;
12174
12175 if (vmode == V8QImode)
12176 {
12177 pair = gen_reg_rtx (V16QImode);
12178 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12179 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12180 }
12181 else
12182 {
12183 pair = gen_reg_rtx (OImode);
12184 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12185 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12186 }
12187 }
12188 }
12189
12190 void
12191 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12192 {
12193 machine_mode vmode = GET_MODE (target);
12194 unsigned int nelt = GET_MODE_NUNITS (vmode);
12195 bool one_vector_p = rtx_equal_p (op0, op1);
12196 rtx mask;
12197
12198 /* The TBL instruction does not use a modulo index, so we must take care
12199 of that ourselves. */
12200 mask = aarch64_simd_gen_const_vector_dup (vmode,
12201 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12202 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12203
12204 /* For big-endian, we also need to reverse the index within the vector
12205 (but not which vector). */
12206 if (BYTES_BIG_ENDIAN)
12207 {
12208 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12209 if (!one_vector_p)
12210 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12211 sel = expand_simple_binop (vmode, XOR, sel, mask,
12212 NULL, 0, OPTAB_LIB_WIDEN);
12213 }
12214 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12215 }
12216
12217 /* Recognize patterns suitable for the TRN instructions. */
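/* Illustrative example (not part of the original source): for V4SImode the
   permutation {0, 4, 2, 6} is matched as TRN1 and {1, 5, 3, 7} as TRN2;
   the operands and the parity are swapped again for big-endian.  */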
12218 static bool
12219 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12220 {
12221 unsigned int i, odd, mask, nelt = d->nelt;
12222 rtx out, in0, in1, x;
12223 rtx (*gen) (rtx, rtx, rtx);
12224 machine_mode vmode = d->vmode;
12225
12226 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12227 return false;
12228
12229 /* Note that these are little-endian tests.
12230 We correct for big-endian later. */
12231 if (d->perm[0] == 0)
12232 odd = 0;
12233 else if (d->perm[0] == 1)
12234 odd = 1;
12235 else
12236 return false;
12237 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12238
12239 for (i = 0; i < nelt; i += 2)
12240 {
12241 if (d->perm[i] != i + odd)
12242 return false;
12243 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12244 return false;
12245 }
12246
12247 /* Success! */
12248 if (d->testing_p)
12249 return true;
12250
12251 in0 = d->op0;
12252 in1 = d->op1;
12253 if (BYTES_BIG_ENDIAN)
12254 {
12255 x = in0, in0 = in1, in1 = x;
12256 odd = !odd;
12257 }
12258 out = d->target;
12259
12260 if (odd)
12261 {
12262 switch (vmode)
12263 {
12264 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12265 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12266 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12267 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12268 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12269 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12270 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12271 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12272 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12273 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12274 default:
12275 return false;
12276 }
12277 }
12278 else
12279 {
12280 switch (vmode)
12281 {
12282 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12283 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12284 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12285 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12286 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12287 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12288 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12289 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12290 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12291 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12292 default:
12293 return false;
12294 }
12295 }
12296
12297 emit_insn (gen (out, in0, in1));
12298 return true;
12299 }
12300
12301 /* Recognize patterns suitable for the UZP instructions. */
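/* Illustrative example (not part of the original source): for V4SImode the
   permutation {0, 2, 4, 6} selects the even-indexed elements of the
   concatenated inputs and is matched as UZP1, while {1, 3, 5, 7} selects
   the odd-indexed elements and is matched as UZP2.  */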
12302 static bool
12303 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12304 {
12305 unsigned int i, odd, mask, nelt = d->nelt;
12306 rtx out, in0, in1, x;
12307 rtx (*gen) (rtx, rtx, rtx);
12308 machine_mode vmode = d->vmode;
12309
12310 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12311 return false;
12312
12313 /* Note that these are little-endian tests.
12314 We correct for big-endian later. */
12315 if (d->perm[0] == 0)
12316 odd = 0;
12317 else if (d->perm[0] == 1)
12318 odd = 1;
12319 else
12320 return false;
12321 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12322
12323 for (i = 0; i < nelt; i++)
12324 {
12325 unsigned elt = (i * 2 + odd) & mask;
12326 if (d->perm[i] != elt)
12327 return false;
12328 }
12329
12330 /* Success! */
12331 if (d->testing_p)
12332 return true;
12333
12334 in0 = d->op0;
12335 in1 = d->op1;
12336 if (BYTES_BIG_ENDIAN)
12337 {
12338 x = in0, in0 = in1, in1 = x;
12339 odd = !odd;
12340 }
12341 out = d->target;
12342
12343 if (odd)
12344 {
12345 switch (vmode)
12346 {
12347 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12348 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12349 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12350 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12351 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12352 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12353 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12354 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12355 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12356 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12357 default:
12358 return false;
12359 }
12360 }
12361 else
12362 {
12363 switch (vmode)
12364 {
12365 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12366 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12367 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12368 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12369 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12370 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12371 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12372 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12373 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12374 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12375 default:
12376 return false;
12377 }
12378 }
12379
12380 emit_insn (gen (out, in0, in1));
12381 return true;
12382 }
12383
12384 /* Recognize patterns suitable for the ZIP instructions. */
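/* Illustrative example (not part of the original source): for V4SImode the
   permutation {0, 4, 1, 5} interleaves the low halves of the two inputs and
   is matched as ZIP1, while {2, 6, 3, 7} interleaves the high halves and is
   matched as ZIP2.  */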
12385 static bool
12386 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12387 {
12388 unsigned int i, high, mask, nelt = d->nelt;
12389 rtx out, in0, in1, x;
12390 rtx (*gen) (rtx, rtx, rtx);
12391 machine_mode vmode = d->vmode;
12392
12393 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12394 return false;
12395
12396 /* Note that these are little-endian tests.
12397 We correct for big-endian later. */
12398 high = nelt / 2;
12399 if (d->perm[0] == high)
12400 /* Do Nothing. */
12401 ;
12402 else if (d->perm[0] == 0)
12403 high = 0;
12404 else
12405 return false;
12406 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12407
12408 for (i = 0; i < nelt / 2; i++)
12409 {
12410 unsigned elt = (i + high) & mask;
12411 if (d->perm[i * 2] != elt)
12412 return false;
12413 elt = (elt + nelt) & mask;
12414 if (d->perm[i * 2 + 1] != elt)
12415 return false;
12416 }
12417
12418 /* Success! */
12419 if (d->testing_p)
12420 return true;
12421
12422 in0 = d->op0;
12423 in1 = d->op1;
12424 if (BYTES_BIG_ENDIAN)
12425 {
12426 x = in0, in0 = in1, in1 = x;
12427 high = !high;
12428 }
12429 out = d->target;
12430
12431 if (high)
12432 {
12433 switch (vmode)
12434 {
12435 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12436 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12437 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12438 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12439 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12440 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12441 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12442 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12443 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12444 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12445 default:
12446 return false;
12447 }
12448 }
12449 else
12450 {
12451 switch (vmode)
12452 {
12453 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12454 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12455 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12456 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12457 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12458 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12459 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12460 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12461 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12462 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12463 default:
12464 return false;
12465 }
12466 }
12467
12468 emit_insn (gen (out, in0, in1));
12469 return true;
12470 }
12471
12472 /* Recognize patterns for the EXT insn. */
12473
12474 static bool
12475 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12476 {
12477 unsigned int i, nelt = d->nelt;
12478 rtx (*gen) (rtx, rtx, rtx, rtx);
12479 rtx offset;
12480
12481 unsigned int location = d->perm[0]; /* Always < nelt. */
12482
12483 /* Check if the extracted indices are increasing by one. */
12484 for (i = 1; i < nelt; i++)
12485 {
12486 unsigned int required = location + i;
12487 if (d->one_vector_p)
12488 {
12489 /* We'll pass the same vector in twice, so allow indices to wrap. */
12490 required &= (nelt - 1);
12491 }
12492 if (d->perm[i] != required)
12493 return false;
12494 }
12495
12496 switch (d->vmode)
12497 {
12498 case V16QImode: gen = gen_aarch64_extv16qi; break;
12499 case V8QImode: gen = gen_aarch64_extv8qi; break;
12500 case V4HImode: gen = gen_aarch64_extv4hi; break;
12501 case V8HImode: gen = gen_aarch64_extv8hi; break;
12502 case V2SImode: gen = gen_aarch64_extv2si; break;
12503 case V4SImode: gen = gen_aarch64_extv4si; break;
12504 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12505 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12506 case V2DImode: gen = gen_aarch64_extv2di; break;
12507 case V2DFmode: gen = gen_aarch64_extv2df; break;
12508 default:
12509 return false;
12510 }
12511
12512 /* Success! */
12513 if (d->testing_p)
12514 return true;
12515
12516 /* The case where (location == 0) is a no-op for both big- and little-endian,
12517 and is removed by the mid-end at optimization levels -O1 and higher. */
12518
12519 if (BYTES_BIG_ENDIAN && (location != 0))
12520 {
12521 /* After setup, we want the high elements of the first vector (stored
12522 at the LSB end of the register), and the low elements of the second
12523 vector (stored at the MSB end of the register). So swap. */
12524 std::swap (d->op0, d->op1);
12525 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12526 location = nelt - location;
12527 }
12528
12529 offset = GEN_INT (location);
12530 emit_insn (gen (d->target, d->op0, d->op1, offset));
12531 return true;
12532 }
12533
12534 /* Recognize patterns for the REV insns. */
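/* Illustrative example (not part of the original source): a single-input
   V4SImode permutation {1, 0, 3, 2} (diff == 1) swaps adjacent 32-bit
   elements within each 64-bit chunk and is matched as REV64, while a
   V16QImode permutation with diff == 3 reverses the bytes within each
   32-bit chunk and is matched as REV32.  */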
12535
12536 static bool
12537 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12538 {
12539 unsigned int i, j, diff, nelt = d->nelt;
12540 rtx (*gen) (rtx, rtx);
12541
12542 if (!d->one_vector_p)
12543 return false;
12544
12545 diff = d->perm[0];
12546 switch (diff)
12547 {
12548 case 7:
12549 switch (d->vmode)
12550 {
12551 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12552 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12553 default:
12554 return false;
12555 }
12556 break;
12557 case 3:
12558 switch (d->vmode)
12559 {
12560 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12561 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12562 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12563 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12564 default:
12565 return false;
12566 }
12567 break;
12568 case 1:
12569 switch (d->vmode)
12570 {
12571 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12572 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12573 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12574 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12575 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12576 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12577 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12578 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12579 default:
12580 return false;
12581 }
12582 break;
12583 default:
12584 return false;
12585 }
12586
12587 for (i = 0; i < nelt ; i += diff + 1)
12588 for (j = 0; j <= diff; j += 1)
12589 {
12590 /* This is guaranteed to be true as the value of diff
12591 is 7, 3 or 1 and we should have enough elements in the
12592 queue to generate this. Getting a vector mask with a
12593 value of diff other than these values implies that
12594 something is wrong by the time we get here. */
12595 gcc_assert (i + j < nelt);
12596 if (d->perm[i + j] != i + diff - j)
12597 return false;
12598 }
12599
12600 /* Success! */
12601 if (d->testing_p)
12602 return true;
12603
12604 emit_insn (gen (d->target, d->op0));
12605 return true;
12606 }
12607
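/* Recognize a broadcast of a single element.  Illustrative example (not
   part of the original source): for V4SImode the permutation {2, 2, 2, 2}
   is matched as a DUP of lane 2 of the first operand.  */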
12608 static bool
12609 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12610 {
12611 rtx (*gen) (rtx, rtx, rtx);
12612 rtx out = d->target;
12613 rtx in0;
12614 machine_mode vmode = d->vmode;
12615 unsigned int i, elt, nelt = d->nelt;
12616 rtx lane;
12617
12618 elt = d->perm[0];
12619 for (i = 1; i < nelt; i++)
12620 {
12621 if (elt != d->perm[i])
12622 return false;
12623 }
12624
12625 /* The generic preparation in aarch64_expand_vec_perm_const_1
12626 swaps the operand order and the permute indices if it finds
12627 d->perm[0] to be in the second operand. Thus, we can always
12628 use d->op0 and need not do any extra arithmetic to get the
12629 correct lane number. */
12630 in0 = d->op0;
12631 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12632
12633 switch (vmode)
12634 {
12635 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12636 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12637 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12638 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12639 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12640 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12641 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12642 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12643 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12644 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12645 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12646 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12647 default:
12648 return false;
12649 }
12650
12651 emit_insn (gen (out, in0, lane));
12652 return true;
12653 }
12654
12655 static bool
12656 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12657 {
12658 rtx rperm[MAX_VECT_LEN], sel;
12659 machine_mode vmode = d->vmode;
12660 unsigned int i, nelt = d->nelt;
12661
12662 if (d->testing_p)
12663 return true;
12664
12665 /* Generic code will try constant permutation twice: once with the
12666 original mode and again with the elements lowered to QImode. So wait
12667 and don't do the selector expansion ourselves. */
12668 if (vmode != V8QImode && vmode != V16QImode)
12669 return false;
12670
12671 for (i = 0; i < nelt; ++i)
12672 {
12673 int nunits = GET_MODE_NUNITS (vmode);
12674
12675 /* If big-endian and two vectors we end up with a weird mixed-endian
12676 mode on NEON. Reverse the index within each word but not the word
12677 itself. */
12678 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12679 : d->perm[i]);
12680 }
12681 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12682 sel = force_reg (vmode, sel);
12683
12684 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12685 return true;
12686 }
12687
12688 static bool
12689 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12690 {
12691 /* The pattern matching functions above are written to look for a small
12692 number to begin the sequence (0, 1, N/2). If we begin with an index
12693 from the second operand, we can swap the operands. */
12694 if (d->perm[0] >= d->nelt)
12695 {
12696 unsigned i, nelt = d->nelt;
12697
12698 gcc_assert (nelt == (nelt & -nelt));
12699 for (i = 0; i < nelt; ++i)
12700 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12701
12702 std::swap (d->op0, d->op1);
12703 }
12704
12705 if (TARGET_SIMD)
12706 {
12707 if (aarch64_evpc_rev (d))
12708 return true;
12709 else if (aarch64_evpc_ext (d))
12710 return true;
12711 else if (aarch64_evpc_dup (d))
12712 return true;
12713 else if (aarch64_evpc_zip (d))
12714 return true;
12715 else if (aarch64_evpc_uzp (d))
12716 return true;
12717 else if (aarch64_evpc_trn (d))
12718 return true;
12719 return aarch64_evpc_tbl (d);
12720 }
12721 return false;
12722 }
12723
12724 /* Expand a vec_perm_const pattern. */
12725
12726 bool
12727 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12728 {
12729 struct expand_vec_perm_d d;
12730 int i, nelt, which;
12731
12732 d.target = target;
12733 d.op0 = op0;
12734 d.op1 = op1;
12735
12736 d.vmode = GET_MODE (target);
12737 gcc_assert (VECTOR_MODE_P (d.vmode));
12738 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12739 d.testing_p = false;
12740
12741 for (i = which = 0; i < nelt; ++i)
12742 {
12743 rtx e = XVECEXP (sel, 0, i);
12744 int ei = INTVAL (e) & (2 * nelt - 1);
12745 which |= (ei < nelt ? 1 : 2);
12746 d.perm[i] = ei;
12747 }
12748
12749 switch (which)
12750 {
12751 default:
12752 gcc_unreachable ();
12753
12754 case 3:
12755 d.one_vector_p = false;
12756 if (!rtx_equal_p (op0, op1))
12757 break;
12758
12759 /* The elements of PERM do not suggest that only the first operand
12760 is used, but both operands are identical. Allow easier matching
12761 of the permutation by folding the permutation into the single
12762 input vector. */
12763 /* Fall Through. */
12764 case 2:
12765 for (i = 0; i < nelt; ++i)
12766 d.perm[i] &= nelt - 1;
12767 d.op0 = op1;
12768 d.one_vector_p = true;
12769 break;
12770
12771 case 1:
12772 d.op1 = op0;
12773 d.one_vector_p = true;
12774 break;
12775 }
12776
12777 return aarch64_expand_vec_perm_const_1 (&d);
12778 }
12779
12780 static bool
12781 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12782 const unsigned char *sel)
12783 {
12784 struct expand_vec_perm_d d;
12785 unsigned int i, nelt, which;
12786 bool ret;
12787
12788 d.vmode = vmode;
12789 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12790 d.testing_p = true;
12791 memcpy (d.perm, sel, nelt);
12792
12793 /* Calculate whether all elements are in one vector. */
12794 for (i = which = 0; i < nelt; ++i)
12795 {
12796 unsigned char e = d.perm[i];
12797 gcc_assert (e < 2 * nelt);
12798 which |= (e < nelt ? 1 : 2);
12799 }
12800
12801 /* If all elements are from the second vector, reindex as if from the
12802 first vector. */
12803 if (which == 2)
12804 for (i = 0; i < nelt; ++i)
12805 d.perm[i] -= nelt;
12806
12807 /* Check whether the mask can be applied to a single vector. */
12808 d.one_vector_p = (which != 3);
12809
12810 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12811 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12812 if (!d.one_vector_p)
12813 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12814
12815 start_sequence ();
12816 ret = aarch64_expand_vec_perm_const_1 (&d);
12817 end_sequence ();
12818
12819 return ret;
12820 }
12821
12822 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
12823 bool
12824 aarch64_cannot_change_mode_class (machine_mode from,
12825 machine_mode to,
12826 enum reg_class rclass)
12827 {
12828 /* We cannot allow word_mode subregs of full vector modes.
12829 Otherwise the middle-end will assume it's ok to store to
12830 (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
12831 of the 128-bit register. However, after reload the subreg will
12832 be dropped leaving a plain DImode store. See PR67609 for a more
12833 detailed dicussion. In all other cases, we want to be permissive
12834 and return false. */
12835 return (reg_classes_intersect_p (FP_REGS, rclass)
12836 && GET_MODE_SIZE (to) == UNITS_PER_WORD
12837 && GET_MODE_SIZE (from) > UNITS_PER_WORD);
12838 }
12839
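/* Illustrative example (not part of the original source): for V4SImode the
   byte-selection mask built below is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. it reverses the byte order within each 32-bit element.  */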
12840 rtx
12841 aarch64_reverse_mask (enum machine_mode mode)
12842 {
12843 /* We have to reverse each vector because we don't have
12844 a permuted load that can reverse-load according to ABI rules. */
12845 rtx mask;
12846 rtvec v = rtvec_alloc (16);
12847 int i, j;
12848 int nunits = GET_MODE_NUNITS (mode);
12849 int usize = GET_MODE_UNIT_SIZE (mode);
12850
12851 gcc_assert (BYTES_BIG_ENDIAN);
12852 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12853
12854 for (i = 0; i < nunits; i++)
12855 for (j = 0; j < usize; j++)
12856 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12857 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12858 return force_reg (V16QImode, mask);
12859 }
12860
12861 /* Implement MODES_TIEABLE_P. */
12862
12863 bool
12864 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12865 {
12866 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12867 return true;
12868
12869 /* We specifically want to allow elements of "structure" modes to
12870 be tieable to the structure. This more general condition allows
12871 other rarer situations too. */
12872 if (TARGET_SIMD
12873 && aarch64_vector_mode_p (mode1)
12874 && aarch64_vector_mode_p (mode2))
12875 return true;
12876
12877 return false;
12878 }
12879
12880 /* Return a new RTX holding the result of moving POINTER forward by
12881 AMOUNT bytes. */
12882
12883 static rtx
12884 aarch64_move_pointer (rtx pointer, int amount)
12885 {
12886 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12887
12888 return adjust_automodify_address (pointer, GET_MODE (pointer),
12889 next, amount);
12890 }
12891
12892 /* Return a new RTX holding the result of moving POINTER forward by the
12893 size of the mode it points to. */
12894
12895 static rtx
12896 aarch64_progress_pointer (rtx pointer)
12897 {
12898 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12899
12900 return aarch64_move_pointer (pointer, amount);
12901 }
12902
12903 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12904 the size of MODE. */
12905
12906 static void
12907 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12908 machine_mode mode)
12909 {
12910 rtx reg = gen_reg_rtx (mode);
12911
12912 /* "Cast" the pointers to the correct mode. */
12913 *src = adjust_address (*src, mode, 0);
12914 *dst = adjust_address (*dst, mode, 0);
12915 /* Emit the memcpy. */
12916 emit_move_insn (reg, *src);
12917 emit_move_insn (*dst, reg);
12918 /* Move the pointers forward. */
12919 *src = aarch64_progress_pointer (*src);
12920 *dst = aarch64_progress_pointer (*dst);
12921 }
12922
12923 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12924 we succeed, otherwise return false. */
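/* Illustrative example (not part of the original source): a constant copy of
   22 bytes is expanded as one 16-byte (TImode) block copy followed by a
   single 8-byte (DImode) copy of bytes 14..21, i.e. the final chunk
   deliberately overlaps the previous one by two bytes rather than being
   split into smaller copies.  */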
12925
12926 bool
12927 aarch64_expand_movmem (rtx *operands)
12928 {
12929 unsigned int n;
12930 rtx dst = operands[0];
12931 rtx src = operands[1];
12932 rtx base;
12933 bool speed_p = !optimize_function_for_size_p (cfun);
12934
12935 /* When optimizing for size, give a better estimate of the length of a
12936 memcpy call, but use the default otherwise. */
12937 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12938
12939 /* We can't do anything smart if the amount to copy is not constant. */
12940 if (!CONST_INT_P (operands[2]))
12941 return false;
12942
12943 n = UINTVAL (operands[2]);
12944
12945 /* Try to keep the number of instructions low. For cases below 16 bytes we
12946 need to make at most two moves. For cases above 16 bytes it will be one
12947 move for each 16 byte chunk, then at most two additional moves. */
12948 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12949 return false;
12950
12951 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12952 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12953
12954 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12955 src = adjust_automodify_address (src, VOIDmode, base, 0);
12956
12957 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12958 1-byte chunk. */
12959 if (n < 4)
12960 {
12961 if (n >= 2)
12962 {
12963 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12964 n -= 2;
12965 }
12966
12967 if (n == 1)
12968 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12969
12970 return true;
12971 }
12972
12973 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
12974 4-byte chunk, partially overlapping with the previously copied chunk. */
12975 if (n < 8)
12976 {
12977 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12978 n -= 4;
12979 if (n > 0)
12980 {
12981 int move = n - 4;
12982
12983 src = aarch64_move_pointer (src, move);
12984 dst = aarch64_move_pointer (dst, move);
12985 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12986 }
12987 return true;
12988 }
12989
12990 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
12991 them, then (if applicable) an 8-byte chunk. */
12992 while (n >= 8)
12993 {
12994 if (n / 16)
12995 {
12996 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12997 n -= 16;
12998 }
12999 else
13000 {
13001 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13002 n -= 8;
13003 }
13004 }
13005
13006 /* Finish the final bytes of the copy. We can always do this in one
13007 instruction. We either copy the exact amount we need, or partially
13008 overlap with the previous chunk we copied and copy 8-bytes. */
13009 if (n == 0)
13010 return true;
13011 else if (n == 1)
13012 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13013 else if (n == 2)
13014 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13015 else if (n == 4)
13016 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13017 else
13018 {
13019 if (n == 3)
13020 {
13021 src = aarch64_move_pointer (src, -1);
13022 dst = aarch64_move_pointer (dst, -1);
13023 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13024 }
13025 else
13026 {
13027 int move = n - 8;
13028
13029 src = aarch64_move_pointer (src, move);
13030 dst = aarch64_move_pointer (dst, move);
13031 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13032 }
13033 }
13034
13035 return true;
13036 }
13037
13038 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
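/* Illustrative note (not part of the original source): with the usual
   AddressSanitizer 8-to-1 shadow mapping this corresponds to
   shadow_address = (address >> 3) + (1 << 36).  */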
13039
13040 static unsigned HOST_WIDE_INT
13041 aarch64_asan_shadow_offset (void)
13042 {
13043 return (HOST_WIDE_INT_1 << 36);
13044 }
13045
13046 static bool
13047 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13048 unsigned int align,
13049 enum by_pieces_operation op,
13050 bool speed_p)
13051 {
13052 /* STORE_BY_PIECES can be used when copying a constant string, but
13053 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13054 For now we always fail this and let the move_by_pieces code copy
13055 the string from read-only memory. */
13056 if (op == STORE_BY_PIECES)
13057 return false;
13058
13059 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13060 }
13061
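/* Illustrative sketch (not part of the original source): the two hooks below
   let the middle-end combine a condition such as (a == b && c < d) into,
   roughly,

       cmp   w_a, w_b
       ccmp  w_c, w_d, #<nzcv>, eq     // second compare performed only when
                                       // the first condition held; otherwise
                                       // the flags are set to <nzcv> so the
                                       // final test fails
       b.lt  ...                       // or a cset/csel of the final result

   where <nzcv> is the flag immediate chosen for the ccmp pattern.  */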
13062 static rtx
13063 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13064 int code, tree treeop0, tree treeop1)
13065 {
13066 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13067 rtx op0, op1;
13068 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13069 insn_code icode;
13070 struct expand_operand ops[4];
13071
13072 start_sequence ();
13073 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13074
13075 op_mode = GET_MODE (op0);
13076 if (op_mode == VOIDmode)
13077 op_mode = GET_MODE (op1);
13078
13079 switch (op_mode)
13080 {
13081 case QImode:
13082 case HImode:
13083 case SImode:
13084 cmp_mode = SImode;
13085 icode = CODE_FOR_cmpsi;
13086 break;
13087
13088 case DImode:
13089 cmp_mode = DImode;
13090 icode = CODE_FOR_cmpdi;
13091 break;
13092
13093 case SFmode:
13094 cmp_mode = SFmode;
13095 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13096 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13097 break;
13098
13099 case DFmode:
13100 cmp_mode = DFmode;
13101 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13102 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13103 break;
13104
13105 default:
13106 end_sequence ();
13107 return NULL_RTX;
13108 }
13109
13110 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13111 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13112 if (!op0 || !op1)
13113 {
13114 end_sequence ();
13115 return NULL_RTX;
13116 }
13117 *prep_seq = get_insns ();
13118 end_sequence ();
13119
13120 create_fixed_operand (&ops[0], op0);
13121 create_fixed_operand (&ops[1], op1);
13122
13123 start_sequence ();
13124 if (!maybe_expand_insn (icode, 2, ops))
13125 {
13126 end_sequence ();
13127 return NULL_RTX;
13128 }
13129 *gen_seq = get_insns ();
13130 end_sequence ();
13131
13132 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13133 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13134 }
13135
13136 static rtx
13137 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13138 tree treeop0, tree treeop1, int bit_code)
13139 {
13140 rtx op0, op1, target;
13141 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13142 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13143 insn_code icode;
13144 struct expand_operand ops[6];
13145 int aarch64_cond;
13146
13147 push_to_sequence ((rtx_insn*) *prep_seq);
13148 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13149
13150 op_mode = GET_MODE (op0);
13151 if (op_mode == VOIDmode)
13152 op_mode = GET_MODE (op1);
13153
13154 switch (op_mode)
13155 {
13156 case QImode:
13157 case HImode:
13158 case SImode:
13159 cmp_mode = SImode;
13160 icode = CODE_FOR_ccmpsi;
13161 break;
13162
13163 case DImode:
13164 cmp_mode = DImode;
13165 icode = CODE_FOR_ccmpdi;
13166 break;
13167
13168 case SFmode:
13169 cmp_mode = SFmode;
13170 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13171 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13172 break;
13173
13174 case DFmode:
13175 cmp_mode = DFmode;
13176 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13177 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13178 break;
13179
13180 default:
13181 end_sequence ();
13182 return NULL_RTX;
13183 }
13184
13185 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13186 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13187 if (!op0 || !op1)
13188 {
13189 end_sequence ();
13190 return NULL_RTX;
13191 }
13192 *prep_seq = get_insns ();
13193 end_sequence ();
13194
13195 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13196 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13197
13198 if (bit_code != AND)
13199 {
13200 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13201 GET_MODE (XEXP (prev, 0))),
13202 VOIDmode, XEXP (prev, 0), const0_rtx);
13203 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13204 }
13205
13206 create_fixed_operand (&ops[0], XEXP (prev, 0));
13207 create_fixed_operand (&ops[1], target);
13208 create_fixed_operand (&ops[2], op0);
13209 create_fixed_operand (&ops[3], op1);
13210 create_fixed_operand (&ops[4], prev);
13211 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13212
13213 push_to_sequence ((rtx_insn*) *gen_seq);
13214 if (!maybe_expand_insn (icode, 6, ops))
13215 {
13216 end_sequence ();
13217 return NULL_RTX;
13218 }
13219
13220 *gen_seq = get_insns ();
13221 end_sequence ();
13222
13223 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13224 }
13225
13226 #undef TARGET_GEN_CCMP_FIRST
13227 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13228
13229 #undef TARGET_GEN_CCMP_NEXT
13230 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13231
13232 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13233 instruction fusion of some sort. */
13234
13235 static bool
13236 aarch64_macro_fusion_p (void)
13237 {
13238 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13239 }
13240
13241
13242 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13243 should be kept together during scheduling. */
13244
13245 static bool
13246 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13247 {
13248 rtx set_dest;
13249 rtx prev_set = single_set (prev);
13250 rtx curr_set = single_set (curr);
13251 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13252 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13253
13254 if (!aarch64_macro_fusion_p ())
13255 return false;
13256
13257 if (simple_sets_p
13258 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13259 {
13260 /* We are trying to match:
13261 prev (mov) == (set (reg r0) (const_int imm16))
13262 curr (movk) == (set (zero_extract (reg r0)
13263 (const_int 16)
13264 (const_int 16))
13265 (const_int imm16_1)) */
13266
13267 set_dest = SET_DEST (curr_set);
13268
13269 if (GET_CODE (set_dest) == ZERO_EXTRACT
13270 && CONST_INT_P (SET_SRC (curr_set))
13271 && CONST_INT_P (SET_SRC (prev_set))
13272 && CONST_INT_P (XEXP (set_dest, 2))
13273 && INTVAL (XEXP (set_dest, 2)) == 16
13274 && REG_P (XEXP (set_dest, 0))
13275 && REG_P (SET_DEST (prev_set))
13276 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13277 {
13278 return true;
13279 }
13280 }
13281
13282 if (simple_sets_p
13283 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13284 {
13285
13286 /* We're trying to match:
13287 prev (adrp) == (set (reg r1)
13288 (high (symbol_ref ("SYM"))))
13289 curr (add) == (set (reg r0)
13290 (lo_sum (reg r1)
13291 (symbol_ref ("SYM"))))
13292 Note that r0 need not necessarily be the same as r1, especially
13293 during pre-regalloc scheduling. */
13294
13295 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13296 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13297 {
13298 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13299 && REG_P (XEXP (SET_SRC (curr_set), 0))
13300 && REGNO (XEXP (SET_SRC (curr_set), 0))
13301 == REGNO (SET_DEST (prev_set))
13302 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13303 XEXP (SET_SRC (curr_set), 1)))
13304 return true;
13305 }
13306 }
13307
13308 if (simple_sets_p
13309 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13310 {
13311
13312 /* We're trying to match:
13313 prev (movk) == (set (zero_extract (reg r0)
13314 (const_int 16)
13315 (const_int 32))
13316 (const_int imm16_1))
13317 curr (movk) == (set (zero_extract (reg r0)
13318 (const_int 16)
13319 (const_int 48))
13320 (const_int imm16_2)) */
13321
13322 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13323 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13324 && REG_P (XEXP (SET_DEST (prev_set), 0))
13325 && REG_P (XEXP (SET_DEST (curr_set), 0))
13326 && REGNO (XEXP (SET_DEST (prev_set), 0))
13327 == REGNO (XEXP (SET_DEST (curr_set), 0))
13328 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13329 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13330 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13331 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13332 && CONST_INT_P (SET_SRC (prev_set))
13333 && CONST_INT_P (SET_SRC (curr_set)))
13334 	return true;
13335     }
13336
13337   if (simple_sets_p
13338 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13339 {
13340 /* We're trying to match:
13341 prev (adrp) == (set (reg r0)
13342 (high (symbol_ref ("SYM"))))
13343 curr (ldr) == (set (reg r1)
13344 (mem (lo_sum (reg r0)
13345 (symbol_ref ("SYM")))))
13346 or
13347 curr (ldr) == (set (reg r1)
13348 (zero_extend (mem
13349 (lo_sum (reg r0)
13350 (symbol_ref ("SYM")))))) */
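      /* As assembly, e.g. (register numbers illustrative):
	   adrp	x0, SYM
	   ldr	x1, [x0, :lo12:SYM]  */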
13351 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13352 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13353 {
13354 rtx curr_src = SET_SRC (curr_set);
13355
13356 if (GET_CODE (curr_src) == ZERO_EXTEND)
13357 curr_src = XEXP (curr_src, 0);
13358
13359 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13360 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13361 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13362 == REGNO (SET_DEST (prev_set))
13363 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13364 XEXP (SET_SRC (prev_set), 0)))
13365 return true;
13366 }
13367 }
13368
13369 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13370 && aarch_crypto_can_dual_issue (prev, curr))
13371 return true;
13372
13373 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13374 && any_condjump_p (curr))
13375 {
13376 enum attr_type prev_type = get_attr_type (prev);
13377
13378       /* FIXME: this misses some cases that are considered simple arithmetic
13379 	 instructions for ThunderX.  Simple shifts are missed here.  */
13380 if (prev_type == TYPE_ALUS_SREG
13381 || prev_type == TYPE_ALUS_IMM
13382 || prev_type == TYPE_LOGICS_REG
13383 || prev_type == TYPE_LOGICS_IMM)
13384 return true;
13385 }
13386
13387 return false;
13388 }
13389
13390 /* If MEM is in the form of [base+offset], extract the two parts
13391    of the address into BASE and OFFSET, otherwise return false
13392    after clearing BASE and OFFSET.  */
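/* For example, (mem (plus (reg X1) (const_int 16))) yields BASE == (reg X1)
   and OFFSET == (const_int 16), while a plain (mem (reg X1)) yields
   OFFSET == const0_rtx.  */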
13393
13394 bool
13395 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13396 {
13397 rtx addr;
13398
13399 gcc_assert (MEM_P (mem));
13400
13401 addr = XEXP (mem, 0);
13402
13403 if (REG_P (addr))
13404 {
13405 *base = addr;
13406 *offset = const0_rtx;
13407 return true;
13408 }
13409
13410 if (GET_CODE (addr) == PLUS
13411 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13412 {
13413 *base = XEXP (addr, 0);
13414 *offset = XEXP (addr, 1);
13415 return true;
13416 }
13417
13418 *base = NULL_RTX;
13419 *offset = NULL_RTX;
13420
13421 return false;
13422 }
13423
13424 /* Types for scheduling fusion. */
13425 enum sched_fusion_type
13426 {
13427 SCHED_FUSION_NONE = 0,
13428 SCHED_FUSION_LD_SIGN_EXTEND,
13429 SCHED_FUSION_LD_ZERO_EXTEND,
13430 SCHED_FUSION_LD,
13431 SCHED_FUSION_ST,
13432 SCHED_FUSION_NUM
13433 };
13434
13435 /* If INSN is a load or store to an address of the form [base+offset],
13436    extract the two parts into BASE and OFFSET.  Return the scheduling
13437    fusion type of this INSN.  */
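/* For instance, a pattern like
     (set (reg:SI W0) (mem:SI (plus (reg X1) (const_int 8))))
   would typically be classified as SCHED_FUSION_LD with BASE == (reg X1)
   and OFFSET == (const_int 8); this is only an illustration of the cases
   handled below.  */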
13438
13439 static enum sched_fusion_type
13440 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13441 {
13442 rtx x, dest, src;
13443 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13444
13445 gcc_assert (INSN_P (insn));
13446 x = PATTERN (insn);
13447 if (GET_CODE (x) != SET)
13448 return SCHED_FUSION_NONE;
13449
13450 src = SET_SRC (x);
13451 dest = SET_DEST (x);
13452
13453 machine_mode dest_mode = GET_MODE (dest);
13454
13455 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13456 return SCHED_FUSION_NONE;
13457
13458 if (GET_CODE (src) == SIGN_EXTEND)
13459 {
13460 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13461 src = XEXP (src, 0);
13462 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13463 return SCHED_FUSION_NONE;
13464 }
13465 else if (GET_CODE (src) == ZERO_EXTEND)
13466 {
13467 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13468 src = XEXP (src, 0);
13469 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13470 return SCHED_FUSION_NONE;
13471 }
13472
13473 if (GET_CODE (src) == MEM && REG_P (dest))
13474 extract_base_offset_in_addr (src, base, offset);
13475 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13476 {
13477 fusion = SCHED_FUSION_ST;
13478 extract_base_offset_in_addr (dest, base, offset);
13479 }
13480 else
13481 return SCHED_FUSION_NONE;
13482
13483 if (*base == NULL_RTX || *offset == NULL_RTX)
13484 fusion = SCHED_FUSION_NONE;
13485
13486 return fusion;
13487 }
13488
13489 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13490
13491    Currently we only support fusing ldr and str instructions, so FUSION_PRI
13492    and PRI are only calculated for these instructions.  For other instructions,
13493    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13494    types of instruction fusion can be added by returning different priorities.
13495
13496 It's important that irrelevant instructions get the largest FUSION_PRI. */
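/* For example, two loads of the same fusion type through the same base
   register are given equal FUSION_PRI values so that they can be fused,
   while their PRI values differ with the memory offsets so that the access
   at the lower offset tends to be scheduled first (a sketch of the intent,
   not a guarantee).  */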
13497
13498 static void
13499 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13500 int *fusion_pri, int *pri)
13501 {
13502 int tmp, off_val;
13503 rtx base, offset;
13504 enum sched_fusion_type fusion;
13505
13506 gcc_assert (INSN_P (insn));
13507
13508 tmp = max_pri - 1;
13509 fusion = fusion_load_store (insn, &base, &offset);
13510 if (fusion == SCHED_FUSION_NONE)
13511 {
13512 *pri = tmp;
13513 *fusion_pri = tmp;
13514 return;
13515 }
13516
13517 /* Set FUSION_PRI according to fusion type and base register. */
13518 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13519
13520 /* Calculate PRI. */
13521 tmp /= 2;
13522
13523   /* The INSN with the smaller offset goes first.  */
13524 off_val = (int)(INTVAL (offset));
13525 if (off_val >= 0)
13526 tmp -= (off_val & 0xfffff);
13527 else
13528 tmp += ((- off_val) & 0xfffff);
13529
13530 *pri = tmp;
13531 return;
13532 }
13533
13534 /* Given OPERANDS of consecutive load/store, check if we can merge
13535 them into ldp/stp. LOAD is true if they are load instructions.
13536 MODE is the mode of memory operands. */
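/* For example (illustrative only), with MODE == SImode the consecutive loads

     ldr	w0, [x2]
     ldr	w1, [x2, #4]

   pass these checks and can later be emitted by the ldp/stp peepholes as

     ldp	w0, w1, [x2]  */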
13537
13538 bool
13539 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13540 enum machine_mode mode)
13541 {
13542 HOST_WIDE_INT offval_1, offval_2, msize;
13543 enum reg_class rclass_1, rclass_2;
13544 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13545
13546 if (load)
13547 {
13548 mem_1 = operands[1];
13549 mem_2 = operands[3];
13550 reg_1 = operands[0];
13551 reg_2 = operands[2];
13552 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13553 if (REGNO (reg_1) == REGNO (reg_2))
13554 return false;
13555 }
13556 else
13557 {
13558 mem_1 = operands[0];
13559 mem_2 = operands[2];
13560 reg_1 = operands[1];
13561 reg_2 = operands[3];
13562 }
13563
13564 /* The mems cannot be volatile. */
13565 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13566 return false;
13567
13568 /* Check if the addresses are in the form of [base+offset]. */
13569 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13570 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13571 return false;
13572 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13573 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13574 return false;
13575
13576   /* Check if the bases are the same.  */
13577 if (!rtx_equal_p (base_1, base_2))
13578 return false;
13579
13580 offval_1 = INTVAL (offset_1);
13581 offval_2 = INTVAL (offset_2);
13582 msize = GET_MODE_SIZE (mode);
13583 /* Check if the offsets are consecutive. */
13584 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13585 return false;
13586
13587 /* Check if the addresses are clobbered by load. */
13588 if (load)
13589 {
13590 if (reg_mentioned_p (reg_1, mem_1))
13591 return false;
13592
13593 /* In increasing order, the last load can clobber the address. */
13594 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13595 return false;
13596 }
13597
13598 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13599 rclass_1 = FP_REGS;
13600 else
13601 rclass_1 = GENERAL_REGS;
13602
13603 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13604 rclass_2 = FP_REGS;
13605 else
13606 rclass_2 = GENERAL_REGS;
13607
13608   /* Check if the registers are of the same class.  */
13609 if (rclass_1 != rclass_2)
13610 return false;
13611
13612 return true;
13613 }
13614
13615 /* Given OPERANDS of consecutive load/store, check if we can merge
13616 them into ldp/stp by adjusting the offset. LOAD is true if they
13617 are load instructions. MODE is the mode of memory operands.
13618
13619    Given the consecutive stores below:
13620
13621 str w1, [xb, 0x100]
13622 str w1, [xb, 0x104]
13623 str w1, [xb, 0x108]
13624 str w1, [xb, 0x10c]
13625
13626 Though the offsets are out of the range supported by stp, we can
13627 still pair them after adjusting the offset, like:
13628
13629 add scratch, xb, 0x100
13630 stp w1, w1, [scratch]
13631 stp w1, w1, [scratch, 0x8]
13632
13633 The peephole patterns detecting this opportunity should guarantee
13634    the scratch register is available.  */
13635
13636 bool
13637 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13638 enum machine_mode mode)
13639 {
13640 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13641 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13642 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13643 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13644
13645 if (load)
13646 {
13647 reg_1 = operands[0];
13648 mem_1 = operands[1];
13649 reg_2 = operands[2];
13650 mem_2 = operands[3];
13651 reg_3 = operands[4];
13652 mem_3 = operands[5];
13653 reg_4 = operands[6];
13654 mem_4 = operands[7];
13655 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13656 && REG_P (reg_3) && REG_P (reg_4));
13657 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13658 return false;
13659 }
13660 else
13661 {
13662 mem_1 = operands[0];
13663 reg_1 = operands[1];
13664 mem_2 = operands[2];
13665 reg_2 = operands[3];
13666 mem_3 = operands[4];
13667 reg_3 = operands[5];
13668 mem_4 = operands[6];
13669 reg_4 = operands[7];
13670 }
13671   /* Skip if the memory operand is by itself valid for ldp/stp.  */
13672 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13673 return false;
13674
13675 /* The mems cannot be volatile. */
13676 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13677       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13678 return false;
13679
13680 /* Check if the addresses are in the form of [base+offset]. */
13681 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13682 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13683 return false;
13684 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13685 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13686 return false;
13687 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13688 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13689 return false;
13690 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13691 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13692 return false;
13693
13694   /* Check if the bases are the same.  */
13695 if (!rtx_equal_p (base_1, base_2)
13696 || !rtx_equal_p (base_2, base_3)
13697 || !rtx_equal_p (base_3, base_4))
13698 return false;
13699
13700 offval_1 = INTVAL (offset_1);
13701 offval_2 = INTVAL (offset_2);
13702 offval_3 = INTVAL (offset_3);
13703 offval_4 = INTVAL (offset_4);
13704 msize = GET_MODE_SIZE (mode);
13705 /* Check if the offsets are consecutive. */
13706 if ((offval_1 != (offval_2 + msize)
13707 || offval_1 != (offval_3 + msize * 2)
13708 || offval_1 != (offval_4 + msize * 3))
13709 && (offval_4 != (offval_3 + msize)
13710 || offval_4 != (offval_2 + msize * 2)
13711 || offval_4 != (offval_1 + msize * 3)))
13712 return false;
13713
13714 /* Check if the addresses are clobbered by load. */
13715 if (load)
13716 {
13717 if (reg_mentioned_p (reg_1, mem_1)
13718 || reg_mentioned_p (reg_2, mem_2)
13719 || reg_mentioned_p (reg_3, mem_3))
13720 return false;
13721
13722 /* In increasing order, the last load can clobber the address. */
13723 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13724 return false;
13725 }
13726
13727 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13728 rclass_1 = FP_REGS;
13729 else
13730 rclass_1 = GENERAL_REGS;
13731
13732 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13733 rclass_2 = FP_REGS;
13734 else
13735 rclass_2 = GENERAL_REGS;
13736
13737 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13738 rclass_3 = FP_REGS;
13739 else
13740 rclass_3 = GENERAL_REGS;
13741
13742 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13743 rclass_4 = FP_REGS;
13744 else
13745 rclass_4 = GENERAL_REGS;
13746
13747   /* Check if the registers are of the same class.  */
13748 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13749 return false;
13750
13751 return true;
13752 }
13753
13754 /* Given OPERANDS of consecutive load/store, this function pairs them
13755    into ldp/stp after adjusting the offset.  It depends on the fact
13756    that the addresses of the load/store instructions are in increasing order.
13757    MODE is the mode of the memory operands.  CODE is the rtl operator
13758    which should be applied to all memory operands; it is SIGN_EXTEND,
13759    ZERO_EXTEND or UNKNOWN.  */
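/* For instance (illustrative numbers), with MODE == SImode four stores at
   offsets 0x104, 0x108, 0x10c and 0x110 from the base give msize == 4 and
   stp_off_limit == 0x100; adj_off becomes 0x100 and the new offsets become
   4, 8, 12 and 16, so after the scratch register is set to base + 0x100 the
   two stp instructions access [scratch, #4] and [scratch, #12].  */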
13760
13761 bool
13762 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13763 enum machine_mode mode, RTX_CODE code)
13764 {
13765 rtx base, offset, t1, t2;
13766 rtx mem_1, mem_2, mem_3, mem_4;
13767 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13768
13769 if (load)
13770 {
13771 mem_1 = operands[1];
13772 mem_2 = operands[3];
13773 mem_3 = operands[5];
13774 mem_4 = operands[7];
13775 }
13776 else
13777 {
13778 mem_1 = operands[0];
13779 mem_2 = operands[2];
13780 mem_3 = operands[4];
13781 mem_4 = operands[6];
13782 gcc_assert (code == UNKNOWN);
13783 }
13784
13785 extract_base_offset_in_addr (mem_1, &base, &offset);
13786 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13787
13788   /* Adjust the offset so that it fits in an ldp/stp instruction.  */
13789 msize = GET_MODE_SIZE (mode);
13790 stp_off_limit = msize * 0x40;
13791 off_val = INTVAL (offset);
13792 abs_off = (off_val < 0) ? -off_val : off_val;
13793 new_off = abs_off % stp_off_limit;
13794 adj_off = abs_off - new_off;
13795
13796 /* Further adjust to make sure all offsets are OK. */
13797 if ((new_off + msize * 2) >= stp_off_limit)
13798 {
13799 adj_off += stp_off_limit;
13800 new_off -= stp_off_limit;
13801 }
13802
13803 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13804 if (adj_off >= 0x1000)
13805 return false;
13806
13807 if (off_val < 0)
13808 {
13809 adj_off = -adj_off;
13810 new_off = -new_off;
13811 }
13812
13813 /* Create new memory references. */
13814 mem_1 = change_address (mem_1, VOIDmode,
13815 plus_constant (DImode, operands[8], new_off));
13816
13817 /* Check if the adjusted address is OK for ldp/stp. */
13818 if (!aarch64_mem_pair_operand (mem_1, mode))
13819 return false;
13820
13821 msize = GET_MODE_SIZE (mode);
13822 mem_2 = change_address (mem_2, VOIDmode,
13823 plus_constant (DImode,
13824 operands[8],
13825 new_off + msize));
13826 mem_3 = change_address (mem_3, VOIDmode,
13827 plus_constant (DImode,
13828 operands[8],
13829 new_off + msize * 2));
13830 mem_4 = change_address (mem_4, VOIDmode,
13831 plus_constant (DImode,
13832 operands[8],
13833 new_off + msize * 3));
13834
13835 if (code == ZERO_EXTEND)
13836 {
13837 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13838 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13839 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13840 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13841 }
13842 else if (code == SIGN_EXTEND)
13843 {
13844 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13845 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13846 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13847 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13848 }
13849
13850 if (load)
13851 {
13852 operands[1] = mem_1;
13853 operands[3] = mem_2;
13854 operands[5] = mem_3;
13855 operands[7] = mem_4;
13856 }
13857 else
13858 {
13859 operands[0] = mem_1;
13860 operands[2] = mem_2;
13861 operands[4] = mem_3;
13862 operands[6] = mem_4;
13863 }
13864
13865 /* Emit adjusting instruction. */
13866 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13867 /* Emit ldp/stp instructions. */
13868 t1 = gen_rtx_SET (operands[0], operands[1]);
13869 t2 = gen_rtx_SET (operands[2], operands[3]);
13870 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13871 t1 = gen_rtx_SET (operands[4], operands[5]);
13872 t2 = gen_rtx_SET (operands[6], operands[7]);
13873 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13874 return true;
13875 }
13876
13877 /* Return true if a pseudo register should be created and used to hold
13878    the GOT address for PIC code.  */
13879
13880 bool
13881 aarch64_use_pseudo_pic_reg (void)
13882 {
13883 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13884 }
13885
13886 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13887
13888 static int
13889 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13890 {
13891 switch (XINT (x, 1))
13892 {
13893 case UNSPEC_GOTSMALLPIC:
13894 case UNSPEC_GOTSMALLPIC28K:
13895 case UNSPEC_GOTTINYPIC:
13896 return 0;
13897 default:
13898 break;
13899 }
13900
13901 return default_unspec_may_trap_p (x, flags);
13902 }
13903
13904
13905 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13906 return the log2 of that value. Otherwise return -1. */
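/* For example, 8.0 gives 3 and 1.0 gives 0, while 3.0, 0.5 and -4.0 all
   give -1 (not a positive integral power of 2).  */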
13907
13908 int
13909 aarch64_fpconst_pow_of_2 (rtx x)
13910 {
13911 const REAL_VALUE_TYPE *r;
13912
13913 if (!CONST_DOUBLE_P (x))
13914 return -1;
13915
13916 r = CONST_DOUBLE_REAL_VALUE (x);
13917
13918 if (REAL_VALUE_NEGATIVE (*r)
13919 || REAL_VALUE_ISNAN (*r)
13920 || REAL_VALUE_ISINF (*r)
13921 || !real_isinteger (r, DFmode))
13922 return -1;
13923
13924 return exact_log2 (real_to_integer (r));
13925 }
13926
13927 /* If X is a vector of equal CONST_DOUBLE values and that value is
13928 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13929
13930 int
13931 aarch64_vec_fpconst_pow_of_2 (rtx x)
13932 {
13933 if (GET_CODE (x) != CONST_VECTOR)
13934 return -1;
13935
13936 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13937 return -1;
13938
13939 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13940 if (firstval <= 0)
13941 return -1;
13942
13943 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13944 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13945 return -1;
13946
13947 return firstval;
13948 }
13949
13950 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13951 static tree
13952 aarch64_promoted_type (const_tree t)
13953 {
13954 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13955 return float_type_node;
13956 return NULL_TREE;
13957 }
13958
13959 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13960
13961 static bool
13962 aarch64_optab_supported_p (int op, machine_mode, machine_mode,
13963 optimization_type opt_type)
13964 {
13965 switch (op)
13966 {
13967 case rsqrt_optab:
13968 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
13969
13970 default:
13971 return true;
13972 }
13973 }
13974
13975 #undef TARGET_ADDRESS_COST
13976 #define TARGET_ADDRESS_COST aarch64_address_cost
13977
13978 /* This hook determines whether unnamed bitfields affect the alignment
13979 of the containing structure. The hook returns true if the structure
13980 should inherit the alignment requirements of an unnamed bitfield's
13981 type. */
13982 #undef TARGET_ALIGN_ANON_BITFIELD
13983 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
13984
13985 #undef TARGET_ASM_ALIGNED_DI_OP
13986 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
13987
13988 #undef TARGET_ASM_ALIGNED_HI_OP
13989 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
13990
13991 #undef TARGET_ASM_ALIGNED_SI_OP
13992 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
13993
13994 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
13995 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
13996 hook_bool_const_tree_hwi_hwi_const_tree_true
13997
13998 #undef TARGET_ASM_FILE_START
13999 #define TARGET_ASM_FILE_START aarch64_start_file
14000
14001 #undef TARGET_ASM_OUTPUT_MI_THUNK
14002 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14003
14004 #undef TARGET_ASM_SELECT_RTX_SECTION
14005 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14006
14007 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14008 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14009
14010 #undef TARGET_BUILD_BUILTIN_VA_LIST
14011 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14012
14013 #undef TARGET_CALLEE_COPIES
14014 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14015
14016 #undef TARGET_CAN_ELIMINATE
14017 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14018
14019 #undef TARGET_CAN_INLINE_P
14020 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14021
14022 #undef TARGET_CANNOT_FORCE_CONST_MEM
14023 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14024
14025 #undef TARGET_CASE_VALUES_THRESHOLD
14026 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14027
14028 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14029 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14030
14031 /* Only the least significant bit is used for initialization guard
14032 variables. */
14033 #undef TARGET_CXX_GUARD_MASK_BIT
14034 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14035
14036 #undef TARGET_C_MODE_FOR_SUFFIX
14037 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14038
14039 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14040 #undef TARGET_DEFAULT_TARGET_FLAGS
14041 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14042 #endif
14043
14044 #undef TARGET_CLASS_MAX_NREGS
14045 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14046
14047 #undef TARGET_BUILTIN_DECL
14048 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14049
14050 #undef TARGET_BUILTIN_RECIPROCAL
14051 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14052
14053 #undef TARGET_EXPAND_BUILTIN
14054 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14055
14056 #undef TARGET_EXPAND_BUILTIN_VA_START
14057 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14058
14059 #undef TARGET_FOLD_BUILTIN
14060 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14061
14062 #undef TARGET_FUNCTION_ARG
14063 #define TARGET_FUNCTION_ARG aarch64_function_arg
14064
14065 #undef TARGET_FUNCTION_ARG_ADVANCE
14066 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14067
14068 #undef TARGET_FUNCTION_ARG_BOUNDARY
14069 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14070
14071 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14072 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14073
14074 #undef TARGET_FUNCTION_VALUE
14075 #define TARGET_FUNCTION_VALUE aarch64_function_value
14076
14077 #undef TARGET_FUNCTION_VALUE_REGNO_P
14078 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14079
14080 #undef TARGET_FRAME_POINTER_REQUIRED
14081 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14082
14083 #undef TARGET_GIMPLE_FOLD_BUILTIN
14084 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14085
14086 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14087 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14088
14089 #undef TARGET_INIT_BUILTINS
14090 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14091
14092 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14093 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14094 aarch64_ira_change_pseudo_allocno_class
14095
14096 #undef TARGET_LEGITIMATE_ADDRESS_P
14097 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14098
14099 #undef TARGET_LEGITIMATE_CONSTANT_P
14100 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14101
14102 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14103 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14104
14105 #undef TARGET_LRA_P
14106 #define TARGET_LRA_P hook_bool_void_true
14107
14108 #undef TARGET_MANGLE_TYPE
14109 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14110
14111 #undef TARGET_MEMORY_MOVE_COST
14112 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14113
14114 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14115 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14116
14117 #undef TARGET_MUST_PASS_IN_STACK
14118 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14119
14120 /* This target hook should return true if accesses to volatile bitfields
14121 should use the narrowest mode possible. It should return false if these
14122 accesses should use the bitfield container type. */
14123 #undef TARGET_NARROW_VOLATILE_BITFIELD
14124 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14125
14126 #undef TARGET_OPTION_OVERRIDE
14127 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14128
14129 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14130 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14131 aarch64_override_options_after_change
14132
14133 #undef TARGET_OPTION_SAVE
14134 #define TARGET_OPTION_SAVE aarch64_option_save
14135
14136 #undef TARGET_OPTION_RESTORE
14137 #define TARGET_OPTION_RESTORE aarch64_option_restore
14138
14139 #undef TARGET_OPTION_PRINT
14140 #define TARGET_OPTION_PRINT aarch64_option_print
14141
14142 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14143 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14144
14145 #undef TARGET_SET_CURRENT_FUNCTION
14146 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14147
14148 #undef TARGET_PASS_BY_REFERENCE
14149 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14150
14151 #undef TARGET_PREFERRED_RELOAD_CLASS
14152 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14153
14154 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14155 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14156
14157 #undef TARGET_PROMOTED_TYPE
14158 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14159
14160 #undef TARGET_SECONDARY_RELOAD
14161 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14162
14163 #undef TARGET_SHIFT_TRUNCATION_MASK
14164 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14165
14166 #undef TARGET_SETUP_INCOMING_VARARGS
14167 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14168
14169 #undef TARGET_STRUCT_VALUE_RTX
14170 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14171
14172 #undef TARGET_REGISTER_MOVE_COST
14173 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14174
14175 #undef TARGET_RETURN_IN_MEMORY
14176 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14177
14178 #undef TARGET_RETURN_IN_MSB
14179 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14180
14181 #undef TARGET_RTX_COSTS
14182 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14183
14184 #undef TARGET_SCHED_ISSUE_RATE
14185 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14186
14187 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14188 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14189 aarch64_sched_first_cycle_multipass_dfa_lookahead
14190
14191 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14192 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14193 aarch64_first_cycle_multipass_dfa_lookahead_guard
14194
14195 #undef TARGET_TRAMPOLINE_INIT
14196 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14197
14198 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14199 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14200
14201 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14202 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14203
14204 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14205 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14206
14207 #undef TARGET_VECTORIZE_ADD_STMT_COST
14208 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14209
14210 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14211 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14212 aarch64_builtin_vectorization_cost
14213
14214 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14215 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14216
14217 #undef TARGET_VECTORIZE_BUILTINS
14218 #define TARGET_VECTORIZE_BUILTINS
14219
14220 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14221 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14222 aarch64_builtin_vectorized_function
14223
14224 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14225 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14226 aarch64_autovectorize_vector_sizes
14227
14228 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14229 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14230 aarch64_atomic_assign_expand_fenv
14231
14232 /* Section anchor support. */
14233
14234 #undef TARGET_MIN_ANCHOR_OFFSET
14235 #define TARGET_MIN_ANCHOR_OFFSET -256
14236
14237 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14238 byte offset; we can do much more for larger data types, but have no way
14239 to determine the size of the access. We assume accesses are aligned. */
14240 #undef TARGET_MAX_ANCHOR_OFFSET
14241 #define TARGET_MAX_ANCHOR_OFFSET 4095
14242
14243 #undef TARGET_VECTOR_ALIGNMENT
14244 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14245
14246 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14247 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14248 aarch64_simd_vector_alignment_reachable
14249
14250 /* vec_perm support. */
14251
14252 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14253 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14254 aarch64_vectorize_vec_perm_const_ok
14255
14256 #undef TARGET_INIT_LIBFUNCS
14257 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14258
14259 #undef TARGET_FIXED_CONDITION_CODE_REGS
14260 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14261
14262 #undef TARGET_FLAGS_REGNUM
14263 #define TARGET_FLAGS_REGNUM CC_REGNUM
14264
14265 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14266 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14267
14268 #undef TARGET_ASAN_SHADOW_OFFSET
14269 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14270
14271 #undef TARGET_LEGITIMIZE_ADDRESS
14272 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14273
14274 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14275 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14276 aarch64_use_by_pieces_infrastructure_p
14277
14278 #undef TARGET_CAN_USE_DOLOOP_P
14279 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14280
14281 #undef TARGET_SCHED_MACRO_FUSION_P
14282 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14283
14284 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14285 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14286
14287 #undef TARGET_SCHED_FUSION_PRIORITY
14288 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14289
14290 #undef TARGET_UNSPEC_MAY_TRAP_P
14291 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14292
14293 #undef TARGET_USE_PSEUDO_PIC_REG
14294 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14295
14296 #undef TARGET_PRINT_OPERAND
14297 #define TARGET_PRINT_OPERAND aarch64_print_operand
14298
14299 #undef TARGET_PRINT_OPERAND_ADDRESS
14300 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14301
14302 #undef TARGET_OPTAB_SUPPORTED_P
14303 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14304
14305 struct gcc_target targetm = TARGET_INITIALIZER;
14306
14307 #include "gt-aarch64.h"