046195c8262fadf9aac3738426e7816d8b7a7827
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101	   ADDRESS_SYMBOLIC
102	     A constant symbolic address, in the pc-relative literal pool.  */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
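/* For illustration only (not derived from this file), typical assembly
   forms that fall under each classification above:

     ADDRESS_REG_IMM	ldr  x0, [x1, #16]
     ADDRESS_REG_WB	ldr  x0, [x1, #16]!  or  ldr x0, [x1], #16
     ADDRESS_REG_REG	ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW	ldr  x0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW	ldr  x0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM	ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC	ldr  x0, <literal-pool label>  (pc-relative).  */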
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
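/* A worked example of the encoding above (illustrative, not taken from
   the callers): a vector constant that duplicates 0x00ab0000 into every
   32-bit lane is described as elt_mode = SImode, value = 0xab,
   insn = MOV, modifier = LSL, shift = 16, matching
   "movi v0.4s, #0xab, lsl #16".  An SVE series constant such as
   { 1, 3, 5, ... } instead uses the (value, step) constructor with
   value = 1 and step = 2.  */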
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
210 aarch64_addr_query_type);
211
212 /* Major revision number of the ARM Architecture implemented by the target. */
213 unsigned aarch64_architecture_version;
214
215 /* The processor for which instructions should be scheduled. */
216 enum aarch64_processor aarch64_tune = cortexa53;
217
218 /* Mask to specify which instruction scheduling options should be used. */
219 unsigned long aarch64_tune_flags = 0;
220
221 /* Global flag for PC relative loads. */
222 bool aarch64_pcrelative_literal_loads;
223
224 /* Global flag for whether frame pointer is enabled. */
225 bool aarch64_use_frame_pointer;
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 0, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
322 {
323 {
324 1, /* hi */
325 1, /* si */
326 1, /* di */
327 2, /* ti */
328 },
329 1, /* pre_modify */
330 1, /* post_modify */
331 3, /* register_offset */
332 4, /* register_sextend */
333 3, /* register_zextend */
334 2, /* imm_offset */
335 };
336
337 static const struct cpu_regmove_cost generic_regmove_cost =
338 {
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
345 };
346
347 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 {
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost. */
352 5, /* GP2FP */
353 5, /* FP2GP */
354 2 /* FP2FP */
355 };
356
357 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 {
359 1, /* GP2GP */
360 /* Avoid the use of slow int<->fp moves for spilling by setting
361 their cost higher than memmov_cost. */
362 5, /* GP2FP */
363 5, /* FP2GP */
364 2 /* FP2FP */
365 };
366
367 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 {
369 1, /* GP2GP */
370 /* Avoid the use of slow int<->fp moves for spilling by setting
371	     their cost higher than memmov_cost (the actual costs are 4 and 9).  */
372 9, /* GP2FP */
373 9, /* FP2GP */
374 1 /* FP2FP */
375 };
376
377 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 {
379 2, /* GP2GP */
380 2, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost. */
390 8, /* GP2FP */
391 8, /* FP2GP */
392 2 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 {
397 2, /* GP2GP */
398 /* Avoid the use of int<->fp moves for spilling. */
399 6, /* GP2FP */
400 6, /* FP2GP */
401 4 /* FP2FP */
402 };
403
404 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 {
406 1, /* GP2GP */
407 /* Avoid the use of int<->fp moves for spilling. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 4 /* FP2FP */
411 };
412
413 /* Generic costs for vector insn classes. */
414 static const struct cpu_vector_cost generic_vector_cost =
415 {
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 1, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 1, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 2, /* vec_permute_cost */
423 1, /* vec_to_scalar_cost */
424 1, /* scalar_to_vec_cost */
425 1, /* vec_align_load_cost */
426 1, /* vec_unalign_load_cost */
427 1, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 1 /* cond_not_taken_branch_cost */
431 };
432
433 /* ThunderX costs for vector insn classes. */
434 static const struct cpu_vector_cost thunderx_vector_cost =
435 {
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 3, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 4, /* vec_int_stmt_cost */
441 1, /* vec_fp_stmt_cost */
442 4, /* vec_permute_cost */
443 2, /* vec_to_scalar_cost */
444 2, /* scalar_to_vec_cost */
445 3, /* vec_align_load_cost */
446 5, /* vec_unalign_load_cost */
447 5, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 3, /* cond_taken_branch_cost */
450 3 /* cond_not_taken_branch_cost */
451 };
452
453	/* Costs for vector insn classes for Cortex-A57.  */
454 static const struct cpu_vector_cost cortexa57_vector_cost =
455 {
456 1, /* scalar_int_stmt_cost */
457 1, /* scalar_fp_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 2, /* vec_int_stmt_cost */
461 2, /* vec_fp_stmt_cost */
462 3, /* vec_permute_cost */
463 8, /* vec_to_scalar_cost */
464 8, /* scalar_to_vec_cost */
465 4, /* vec_align_load_cost */
466 4, /* vec_unalign_load_cost */
467 1, /* vec_unalign_store_cost */
468 1, /* vec_store_cost */
469 1, /* cond_taken_branch_cost */
470 1 /* cond_not_taken_branch_cost */
471 };
472
473 static const struct cpu_vector_cost exynosm1_vector_cost =
474 {
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 3, /* vec_int_stmt_cost */
480 3, /* vec_fp_stmt_cost */
481 3, /* vec_permute_cost */
482 3, /* vec_to_scalar_cost */
483 3, /* scalar_to_vec_cost */
484 5, /* vec_align_load_cost */
485 5, /* vec_unalign_load_cost */
486 1, /* vec_unalign_store_cost */
487 1, /* vec_store_cost */
488 1, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
490 };
491
492	/* Costs for vector insn classes for X-Gene 1.  */
493 static const struct cpu_vector_cost xgene1_vector_cost =
494 {
495 1, /* scalar_int_stmt_cost */
496 1, /* scalar_fp_stmt_cost */
497 5, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 2, /* vec_int_stmt_cost */
500 2, /* vec_fp_stmt_cost */
501 2, /* vec_permute_cost */
502 4, /* vec_to_scalar_cost */
503 4, /* scalar_to_vec_cost */
504 10, /* vec_align_load_cost */
505 10, /* vec_unalign_load_cost */
506 2, /* vec_unalign_store_cost */
507 2, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
510 };
511
512 /* Costs for vector insn classes for Vulcan. */
513 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 {
515 1, /* scalar_int_stmt_cost */
516 6, /* scalar_fp_stmt_cost */
517 4, /* scalar_load_cost */
518 1, /* scalar_store_cost */
519 5, /* vec_int_stmt_cost */
520 6, /* vec_fp_stmt_cost */
521 3, /* vec_permute_cost */
522 6, /* vec_to_scalar_cost */
523 5, /* scalar_to_vec_cost */
524 8, /* vec_align_load_cost */
525 8, /* vec_unalign_load_cost */
526 4, /* vec_unalign_store_cost */
527 4, /* vec_store_cost */
528 2, /* cond_taken_branch_cost */
529 1 /* cond_not_taken_branch_cost */
530 };
531
532 /* Generic costs for branch instructions. */
533 static const struct cpu_branch_cost generic_branch_cost =
534 {
535 1, /* Predictable. */
536 3 /* Unpredictable. */
537 };
538
539 /* Generic approximation modes. */
540 static const cpu_approx_modes generic_approx_modes =
541 {
542 AARCH64_APPROX_NONE, /* division */
543 AARCH64_APPROX_NONE, /* sqrt */
544 AARCH64_APPROX_NONE /* recip_sqrt */
545 };
546
547 /* Approximation modes for Exynos M1. */
548 static const cpu_approx_modes exynosm1_approx_modes =
549 {
550 AARCH64_APPROX_NONE, /* division */
551 AARCH64_APPROX_ALL, /* sqrt */
552 AARCH64_APPROX_ALL /* recip_sqrt */
553 };
554
555 /* Approximation modes for X-Gene 1. */
556 static const cpu_approx_modes xgene1_approx_modes =
557 {
558 AARCH64_APPROX_NONE, /* division */
559 AARCH64_APPROX_NONE, /* sqrt */
560 AARCH64_APPROX_ALL /* recip_sqrt */
561 };
562
563 /* Generic prefetch settings (which disable prefetch). */
564 static const cpu_prefetch_tune generic_prefetch_tune =
565 {
566 0, /* num_slots */
567 -1, /* l1_cache_size */
568 -1, /* l1_cache_line_size */
569 -1, /* l2_cache_size */
570 true, /* prefetch_dynamic_strides */
571 -1, /* minimum_stride */
572 -1 /* default_opt_level */
573 };
574
575 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 {
577 0, /* num_slots */
578 -1, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 -1, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 -1 /* default_opt_level */
584 };
585
586 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 {
588 4, /* num_slots */
589 32, /* l1_cache_size */
590 64, /* l1_cache_line_size */
591 512, /* l2_cache_size */
592 false, /* prefetch_dynamic_strides */
593 2048, /* minimum_stride */
594 3 /* default_opt_level */
595 };
596
597 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 {
599 8, /* num_slots */
600 32, /* l1_cache_size */
601 128, /* l1_cache_line_size */
602 16*1024, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 3 /* default_opt_level */
606 };
607
608 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 {
610 8, /* num_slots */
611 32, /* l1_cache_size */
612 128, /* l1_cache_line_size */
613 -1, /* l2_cache_size */
614 true, /* prefetch_dynamic_strides */
615 -1, /* minimum_stride */
616 -1 /* default_opt_level */
617 };
618
619 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 {
621 8, /* num_slots */
622 32, /* l1_cache_size */
623 64, /* l1_cache_line_size */
624 256, /* l2_cache_size */
625 true, /* prefetch_dynamic_strides */
626 -1, /* minimum_stride */
627 -1 /* default_opt_level */
628 };
629
630 static const struct tune_params generic_tunings =
631 {
632 &cortexa57_extra_costs,
633 &generic_addrcost_table,
634 &generic_regmove_cost,
635 &generic_vector_cost,
636 &generic_branch_cost,
637 &generic_approx_modes,
638 4, /* memmov_cost */
639 2, /* issue_rate */
640 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
641 "8", /* function_align. */
642 "4", /* jump_align. */
643 "8", /* loop_align. */
644 2, /* int_reassoc_width. */
645 4, /* fp_reassoc_width. */
646 1, /* vec_reassoc_width. */
647 2, /* min_div_recip_mul_sf. */
648 2, /* min_div_recip_mul_df. */
649 0, /* max_case_values. */
650 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
651 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
652 &generic_prefetch_tune
653 };
654
655 static const struct tune_params cortexa35_tunings =
656 {
657 &cortexa53_extra_costs,
658 &generic_addrcost_table,
659 &cortexa53_regmove_cost,
660 &generic_vector_cost,
661 &generic_branch_cost,
662 &generic_approx_modes,
663 4, /* memmov_cost */
664 1, /* issue_rate */
665 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
666 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
667 "16", /* function_align. */
668 "4", /* jump_align. */
669 "8", /* loop_align. */
670 2, /* int_reassoc_width. */
671 4, /* fp_reassoc_width. */
672 1, /* vec_reassoc_width. */
673 2, /* min_div_recip_mul_sf. */
674 2, /* min_div_recip_mul_df. */
675 0, /* max_case_values. */
676 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
677 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
678 &generic_prefetch_tune
679 };
680
681 static const struct tune_params cortexa53_tunings =
682 {
683 &cortexa53_extra_costs,
684 &generic_addrcost_table,
685 &cortexa53_regmove_cost,
686 &generic_vector_cost,
687 &generic_branch_cost,
688 &generic_approx_modes,
689 4, /* memmov_cost */
690 2, /* issue_rate */
691 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
692 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
693 "16", /* function_align. */
694 "4", /* jump_align. */
695 "8", /* loop_align. */
696 2, /* int_reassoc_width. */
697 4, /* fp_reassoc_width. */
698 1, /* vec_reassoc_width. */
699 2, /* min_div_recip_mul_sf. */
700 2, /* min_div_recip_mul_df. */
701 0, /* max_case_values. */
702 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
703 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
704 &generic_prefetch_tune
705 };
706
707 static const struct tune_params cortexa57_tunings =
708 {
709 &cortexa57_extra_costs,
710 &generic_addrcost_table,
711 &cortexa57_regmove_cost,
712 &cortexa57_vector_cost,
713 &generic_branch_cost,
714 &generic_approx_modes,
715 4, /* memmov_cost */
716 3, /* issue_rate */
717 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
718 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
719 "16", /* function_align. */
720 "4", /* jump_align. */
721 "8", /* loop_align. */
722 2, /* int_reassoc_width. */
723 4, /* fp_reassoc_width. */
724 1, /* vec_reassoc_width. */
725 2, /* min_div_recip_mul_sf. */
726 2, /* min_div_recip_mul_df. */
727 0, /* max_case_values. */
728 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
729 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
730 &generic_prefetch_tune
731 };
732
733 static const struct tune_params cortexa72_tunings =
734 {
735 &cortexa57_extra_costs,
736 &generic_addrcost_table,
737 &cortexa57_regmove_cost,
738 &cortexa57_vector_cost,
739 &generic_branch_cost,
740 &generic_approx_modes,
741 4, /* memmov_cost */
742 3, /* issue_rate */
743 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
744 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
745 "16", /* function_align. */
746 "4", /* jump_align. */
747 "8", /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
755 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
756 &generic_prefetch_tune
757 };
758
759 static const struct tune_params cortexa73_tunings =
760 {
761 &cortexa57_extra_costs,
762 &generic_addrcost_table,
763 &cortexa57_regmove_cost,
764 &cortexa57_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost. */
768 2, /* issue_rate. */
769 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
771 "16", /* function_align. */
772 "4", /* jump_align. */
773 "8", /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
781 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
782 &generic_prefetch_tune
783 };
784
785
786
787 static const struct tune_params exynosm1_tunings =
788 {
789 &exynosm1_extra_costs,
790 &exynosm1_addrcost_table,
791 &exynosm1_regmove_cost,
792 &exynosm1_vector_cost,
793 &generic_branch_cost,
794 &exynosm1_approx_modes,
795 4, /* memmov_cost */
796 3, /* issue_rate */
797 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
798 "4", /* function_align. */
799 "4", /* jump_align. */
800 "4", /* loop_align. */
801 2, /* int_reassoc_width. */
802 4, /* fp_reassoc_width. */
803 1, /* vec_reassoc_width. */
804 2, /* min_div_recip_mul_sf. */
805 2, /* min_div_recip_mul_df. */
806 48, /* max_case_values. */
807 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
808 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
809 &exynosm1_prefetch_tune
810 };
811
812 static const struct tune_params thunderxt88_tunings =
813 {
814 &thunderx_extra_costs,
815 &generic_addrcost_table,
816 &thunderx_regmove_cost,
817 &thunderx_vector_cost,
818 &generic_branch_cost,
819 &generic_approx_modes,
820 6, /* memmov_cost */
821 2, /* issue_rate */
822 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
823 "8", /* function_align. */
824 "8", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
834 &thunderxt88_prefetch_tune
835 };
836
837 static const struct tune_params thunderx_tunings =
838 {
839 &thunderx_extra_costs,
840 &generic_addrcost_table,
841 &thunderx_regmove_cost,
842 &thunderx_vector_cost,
843 &generic_branch_cost,
844 &generic_approx_modes,
845 6, /* memmov_cost */
846 2, /* issue_rate */
847 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
848 "8", /* function_align. */
849 "8", /* jump_align. */
850 "8", /* loop_align. */
851 2, /* int_reassoc_width. */
852 4, /* fp_reassoc_width. */
853 1, /* vec_reassoc_width. */
854 2, /* min_div_recip_mul_sf. */
855 2, /* min_div_recip_mul_df. */
856 0, /* max_case_values. */
857 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
858 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
859 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
860 &thunderx_prefetch_tune
861 };
862
863 static const struct tune_params xgene1_tunings =
864 {
865 &xgene1_extra_costs,
866 &xgene1_addrcost_table,
867 &xgene1_regmove_cost,
868 &xgene1_vector_cost,
869 &generic_branch_cost,
870 &xgene1_approx_modes,
871 6, /* memmov_cost */
872 4, /* issue_rate */
873 AARCH64_FUSE_NOTHING, /* fusible_ops */
874 "16", /* function_align. */
875 "8", /* jump_align. */
876 "16", /* loop_align. */
877 2, /* int_reassoc_width. */
878 4, /* fp_reassoc_width. */
879 1, /* vec_reassoc_width. */
880 2, /* min_div_recip_mul_sf. */
881 2, /* min_div_recip_mul_df. */
882 0, /* max_case_values. */
883 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
884 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
885 &generic_prefetch_tune
886 };
887
888 static const struct tune_params qdf24xx_tunings =
889 {
890 &qdf24xx_extra_costs,
891 &qdf24xx_addrcost_table,
892 &qdf24xx_regmove_cost,
893 &generic_vector_cost,
894 &generic_branch_cost,
895 &generic_approx_modes,
896 4, /* memmov_cost */
897 4, /* issue_rate */
898 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
899	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
900 "16", /* function_align. */
901 "8", /* jump_align. */
902 "16", /* loop_align. */
903 2, /* int_reassoc_width. */
904 4, /* fp_reassoc_width. */
905 1, /* vec_reassoc_width. */
906 2, /* min_div_recip_mul_sf. */
907 2, /* min_div_recip_mul_df. */
908 0, /* max_case_values. */
909 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
910 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
911 &qdf24xx_prefetch_tune
912 };
913
914 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
915 for now. */
916 static const struct tune_params saphira_tunings =
917 {
918 &generic_extra_costs,
919 &generic_addrcost_table,
920 &generic_regmove_cost,
921 &generic_vector_cost,
922 &generic_branch_cost,
923 &generic_approx_modes,
924 4, /* memmov_cost */
925 4, /* issue_rate */
926 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
927	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
928 "16", /* function_align. */
929 "8", /* jump_align. */
930 "16", /* loop_align. */
931 2, /* int_reassoc_width. */
932 4, /* fp_reassoc_width. */
933 1, /* vec_reassoc_width. */
934 2, /* min_div_recip_mul_sf. */
935 2, /* min_div_recip_mul_df. */
936 0, /* max_case_values. */
937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
939 &generic_prefetch_tune
940 };
941
942 static const struct tune_params thunderx2t99_tunings =
943 {
944 &thunderx2t99_extra_costs,
945 &thunderx2t99_addrcost_table,
946 &thunderx2t99_regmove_cost,
947 &thunderx2t99_vector_cost,
948 &generic_branch_cost,
949 &generic_approx_modes,
950 4, /* memmov_cost. */
951 4, /* issue_rate. */
952 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
953 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
954 "16", /* function_align. */
955 "8", /* jump_align. */
956 "16", /* loop_align. */
957 3, /* int_reassoc_width. */
958 2, /* fp_reassoc_width. */
959 2, /* vec_reassoc_width. */
960 2, /* min_div_recip_mul_sf. */
961 2, /* min_div_recip_mul_df. */
962 0, /* max_case_values. */
963 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
964 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
965 &thunderx2t99_prefetch_tune
966 };
967
968 /* Support for fine-grained override of the tuning structures. */
969 struct aarch64_tuning_override_function
970 {
971 const char* name;
972 void (*parse_override)(const char*, struct tune_params*);
973 };
974
975 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
976 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977
978 static const struct aarch64_tuning_override_function
979 aarch64_tuning_override_functions[] =
980 {
981 { "fuse", aarch64_parse_fuse_string },
982 { "tune", aarch64_parse_tune_string },
983 { NULL, NULL }
984 };
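/* These tables and parsers back the -moverride=<string> debugging
   option: the "fuse" and "tune" entries map the names defined in
   aarch64-fusion-pairs.def and aarch64-tuning-flags.def (plus the
   "none"/"all" shorthands above) onto the corresponding AARCH64_FUSE_*
   and AARCH64_EXTRA_TUNE_* bits in the active tune_params.  The exact
   override syntax is internal and may change between releases.  */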
985
986 /* A processor implementing AArch64. */
987 struct processor
988 {
989 const char *const name;
990 enum aarch64_processor ident;
991 enum aarch64_processor sched_core;
992 enum aarch64_arch arch;
993 unsigned architecture_version;
994 const unsigned long flags;
995 const struct tune_params *const tune;
996 };
997
998 /* Architectures implementing AArch64. */
999 static const struct processor all_architectures[] =
1000 {
1001 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1002 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1003 #include "aarch64-arches.def"
1004 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1005 };
1006
1007 /* Processor cores implementing AArch64. */
1008 static const struct processor all_cores[] =
1009 {
1010 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1011 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1012 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1013 FLAGS, &COSTS##_tunings},
1014 #include "aarch64-cores.def"
1015 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1016 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1017 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1018 };
1019
1020
1021 /* Target specification. These are populated by the -march, -mtune, -mcpu
1022 handling code or by target attributes. */
1023 static const struct processor *selected_arch;
1024 static const struct processor *selected_cpu;
1025 static const struct processor *selected_tune;
1026
1027 /* The current tuning set. */
1028 struct tune_params aarch64_tune_params = generic_tunings;
1029
1030 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031
1032 /* An ISA extension in the co-processor and main instruction set space. */
1033 struct aarch64_option_extension
1034 {
1035 const char *const name;
1036 const unsigned long flags_on;
1037 const unsigned long flags_off;
1038 };
1039
1040 typedef enum aarch64_cond_code
1041 {
1042 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1043 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1044 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 }
1046 aarch64_cc;
1047
1048 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049
1050 /* The condition codes of the processor, and the inverse function. */
1051 static const char * const aarch64_condition_codes[] =
1052 {
1053 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1054 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1055 };
1056
1057	/* Generate code to enable conditional branches in functions whose size exceeds the 1 MiB range of a conditional branch.  */
1058 const char *
1059 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1060 const char * branch_format)
1061 {
1062 rtx_code_label * tmp_label = gen_label_rtx ();
1063 char label_buf[256];
1064 char buffer[128];
1065 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1066 CODE_LABEL_NUMBER (tmp_label));
1067 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1068 rtx dest_label = operands[pos_label];
1069 operands[pos_label] = tmp_label;
1070
1071 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1072 output_asm_insn (buffer, operands);
1073
1074 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1075 operands[pos_label] = dest_label;
1076 output_asm_insn (buffer, operands);
1077 return "";
1078 }
1079
1080 void
1081 aarch64_err_no_fpadvsimd (machine_mode mode)
1082 {
1083 if (TARGET_GENERAL_REGS_ONLY)
1084 if (FLOAT_MODE_P (mode))
1085 error ("%qs is incompatible with the use of floating-point types",
1086 "-mgeneral-regs-only");
1087 else
1088 error ("%qs is incompatible with the use of vector types",
1089 "-mgeneral-regs-only");
1090 else
1091 if (FLOAT_MODE_P (mode))
1092 error ("%qs feature modifier is incompatible with the use of"
1093 " floating-point types", "+nofp");
1094 else
1095 error ("%qs feature modifier is incompatible with the use of"
1096 " vector types", "+nofp");
1097 }
1098
1099 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1100 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1101 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1102 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1103 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104	   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1105 cost results in bad allocations with many redundant int<->FP moves which
1106 are expensive on various cores.
1107 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1108 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1109 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1110 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1111 The result of this is that it is no longer inefficient to have a higher
1112 memory move cost than the register move cost.
1113 */
1114
1115 static reg_class_t
1116 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1117 reg_class_t best_class)
1118 {
1119 machine_mode mode;
1120
1121 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1122 || !reg_class_subset_p (FP_REGS, allocno_class))
1123 return allocno_class;
1124
1125 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1126 || !reg_class_subset_p (FP_REGS, best_class))
1127 return best_class;
1128
1129 mode = PSEUDO_REGNO_MODE (regno);
1130 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1131 }
1132
1133 static unsigned int
1134 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1135 {
1136 if (GET_MODE_UNIT_SIZE (mode) == 4)
1137 return aarch64_tune_params.min_div_recip_mul_sf;
1138 return aarch64_tune_params.min_div_recip_mul_df;
1139 }
1140
1141 /* Return the reassociation width of treeop OPC with mode MODE. */
1142 static int
1143 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1144 {
1145 if (VECTOR_MODE_P (mode))
1146 return aarch64_tune_params.vec_reassoc_width;
1147 if (INTEGRAL_MODE_P (mode))
1148 return aarch64_tune_params.int_reassoc_width;
1149 /* Avoid reassociating floating point addition so we emit more FMAs. */
1150 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1151 return aarch64_tune_params.fp_reassoc_width;
1152 return 1;
1153 }
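/* As an illustrative consequence of the generic tuning above
   (fp_reassoc_width == 4, int_reassoc_width == 2): a floating-point
   product chain a * b * c * d * e may be split into up to four
   independent sub-chains, whereas floating-point additions report a
   width of 1 so that chains like acc + x * y remain fusable into
   FMAs.  */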
1154
1155 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1156 unsigned
1157 aarch64_dbx_register_number (unsigned regno)
1158 {
1159 if (GP_REGNUM_P (regno))
1160 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1161 else if (regno == SP_REGNUM)
1162 return AARCH64_DWARF_SP;
1163 else if (FP_REGNUM_P (regno))
1164 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1165 else if (PR_REGNUM_P (regno))
1166 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1167 else if (regno == VG_REGNUM)
1168 return AARCH64_DWARF_VG;
1169
1170 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1171 equivalent DWARF register. */
1172 return DWARF_FRAME_REGISTERS;
1173 }
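/* Assuming the AARCH64_DWARF_* constants follow the AArch64 DWARF ABI
   as usual, this maps, for example, x0-x30 to 0-30, sp to 31,
   v0-v31 to 64-95, the SVE predicate registers p0-p15 to 48-63 and
   the VG pseudo register to 46.  */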
1174
1175 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1176 static bool
1177 aarch64_advsimd_struct_mode_p (machine_mode mode)
1178 {
1179 return (TARGET_SIMD
1180 && (mode == OImode || mode == CImode || mode == XImode));
1181 }
1182
1183 /* Return true if MODE is an SVE predicate mode. */
1184 static bool
1185 aarch64_sve_pred_mode_p (machine_mode mode)
1186 {
1187 return (TARGET_SVE
1188 && (mode == VNx16BImode
1189 || mode == VNx8BImode
1190 || mode == VNx4BImode
1191 || mode == VNx2BImode));
1192 }
1193
1194 /* Three mutually-exclusive flags describing a vector or predicate type. */
1195 const unsigned int VEC_ADVSIMD = 1;
1196 const unsigned int VEC_SVE_DATA = 2;
1197 const unsigned int VEC_SVE_PRED = 4;
1198 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1199 a structure of 2, 3 or 4 vectors. */
1200 const unsigned int VEC_STRUCT = 8;
1201 /* Useful combinations of the above. */
1202 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1203 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1204
1205 /* Return a set of flags describing the vector properties of mode MODE.
1206 Ignore modes that are not supported by the current target. */
1207 static unsigned int
1208 aarch64_classify_vector_mode (machine_mode mode)
1209 {
1210 if (aarch64_advsimd_struct_mode_p (mode))
1211 return VEC_ADVSIMD | VEC_STRUCT;
1212
1213 if (aarch64_sve_pred_mode_p (mode))
1214 return VEC_SVE_PRED;
1215
1216 scalar_mode inner = GET_MODE_INNER (mode);
1217 if (VECTOR_MODE_P (mode)
1218 && (inner == QImode
1219 || inner == HImode
1220 || inner == HFmode
1221 || inner == SImode
1222 || inner == SFmode
1223 || inner == DImode
1224 || inner == DFmode))
1225 {
1226 if (TARGET_SVE)
1227 {
1228 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1229 return VEC_SVE_DATA;
1230 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1232 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1233 return VEC_SVE_DATA | VEC_STRUCT;
1234 }
1235
1236 /* This includes V1DF but not V1DI (which doesn't exist). */
1237 if (TARGET_SIMD
1238 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1239 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1240 return VEC_ADVSIMD;
1241 }
1242
1243 return 0;
1244 }
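/* Some illustrative classifications under the rules above: V8QI is
   VEC_ADVSIMD; OImode (an Advanced SIMD 2-vector tuple) is
   VEC_ADVSIMD | VEC_STRUCT; VNx4SI is VEC_SVE_DATA; a 2-vector SVE
   tuple mode such as VNx8SI is VEC_SVE_DATA | VEC_STRUCT; VNx4BI is
   VEC_SVE_PRED; scalar modes such as DImode classify as 0.  */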
1245
1246 /* Return true if MODE is any of the data vector modes, including
1247 structure modes. */
1248 static bool
1249 aarch64_vector_data_mode_p (machine_mode mode)
1250 {
1251 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1252 }
1253
1254 /* Return true if MODE is an SVE data vector mode; either a single vector
1255 or a structure of vectors. */
1256 static bool
1257 aarch64_sve_data_mode_p (machine_mode mode)
1258 {
1259 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1260 }
1261
1262 /* Implement target hook TARGET_ARRAY_MODE. */
1263 static opt_machine_mode
1264 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1265 {
1266 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1267 && IN_RANGE (nelems, 2, 4))
1268 return mode_for_vector (GET_MODE_INNER (mode),
1269 GET_MODE_NUNITS (mode) * nelems);
1270
1271 return opt_machine_mode ();
1272 }
1273
1274 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1275 static bool
1276 aarch64_array_mode_supported_p (machine_mode mode,
1277 unsigned HOST_WIDE_INT nelems)
1278 {
1279 if (TARGET_SIMD
1280 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1281 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1282 && (nelems >= 2 && nelems <= 4))
1283 return true;
1284
1285 return false;
1286 }
1287
1288 /* Return the SVE predicate mode to use for elements that have
1289 ELEM_NBYTES bytes, if such a mode exists. */
1290
1291 opt_machine_mode
1292 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1293 {
1294 if (TARGET_SVE)
1295 {
1296 if (elem_nbytes == 1)
1297 return VNx16BImode;
1298 if (elem_nbytes == 2)
1299 return VNx8BImode;
1300 if (elem_nbytes == 4)
1301 return VNx4BImode;
1302 if (elem_nbytes == 8)
1303 return VNx2BImode;
1304 }
1305 return opt_machine_mode ();
1306 }
1307
1308 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1309
1310 static opt_machine_mode
1311 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1312 {
1313 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1314 {
1315 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1316 machine_mode pred_mode;
1317 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1318 return pred_mode;
1319 }
1320
1321 return default_get_mask_mode (nunits, nbytes);
1322 }
1323
1324 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1325 prefer to use the first arithmetic operand as the else value if
1326 the else value doesn't matter, since that exactly matches the SVE
1327 destructive merging form. For ternary operations we could either
1328 pick the first operand and use FMAD-like instructions or the last
1329 operand and use FMLA-like instructions; the latter seems more
1330 natural. */
1331
1332 static tree
1333 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1334 {
1335 return nops == 3 ? ops[2] : ops[0];
1336 }
1337
1338 /* Implement TARGET_HARD_REGNO_NREGS. */
1339
1340 static unsigned int
1341 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1342 {
1343 /* ??? Logically we should only need to provide a value when
1344 HARD_REGNO_MODE_OK says that the combination is valid,
1345 but at the moment we need to handle all modes. Just ignore
1346 any runtime parts for registers that can't store them. */
1347 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1348 switch (aarch64_regno_regclass (regno))
1349 {
1350 case FP_REGS:
1351 case FP_LO_REGS:
1352 if (aarch64_sve_data_mode_p (mode))
1353 return exact_div (GET_MODE_SIZE (mode),
1354 BYTES_PER_SVE_VECTOR).to_constant ();
1355 return CEIL (lowest_size, UNITS_PER_VREG);
1356 case PR_REGS:
1357 case PR_LO_REGS:
1358 case PR_HI_REGS:
1359 return 1;
1360 default:
1361 return CEIL (lowest_size, UNITS_PER_WORD);
1362 }
1363 gcc_unreachable ();
1364 }
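/* Worked examples of the above: a TImode value needs two X registers
   (CEIL (16, UNITS_PER_WORD)) but only one V register; an Advanced
   SIMD 3-vector tuple such as CImode occupies three consecutive V
   registers; and a single SVE data vector always reports one register,
   regardless of the runtime vector length.  */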
1365
1366 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1367
1368 static bool
1369 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1370 {
1371 if (GET_MODE_CLASS (mode) == MODE_CC)
1372 return regno == CC_REGNUM;
1373
1374 if (regno == VG_REGNUM)
1375 /* This must have the same size as _Unwind_Word. */
1376 return mode == DImode;
1377
1378 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1379 if (vec_flags & VEC_SVE_PRED)
1380 return PR_REGNUM_P (regno);
1381
1382 if (PR_REGNUM_P (regno))
1383 return 0;
1384
1385 if (regno == SP_REGNUM)
1386 /* The purpose of comparing with ptr_mode is to support the
1387 global register variable associated with the stack pointer
1388 register via the syntax of asm ("wsp") in ILP32. */
1389 return mode == Pmode || mode == ptr_mode;
1390
1391 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1392 return mode == Pmode;
1393
1394 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1395 return true;
1396
1397 if (FP_REGNUM_P (regno))
1398 {
1399 if (vec_flags & VEC_STRUCT)
1400 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1401 else
1402 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1403 }
1404
1405 return false;
1406 }
1407
1408 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1409 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1410 clobbers the top 64 bits when restoring the bottom 64 bits. */
1411
1412 static bool
1413 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1414 {
1415 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1416 }
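/* For example, a 128-bit value (say V4SFmode) that is live in one of
   the callee-saved registers v8-v15 across a call must still be saved
   and restored in full, because the callee is only required to
   preserve the low 64 bits of those registers.  */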
1417
1418 /* Implement REGMODE_NATURAL_SIZE. */
1419 poly_uint64
1420 aarch64_regmode_natural_size (machine_mode mode)
1421 {
1422 /* The natural size for SVE data modes is one SVE data vector,
1423 and similarly for predicates. We can't independently modify
1424 anything smaller than that. */
1425 /* ??? For now, only do this for variable-width SVE registers.
1426 Doing it for constant-sized registers breaks lower-subreg.c. */
1427 /* ??? And once that's fixed, we should probably have similar
1428 code for Advanced SIMD. */
1429 if (!aarch64_sve_vg.is_constant ())
1430 {
1431 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1432 if (vec_flags & VEC_SVE_PRED)
1433 return BYTES_PER_SVE_PRED;
1434 if (vec_flags & VEC_SVE_DATA)
1435 return BYTES_PER_SVE_VECTOR;
1436 }
1437 return UNITS_PER_WORD;
1438 }
1439
1440 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1441 machine_mode
1442 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1443 machine_mode mode)
1444 {
1445 /* The predicate mode determines which bits are significant and
1446 which are "don't care". Decreasing the number of lanes would
1447 lose data while increasing the number of lanes would make bits
1448 unnecessarily significant. */
1449 if (PR_REGNUM_P (regno))
1450 return mode;
1451 if (known_ge (GET_MODE_SIZE (mode), 4))
1452 return mode;
1453 else
1454 return SImode;
1455 }
1456
1457 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1458 that strcpy from constants will be faster. */
1459
1460 static HOST_WIDE_INT
1461 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1462 {
1463 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1464 return MAX (align, BITS_PER_WORD);
1465 return align;
1466 }
1467
1468 /* Return true if calls to DECL should be treated as
1469	   long-calls (i.e. called via a register).  */
1470 static bool
1471 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1472 {
1473 return false;
1474 }
1475
1476 /* Return true if calls to symbol-ref SYM should be treated as
1477	   long-calls (i.e. called via a register).  */
1478 bool
1479 aarch64_is_long_call_p (rtx sym)
1480 {
1481 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1482 }
1483
1484 /* Return true if calls to symbol-ref SYM should not go through
1485 plt stubs. */
1486
1487 bool
1488 aarch64_is_noplt_call_p (rtx sym)
1489 {
1490 const_tree decl = SYMBOL_REF_DECL (sym);
1491
1492 if (flag_pic
1493 && decl
1494 && (!flag_plt
1495 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1496 && !targetm.binds_local_p (decl))
1497 return true;
1498
1499 return false;
1500 }
1501
1502 /* Return true if the offsets to a zero/sign-extract operation
1503 represent an expression that matches an extend operation. The
1504	   operands represent the parameters from
1505
1506 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1507 bool
1508 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1509 rtx extract_imm)
1510 {
1511 HOST_WIDE_INT mult_val, extract_val;
1512
1513 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1514 return false;
1515
1516 mult_val = INTVAL (mult_imm);
1517 extract_val = INTVAL (extract_imm);
1518
1519 if (extract_val > 8
1520 && extract_val < GET_MODE_BITSIZE (mode)
1521 && exact_log2 (extract_val & ~7) > 0
1522 && (extract_val & 7) <= 4
1523 && mult_val == (1 << (extract_val & 7)))
1524 return true;
1525
1526 return false;
1527 }
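/* A worked example (illustrative only): extract_val == 34 with
   mult_val == 4 passes the checks above, since 34 & ~7 == 32 is a
   power of two, the low bits give a shift of 2, and 1 << 2 == 4.
   This corresponds to a 32-bit value extended and shifted left by 2,
   i.e. an extended-register operand such as "uxtw #2"/"sxtw #2".  */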
1528
1529 /* Emit an insn that's a simple single-set. Both the operands must be
1530 known to be valid. */
1531 inline static rtx_insn *
1532 emit_set_insn (rtx x, rtx y)
1533 {
1534 return emit_insn (gen_rtx_SET (x, y));
1535 }
1536
1537 /* X and Y are two things to compare using CODE. Emit the compare insn and
1538 return the rtx for register 0 in the proper mode. */
1539 rtx
1540 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1541 {
1542 machine_mode mode = SELECT_CC_MODE (code, x, y);
1543 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1544
1545 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1546 return cc_reg;
1547 }
1548
1549 /* Build the SYMBOL_REF for __tls_get_addr. */
1550
1551 static GTY(()) rtx tls_get_addr_libfunc;
1552
1553 rtx
1554 aarch64_tls_get_addr (void)
1555 {
1556 if (!tls_get_addr_libfunc)
1557 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1558 return tls_get_addr_libfunc;
1559 }
1560
1561 /* Return the TLS model to use for ADDR. */
1562
1563 static enum tls_model
1564 tls_symbolic_operand_type (rtx addr)
1565 {
1566 enum tls_model tls_kind = TLS_MODEL_NONE;
1567 if (GET_CODE (addr) == CONST)
1568 {
1569 poly_int64 addend;
1570 rtx sym = strip_offset (addr, &addend);
1571 if (GET_CODE (sym) == SYMBOL_REF)
1572 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1573 }
1574 else if (GET_CODE (addr) == SYMBOL_REF)
1575 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1576
1577 return tls_kind;
1578 }
1579
1580	/* We allow LO_SUMs in our legitimate addresses
1581	   so that combine can take care of combining addresses where
1582	   necessary, but for generation purposes we generate the address
1583	   as:
1584 RTL Absolute
1585 tmp = hi (symbol_ref); adrp x1, foo
1586 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1587 nop
1588
1589 PIC TLS
1590 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1591 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1592 bl __tls_get_addr
1593 nop
1594
1595 Load TLS symbol, depending on TLS mechanism and TLS access model.
1596
1597 Global Dynamic - Traditional TLS:
1598 adrp tmp, :tlsgd:imm
1599 add dest, tmp, #:tlsgd_lo12:imm
1600 bl __tls_get_addr
1601
1602 Global Dynamic - TLS Descriptors:
1603 adrp dest, :tlsdesc:imm
1604 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1605 add dest, dest, #:tlsdesc_lo12:imm
1606 blr tmp
1607 mrs tp, tpidr_el0
1608 add dest, dest, tp
1609
1610 Initial Exec:
1611 mrs tp, tpidr_el0
1612 adrp tmp, :gottprel:imm
1613 ldr dest, [tmp, #:gottprel_lo12:imm]
1614 add dest, dest, tp
1615
1616 Local Exec:
1617 mrs tp, tpidr_el0
1618 add t0, tp, #:tprel_hi12:imm, lsl #12
1619 add t0, t0, #:tprel_lo12_nc:imm
1620 */
1621
1622 static void
1623 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1624 enum aarch64_symbol_type type)
1625 {
1626 switch (type)
1627 {
1628 case SYMBOL_SMALL_ABSOLUTE:
1629 {
1630 /* In ILP32, the mode of dest can be either SImode or DImode. */
1631 rtx tmp_reg = dest;
1632 machine_mode mode = GET_MODE (dest);
1633
1634 gcc_assert (mode == Pmode || mode == ptr_mode);
1635
1636 if (can_create_pseudo_p ())
1637 tmp_reg = gen_reg_rtx (mode);
1638
1639 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1640 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1641 return;
1642 }
1643
1644 case SYMBOL_TINY_ABSOLUTE:
1645 emit_insn (gen_rtx_SET (dest, imm));
1646 return;
1647
1648 case SYMBOL_SMALL_GOT_28K:
1649 {
1650 machine_mode mode = GET_MODE (dest);
1651 rtx gp_rtx = pic_offset_table_rtx;
1652 rtx insn;
1653 rtx mem;
1654
1655	  /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1656	     here before RTL expansion.  Tree IVOPTS generates RTL patterns to
1657	     estimate rtx costs, in which case pic_offset_table_rtx is not yet
1658	     initialized.  In that case there is no need to generate the first adrp
1659	     instruction, as the final cost of a global variable access is
1660	     one instruction.  */
1661 if (gp_rtx != NULL)
1662 {
1663	      /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1664	         use the page base as the GOT base, the first page may be wasted;
1665	         in the worst case only 28K of GOT space remains).
1666
1667	         The generated instruction sequence for accessing a global variable
1668	         is:
1669
1670 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1671
1672	         Only one instruction is needed.  But we must initialize
1673	         pic_offset_table_rtx properly.  We generate an initialization insn for
1674	         every global access, and let CSE remove the redundant copies.
1675
1676	         The final instruction sequence will look like the following
1677	         for multiple global variable accesses.
1678
1679 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1680
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1683 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1684 ... */
1685
1686 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1687 crtl->uses_pic_offset_table = 1;
1688 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1689
1690 if (mode != GET_MODE (gp_rtx))
1691 gp_rtx = gen_lowpart (mode, gp_rtx);
1692
1693 }
1694
1695 if (mode == ptr_mode)
1696 {
1697 if (mode == DImode)
1698 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1699 else
1700 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1701
1702 mem = XVECEXP (SET_SRC (insn), 0, 0);
1703 }
1704 else
1705 {
1706 gcc_assert (mode == Pmode);
1707
1708 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1709 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1710 }
1711
1712	      /* The operand is expected to be a MEM.  Whenever the related insn
1713	         pattern changes, the code above which calculates MEM should be
1714	         updated.  */
1715 gcc_assert (GET_CODE (mem) == MEM);
1716 MEM_READONLY_P (mem) = 1;
1717 MEM_NOTRAP_P (mem) = 1;
1718 emit_insn (insn);
1719 return;
1720 }
1721
1722 case SYMBOL_SMALL_GOT_4G:
1723 {
1724 /* In ILP32, the mode of dest can be either SImode or DImode,
1725 while the got entry is always of SImode size. The mode of
1726 dest depends on how dest is used: if dest is assigned to a
1727 pointer (e.g. in the memory), it has SImode; it may have
1728	         DImode if dest is dereferenced to access the memory.
1729 This is why we have to handle three different ldr_got_small
1730 patterns here (two patterns for ILP32). */
1731
1732 rtx insn;
1733 rtx mem;
1734 rtx tmp_reg = dest;
1735 machine_mode mode = GET_MODE (dest);
1736
1737 if (can_create_pseudo_p ())
1738 tmp_reg = gen_reg_rtx (mode);
1739
1740 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1741 if (mode == ptr_mode)
1742 {
1743 if (mode == DImode)
1744 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1745 else
1746 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1747
1748 mem = XVECEXP (SET_SRC (insn), 0, 0);
1749 }
1750 else
1751 {
1752 gcc_assert (mode == Pmode);
1753
1754 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1755 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1756 }
1757
1758 gcc_assert (GET_CODE (mem) == MEM);
1759 MEM_READONLY_P (mem) = 1;
1760 MEM_NOTRAP_P (mem) = 1;
1761 emit_insn (insn);
1762 return;
1763 }
1764
1765 case SYMBOL_SMALL_TLSGD:
1766 {
1767 rtx_insn *insns;
1768 machine_mode mode = GET_MODE (dest);
1769 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1770
1771 start_sequence ();
1772 if (TARGET_ILP32)
1773 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1774 else
1775 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1776 insns = get_insns ();
1777 end_sequence ();
1778
1779 RTL_CONST_CALL_P (insns) = 1;
1780 emit_libcall_block (insns, dest, result, imm);
1781 return;
1782 }
1783
1784 case SYMBOL_SMALL_TLSDESC:
1785 {
1786 machine_mode mode = GET_MODE (dest);
1787 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1788 rtx tp;
1789
1790 gcc_assert (mode == Pmode || mode == ptr_mode);
1791
1792 /* In ILP32, the got entry is always of SImode size. Unlike
1793 small GOT, the dest is fixed at reg 0. */
1794 if (TARGET_ILP32)
1795 emit_insn (gen_tlsdesc_small_si (imm));
1796 else
1797 emit_insn (gen_tlsdesc_small_di (imm));
1798 tp = aarch64_load_tp (NULL);
1799
1800 if (mode != Pmode)
1801 tp = gen_lowpart (mode, tp);
1802
1803 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1804 if (REG_P (dest))
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1806 return;
1807 }
1808
1809 case SYMBOL_SMALL_TLSIE:
1810 {
1811 /* In ILP32, the mode of dest can be either SImode or DImode,
1812 while the got entry is always of SImode size. The mode of
1813 dest depends on how dest is used: if dest is assigned to a
1814 pointer (e.g. in the memory), it has SImode; it may have
1815	         DImode if dest is dereferenced to access the memory.
1816 This is why we have to handle three different tlsie_small
1817 patterns here (two patterns for ILP32). */
1818 machine_mode mode = GET_MODE (dest);
1819 rtx tmp_reg = gen_reg_rtx (mode);
1820 rtx tp = aarch64_load_tp (NULL);
1821
1822 if (mode == ptr_mode)
1823 {
1824 if (mode == DImode)
1825 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1826 else
1827 {
1828 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1829 tp = gen_lowpart (mode, tp);
1830 }
1831 }
1832 else
1833 {
1834 gcc_assert (mode == Pmode);
1835 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1836 }
1837
1838 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1839 if (REG_P (dest))
1840 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1841 return;
1842 }
1843
1844 case SYMBOL_TLSLE12:
1845 case SYMBOL_TLSLE24:
1846 case SYMBOL_TLSLE32:
1847 case SYMBOL_TLSLE48:
1848 {
1849 machine_mode mode = GET_MODE (dest);
1850 rtx tp = aarch64_load_tp (NULL);
1851
1852 if (mode != Pmode)
1853 tp = gen_lowpart (mode, tp);
1854
1855 switch (type)
1856 {
1857 case SYMBOL_TLSLE12:
1858 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1859 (dest, tp, imm));
1860 break;
1861 case SYMBOL_TLSLE24:
1862 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1863 (dest, tp, imm));
1864 break;
1865 case SYMBOL_TLSLE32:
1866 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1867 (dest, imm));
1868 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1869 (dest, dest, tp));
1870 break;
1871 case SYMBOL_TLSLE48:
1872 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1873 (dest, imm));
1874 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1875 (dest, dest, tp));
1876 break;
1877 default:
1878 gcc_unreachable ();
1879 }
1880
1881 if (REG_P (dest))
1882 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1883 return;
1884 }
1885
1886 case SYMBOL_TINY_GOT:
1887 emit_insn (gen_ldr_got_tiny (dest, imm));
1888 return;
1889
1890 case SYMBOL_TINY_TLSIE:
1891 {
1892 machine_mode mode = GET_MODE (dest);
1893 rtx tp = aarch64_load_tp (NULL);
1894
1895 if (mode == ptr_mode)
1896 {
1897 if (mode == DImode)
1898 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1899 else
1900 {
1901 tp = gen_lowpart (mode, tp);
1902 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1903 }
1904 }
1905 else
1906 {
1907 gcc_assert (mode == Pmode);
1908 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1909 }
1910
1911 if (REG_P (dest))
1912 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1913 return;
1914 }
1915
1916 default:
1917 gcc_unreachable ();
1918 }
1919 }
1920
1921 /* Emit a move from SRC to DEST. Assume that the move expanders can
1922 handle all moves if !can_create_pseudo_p (). The distinction is
1923 important because, unlike emit_move_insn, the move expanders know
1924 how to force Pmode objects into the constant pool even when the
1925 constant pool address is not itself legitimate. */
1926 static rtx
1927 aarch64_emit_move (rtx dest, rtx src)
1928 {
1929 return (can_create_pseudo_p ()
1930 ? emit_move_insn (dest, src)
1931 : emit_move_insn_1 (dest, src));
1932 }
1933
1934 /* Apply UNOPTAB to OP and store the result in DEST. */
1935
1936 static void
1937 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1938 {
1939 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1940 if (dest != tmp)
1941 emit_move_insn (dest, tmp);
1942 }
1943
1944 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1945
1946 static void
1947 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1948 {
1949 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1950 OPTAB_DIRECT);
1951 if (dest != tmp)
1952 emit_move_insn (dest, tmp);
1953 }
1954
1955 /* Split a 128-bit move operation into two 64-bit move operations,
1956 taking care to handle partial overlap of register to register
1957 copies. Special cases are needed when moving between GP regs and
1958 FP regs. SRC can be a register, constant or memory; DST a register
1959 or memory. If either operand is memory it must not have any side
1960 effects. */
1961 void
1962 aarch64_split_128bit_move (rtx dst, rtx src)
1963 {
1964 rtx dst_lo, dst_hi;
1965 rtx src_lo, src_hi;
1966
1967 machine_mode mode = GET_MODE (dst);
1968
1969 gcc_assert (mode == TImode || mode == TFmode);
1970 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1971 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1972
1973 if (REG_P (dst) && REG_P (src))
1974 {
1975 int src_regno = REGNO (src);
1976 int dst_regno = REGNO (dst);
1977
1978 /* Handle FP <-> GP regs. */
1979 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1980 {
1981 src_lo = gen_lowpart (word_mode, src);
1982 src_hi = gen_highpart (word_mode, src);
1983
1984 if (mode == TImode)
1985 {
1986 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1987 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1988 }
1989 else
1990 {
1991 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1992 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1993 }
1994 return;
1995 }
1996 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1997 {
1998 dst_lo = gen_lowpart (word_mode, dst);
1999 dst_hi = gen_highpart (word_mode, dst);
2000
2001 if (mode == TImode)
2002 {
2003 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
2004 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
2005 }
2006 else
2007 {
2008 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
2009 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
2010 }
2011 return;
2012 }
2013 }
2014
2015 dst_lo = gen_lowpart (word_mode, dst);
2016 dst_hi = gen_highpart (word_mode, dst);
2017 src_lo = gen_lowpart (word_mode, src);
2018 src_hi = gen_highpart_mode (word_mode, mode, src);
2019
2020 /* At most one pairing may overlap. */
2021 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2022 {
2023 aarch64_emit_move (dst_hi, src_hi);
2024 aarch64_emit_move (dst_lo, src_lo);
2025 }
2026 else
2027 {
2028 aarch64_emit_move (dst_lo, src_lo);
2029 aarch64_emit_move (dst_hi, src_hi);
2030 }
2031 }
2032
2033 bool
2034 aarch64_split_128bit_move_p (rtx dst, rtx src)
2035 {
2036 return (! REG_P (src)
2037 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2038 }
2039
2040 /* Split a complex SIMD combine. */
2041
2042 void
2043 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2044 {
2045 machine_mode src_mode = GET_MODE (src1);
2046 machine_mode dst_mode = GET_MODE (dst);
2047
2048 gcc_assert (VECTOR_MODE_P (dst_mode));
2049 gcc_assert (register_operand (dst, dst_mode)
2050 && register_operand (src1, src_mode)
2051 && register_operand (src2, src_mode));
2052
2053 rtx (*gen) (rtx, rtx, rtx);
2054
2055 switch (src_mode)
2056 {
2057 case E_V8QImode:
2058 gen = gen_aarch64_simd_combinev8qi;
2059 break;
2060 case E_V4HImode:
2061 gen = gen_aarch64_simd_combinev4hi;
2062 break;
2063 case E_V2SImode:
2064 gen = gen_aarch64_simd_combinev2si;
2065 break;
2066 case E_V4HFmode:
2067 gen = gen_aarch64_simd_combinev4hf;
2068 break;
2069 case E_V2SFmode:
2070 gen = gen_aarch64_simd_combinev2sf;
2071 break;
2072 case E_DImode:
2073 gen = gen_aarch64_simd_combinedi;
2074 break;
2075 case E_DFmode:
2076 gen = gen_aarch64_simd_combinedf;
2077 break;
2078 default:
2079 gcc_unreachable ();
2080 }
2081
2082 emit_insn (gen (dst, src1, src2));
2083 return;
2084 }
2085
2086 /* Split a complex SIMD move. */
2087
2088 void
2089 aarch64_split_simd_move (rtx dst, rtx src)
2090 {
2091 machine_mode src_mode = GET_MODE (src);
2092 machine_mode dst_mode = GET_MODE (dst);
2093
2094 gcc_assert (VECTOR_MODE_P (dst_mode));
2095
2096 if (REG_P (dst) && REG_P (src))
2097 {
2098 rtx (*gen) (rtx, rtx);
2099
2100 gcc_assert (VECTOR_MODE_P (src_mode));
2101
2102 switch (src_mode)
2103 {
2104 case E_V16QImode:
2105 gen = gen_aarch64_split_simd_movv16qi;
2106 break;
2107 case E_V8HImode:
2108 gen = gen_aarch64_split_simd_movv8hi;
2109 break;
2110 case E_V4SImode:
2111 gen = gen_aarch64_split_simd_movv4si;
2112 break;
2113 case E_V2DImode:
2114 gen = gen_aarch64_split_simd_movv2di;
2115 break;
2116 case E_V8HFmode:
2117 gen = gen_aarch64_split_simd_movv8hf;
2118 break;
2119 case E_V4SFmode:
2120 gen = gen_aarch64_split_simd_movv4sf;
2121 break;
2122 case E_V2DFmode:
2123 gen = gen_aarch64_split_simd_movv2df;
2124 break;
2125 default:
2126 gcc_unreachable ();
2127 }
2128
2129 emit_insn (gen (dst, src));
2130 return;
2131 }
2132 }
2133
2134 bool
2135 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2136 machine_mode ymode, rtx y)
2137 {
2138 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2139 gcc_assert (r != NULL);
2140 return rtx_equal_p (x, r);
2141 }
2142
2143
2144 static rtx
2145 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2146 {
2147 if (can_create_pseudo_p ())
2148 return force_reg (mode, value);
2149 else
2150 {
2151 gcc_assert (x);
2152 aarch64_emit_move (x, value);
2153 return x;
2154 }
2155 }
2156
2157 /* Return true if we can move VALUE into a register using a single
2158 CNT[BHWD] instruction. */
2159
2160 static bool
2161 aarch64_sve_cnt_immediate_p (poly_int64 value)
2162 {
2163 HOST_WIDE_INT factor = value.coeffs[0];
2164 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2165 return (value.coeffs[1] == factor
2166 && IN_RANGE (factor, 2, 16 * 16)
2167 && (factor & 1) == 0
2168 && factor <= 16 * (factor & -factor));
2169 }
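
/* As a worked illustration (writing VQ for the number of 128-bit
   quadwords in an SVE vector): a value of (8, 8), i.e. 8 * VQ,
   satisfies the test above and corresponds to a single CNTH, whereas
   (34, 34) is rejected because it would need CNTD with an out-of-range
   multiplier of 17.  */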
2170
2171 /* Likewise for rtx X. */
2172
2173 bool
2174 aarch64_sve_cnt_immediate_p (rtx x)
2175 {
2176 poly_int64 value;
2177 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2178 }
2179
2180 /* Return the asm string for an instruction with a CNT-like vector size
2181 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2182 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2183 first part of the operands template (the part that comes before the
2184    vector size itself).  FACTOR is the count per 128-bit quadword.
2185 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2186 If it is zero, we can use any element size. */
2187
2188 static char *
2189 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2190 unsigned int factor,
2191 unsigned int nelts_per_vq)
2192 {
2193 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2194
2195 if (nelts_per_vq == 0)
2196 /* There is some overlap in the ranges of the four CNT instructions.
2197 Here we always use the smallest possible element size, so that the
2198        multiplier is 1 wherever possible.  */
2199 nelts_per_vq = factor & -factor;
2200 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2201 gcc_assert (IN_RANGE (shift, 1, 4));
2202 char suffix = "dwhb"[shift - 1];
2203
2204 factor >>= shift;
2205 unsigned int written;
2206 if (factor == 1)
2207 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2208 prefix, suffix, operands);
2209 else
2210 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2211 prefix, suffix, operands, factor);
2212 gcc_assert (written < sizeof (buffer));
2213 return buffer;
2214 }
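
/* For instance, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 8 with
   NELTS_PER_VQ of 0 prints as "cnth\t%x0", while a FACTOR of 32 prints
   as "cntb\t%x0, all, mul #2".  */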
2215
2216 /* Return the asm string for an instruction with a CNT-like vector size
2217 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2218 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2219 first part of the operands template (the part that comes before the
2220 vector size itself). X is the value of the vector size operand,
2221 as a polynomial integer rtx. */
2222
2223 char *
2224 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2225 rtx x)
2226 {
2227 poly_int64 value = rtx_to_poly_int64 (x);
2228 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2229 return aarch64_output_sve_cnt_immediate (prefix, operands,
2230 value.coeffs[1], 0);
2231 }
2232
2233 /* Return true if we can add VALUE to a register using a single ADDVL
2234 or ADDPL instruction. */
2235
2236 static bool
2237 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2238 {
2239 HOST_WIDE_INT factor = value.coeffs[0];
2240 if (factor == 0 || value.coeffs[1] != factor)
2241 return false;
2242 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2243 and a value of 16 is one vector width. */
2244 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2245 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2246 }
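
/* For example, (16, 16) -- one full vector -- can be added with
   "addvl #1" and (2, 2) -- one predicate width -- with "addpl #1";
   the immediate range of both instructions is [-32, 31].  */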
2247
2248 /* Likewise for rtx X. */
2249
2250 bool
2251 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2252 {
2253 poly_int64 value;
2254 return (poly_int_rtx_p (x, &value)
2255 && aarch64_sve_addvl_addpl_immediate_p (value));
2256 }
2257
2258 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2259 and storing the result in operand 0. */
2260
2261 char *
2262 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2263 {
2264 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2265 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2266 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2267
2268 /* Use INC or DEC if possible. */
2269 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2270 {
2271 if (aarch64_sve_cnt_immediate_p (offset_value))
2272 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2273 offset_value.coeffs[1], 0);
2274 if (aarch64_sve_cnt_immediate_p (-offset_value))
2275 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2276 -offset_value.coeffs[1], 0);
2277 }
2278
2279 int factor = offset_value.coeffs[1];
2280 if ((factor & 15) == 0)
2281 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2282 else
2283 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2284 return buffer;
2285 }
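
/* For instance, adding one full vector, an offset of (16, 16), prints as
   "addvl\t%x0, %x1, #1", or simply as "incb\t%x0" when DEST and BASE are
   the same general register.  */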
2286
2287 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2288 instruction. If it is, store the number of elements in each vector
2289 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2290 factor in *FACTOR_OUT (if nonnull). */
2291
2292 bool
2293 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2294 unsigned int *nelts_per_vq_out)
2295 {
2296 rtx elt;
2297 poly_int64 value;
2298
2299 if (!const_vec_duplicate_p (x, &elt)
2300 || !poly_int_rtx_p (elt, &value))
2301 return false;
2302
2303 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2304 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2305 /* There's no vector INCB. */
2306 return false;
2307
2308 HOST_WIDE_INT factor = value.coeffs[0];
2309 if (value.coeffs[1] != factor)
2310 return false;
2311
2312 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2313 if ((factor % nelts_per_vq) != 0
2314 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2315 return false;
2316
2317 if (factor_out)
2318 *factor_out = factor;
2319 if (nelts_per_vq_out)
2320 *nelts_per_vq_out = nelts_per_vq;
2321 return true;
2322 }
2323
2324 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2325 instruction. */
2326
2327 bool
2328 aarch64_sve_inc_dec_immediate_p (rtx x)
2329 {
2330 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2331 }
2332
2333 /* Return the asm template for an SVE vector INC or DEC instruction.
2334 OPERANDS gives the operands before the vector count and X is the
2335 value of the vector count operand itself. */
2336
2337 char *
2338 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2339 {
2340 int factor;
2341 unsigned int nelts_per_vq;
2342 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2343 gcc_unreachable ();
2344 if (factor < 0)
2345 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2346 nelts_per_vq);
2347 else
2348 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2349 nelts_per_vq);
2350 }
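
/* For example, a VNx8HI constant in which every element is (16, 16) --
   twice the number of halfword elements in the vector -- prints as
   "inch\t<operands>, all, mul #2".  */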
2351
2352 static int
2353 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2354 scalar_int_mode mode)
2355 {
2356 int i;
2357 unsigned HOST_WIDE_INT val, val2, mask;
2358 int one_match, zero_match;
2359 int num_insns;
2360
2361 val = INTVAL (imm);
2362
2363 if (aarch64_move_imm (val, mode))
2364 {
2365 if (generate)
2366 emit_insn (gen_rtx_SET (dest, imm));
2367 return 1;
2368 }
2369
2370 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2371 (with XXXX non-zero). In that case check to see if the move can be done in
2372 a smaller mode. */
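  /* For instance, 0x00001234ffff5678 is built here as a 32-bit move of
     0xffff5678 (a single MOVN in SImode) followed by a MOVK of 0x1234
     into bits [47:32].  */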
2373 val2 = val & 0xffffffff;
2374 if (mode == DImode
2375 && aarch64_move_imm (val2, SImode)
2376 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2377 {
2378 if (generate)
2379 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2380
2381 /* Check if we have to emit a second instruction by checking to see
2382 if any of the upper 32 bits of the original DI mode value is set. */
2383 if (val == val2)
2384 return 1;
2385
2386 i = (val >> 48) ? 48 : 32;
2387
2388 if (generate)
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2390 GEN_INT ((val >> i) & 0xffff)));
2391
2392 return 2;
2393 }
2394
2395 if ((val >> 32) == 0 || mode == SImode)
2396 {
2397 if (generate)
2398 {
2399 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2400 if (mode == SImode)
2401 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2402 GEN_INT ((val >> 16) & 0xffff)));
2403 else
2404 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2405 GEN_INT ((val >> 16) & 0xffff)));
2406 }
2407 return 2;
2408 }
2409
2410 /* Remaining cases are all for DImode. */
2411
2412 mask = 0xffff;
2413 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2414 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2415 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2416 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2417
2418 if (zero_match != 2 && one_match != 2)
2419 {
2420 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2421 For a 64-bit bitmask try whether changing 16 bits to all ones or
2422 zeroes creates a valid bitmask. To check any repeated bitmask,
2423 try using 16 bits from the other 32-bit half of val. */
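      /* For instance, for 0x1234aaaaaaaaaaaa the loop below finds the
	 bitmask immediate 0xaaaaaaaaaaaaaaaa at I == 48, so the value is
	 built as that bitmask MOV followed by a MOVK of 0x1234 into
	 bits [63:48].  */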
2424
2425 for (i = 0; i < 64; i += 16, mask <<= 16)
2426 {
2427 val2 = val & ~mask;
2428 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2429 break;
2430 val2 = val | mask;
2431 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2432 break;
2433 val2 = val2 & ~mask;
2434 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2435 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2436 break;
2437 }
2438 if (i != 64)
2439 {
2440 if (generate)
2441 {
2442 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2443 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2444 GEN_INT ((val >> i) & 0xffff)));
2445 }
2446 return 2;
2447 }
2448 }
2449
2450 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2451 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2452 otherwise skip zero bits. */
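  /* For instance, 0xfff0ffffffffabcd has two all-ones halfwords
     (ONE_MATCH == 2), so we start from the MOVN-friendly value
     0xffffffffffffabcd and need just one MOVK, of 0xfff0 into bits
     [63:48], to fix up the remaining halfword.  */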
2453
2454 num_insns = 1;
2455 mask = 0xffff;
2456 val2 = one_match > zero_match ? ~val : val;
2457 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2458
2459 if (generate)
2460 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2461 ? (val | ~(mask << i))
2462 : (val & (mask << i)))));
2463 for (i += 16; i < 64; i += 16)
2464 {
2465 if ((val2 & (mask << i)) == 0)
2466 continue;
2467 if (generate)
2468 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2469 GEN_INT ((val >> i) & 0xffff)));
2470 num_insns ++;
2471 }
2472
2473 return num_insns;
2474 }
2475
2476 /* Return whether imm is a 128-bit immediate which is simple enough to
2477 expand inline. */
2478 bool
2479 aarch64_mov128_immediate (rtx imm)
2480 {
2481 if (GET_CODE (imm) == CONST_INT)
2482 return true;
2483
2484 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2485
2486 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2487 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2488
2489 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2490 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2491 }
2492
2493
2494 /* Return the number of temporary registers that aarch64_add_offset_1
2495 would need to add OFFSET to a register. */
2496
2497 static unsigned int
2498 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2499 {
2500 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2501 }
2502
2503 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2504 a non-polynomial OFFSET. MODE is the mode of the addition.
2505 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2506 be set and CFA adjustments added to the generated instructions.
2507
2508 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2509 temporary if register allocation is already complete. This temporary
2510 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2511 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2512 the immediate again.
2513
2514 Since this function may be used to adjust the stack pointer, we must
2515 ensure that it cannot cause transient stack deallocation (for example
2516 by first incrementing SP and then decrementing when adjusting by a
2517 large immediate). */
2518
2519 static void
2520 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2521 rtx src, HOST_WIDE_INT offset, rtx temp1,
2522 bool frame_related_p, bool emit_move_imm)
2523 {
2524 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2525 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2526
2527 HOST_WIDE_INT moffset = abs_hwi (offset);
2528 rtx_insn *insn;
2529
2530 if (!moffset)
2531 {
2532 if (!rtx_equal_p (dest, src))
2533 {
2534 insn = emit_insn (gen_rtx_SET (dest, src));
2535 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536 }
2537 return;
2538 }
2539
2540 /* Single instruction adjustment. */
2541 if (aarch64_uimm12_shift (moffset))
2542 {
2543 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2544 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2545 return;
2546 }
2547
2548 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2549 and either:
2550
2551 a) the offset cannot be loaded by a 16-bit move or
2552 b) there is no spare register into which we can move it. */
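  /* For example, an offset of 0x123456 is added as
     "add dest, src, #0x456" followed by "add dest, dest, #0x123000";
     both immediates fit the 12-bit (optionally 12-bit-shifted) form.  */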
2553 if (moffset < 0x1000000
2554 && ((!temp1 && !can_create_pseudo_p ())
2555 || !aarch64_move_imm (moffset, mode)))
2556 {
2557 HOST_WIDE_INT low_off = moffset & 0xfff;
2558
2559 low_off = offset < 0 ? -low_off : low_off;
2560 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2561 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2562 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2563 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2564 return;
2565 }
2566
2567 /* Emit a move immediate if required and an addition/subtraction. */
2568 if (emit_move_imm)
2569 {
2570 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2571 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2572 }
2573 insn = emit_insn (offset < 0
2574 ? gen_sub3_insn (dest, src, temp1)
2575 : gen_add3_insn (dest, src, temp1));
2576 if (frame_related_p)
2577 {
2578 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2579 rtx adj = plus_constant (mode, src, offset);
2580 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2581 }
2582 }
2583
2584 /* Return the number of temporary registers that aarch64_add_offset
2585 would need to move OFFSET into a register or add OFFSET to a register;
2586 ADD_P is true if we want the latter rather than the former. */
2587
2588 static unsigned int
2589 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2590 {
2591 /* This follows the same structure as aarch64_add_offset. */
2592 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2593 return 0;
2594
2595 unsigned int count = 0;
2596 HOST_WIDE_INT factor = offset.coeffs[1];
2597 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2598 poly_int64 poly_offset (factor, factor);
2599 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2600 /* Need one register for the ADDVL/ADDPL result. */
2601 count += 1;
2602 else if (factor != 0)
2603 {
2604 factor = abs (factor);
2605 if (factor > 16 * (factor & -factor))
2606 /* Need one register for the CNT result and one for the multiplication
2607 factor. If necessary, the second temporary can be reused for the
2608 constant part of the offset. */
2609 return 2;
2610 /* Need one register for the CNT result (which might then
2611 be shifted). */
2612 count += 1;
2613 }
2614 return count + aarch64_add_offset_1_temporaries (constant);
2615 }
2616
2617 /* If X can be represented as a poly_int64, return the number
2618 of temporaries that are required to add it to a register.
2619 Return -1 otherwise. */
2620
2621 int
2622 aarch64_add_offset_temporaries (rtx x)
2623 {
2624 poly_int64 offset;
2625 if (!poly_int_rtx_p (x, &offset))
2626 return -1;
2627 return aarch64_offset_temporaries (true, offset);
2628 }
2629
2630 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2631 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2632 be set and CFA adjustments added to the generated instructions.
2633
2634 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2635 temporary if register allocation is already complete. This temporary
2636 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2637 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2638 false to avoid emitting the immediate again.
2639
2640 TEMP2, if nonnull, is a second temporary register that doesn't
2641 overlap either DEST or REG.
2642
2643 Since this function may be used to adjust the stack pointer, we must
2644 ensure that it cannot cause transient stack deallocation (for example
2645 by first incrementing SP and then decrementing when adjusting by a
2646 large immediate). */
2647
2648 static void
2649 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2650 poly_int64 offset, rtx temp1, rtx temp2,
2651 bool frame_related_p, bool emit_move_imm = true)
2652 {
2653 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2654 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2655 gcc_assert (temp1 == NULL_RTX
2656 || !frame_related_p
2657 || !reg_overlap_mentioned_p (temp1, dest));
2658 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2659
2660 /* Try using ADDVL or ADDPL to add the whole value. */
2661 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2662 {
2663 rtx offset_rtx = gen_int_mode (offset, mode);
2664 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2665 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2666 return;
2667 }
2668
2669 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2670 SVE vector register, over and above the minimum size of 128 bits.
2671 This is equivalent to half the value returned by CNTD with a
2672 vector shape of ALL. */
2673 HOST_WIDE_INT factor = offset.coeffs[1];
2674 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
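  /* For example, a poly_int64 offset of (19, 3) splits into FACTOR == 3
     (the per-128-bit-block coefficient) and a constant part of 16.  */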
2675
2676 /* Try using ADDVL or ADDPL to add the VG-based part. */
2677 poly_int64 poly_offset (factor, factor);
2678 if (src != const0_rtx
2679 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2680 {
2681 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2682 if (frame_related_p)
2683 {
2684 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2685 RTX_FRAME_RELATED_P (insn) = true;
2686 src = dest;
2687 }
2688 else
2689 {
2690 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2691 src = aarch64_force_temporary (mode, temp1, addr);
2692 temp1 = temp2;
2693 temp2 = NULL_RTX;
2694 }
2695 }
2696 /* Otherwise use a CNT-based sequence. */
2697 else if (factor != 0)
2698 {
2699 /* Use a subtraction if we have a negative factor. */
2700 rtx_code code = PLUS;
2701 if (factor < 0)
2702 {
2703 factor = -factor;
2704 code = MINUS;
2705 }
2706
2707 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2708 into the multiplication. */
2709 rtx val;
2710 int shift = 0;
2711 if (factor & 1)
2712 /* Use a right shift by 1. */
2713 shift = -1;
2714 else
2715 factor /= 2;
2716 HOST_WIDE_INT low_bit = factor & -factor;
2717 if (factor <= 16 * low_bit)
2718 {
2719 if (factor > 16 * 8)
2720 {
2721 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2722 the value with the minimum multiplier and shift it into
2723 position. */
2724 int extra_shift = exact_log2 (low_bit);
2725 shift += extra_shift;
2726 factor >>= extra_shift;
2727 }
2728 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2729 }
2730 else
2731 {
2732 /* Use CNTD, then multiply it by FACTOR. */
2733 val = gen_int_mode (poly_int64 (2, 2), mode);
2734 val = aarch64_force_temporary (mode, temp1, val);
2735
2736 /* Go back to using a negative multiplication factor if we have
2737 no register from which to subtract. */
2738 if (code == MINUS && src == const0_rtx)
2739 {
2740 factor = -factor;
2741 code = PLUS;
2742 }
2743 rtx coeff1 = gen_int_mode (factor, mode);
2744 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2745 val = gen_rtx_MULT (mode, val, coeff1);
2746 }
2747
2748 if (shift > 0)
2749 {
2750 /* Multiply by 1 << SHIFT. */
2751 val = aarch64_force_temporary (mode, temp1, val);
2752 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2753 }
2754 else if (shift == -1)
2755 {
2756 /* Divide by 2. */
2757 val = aarch64_force_temporary (mode, temp1, val);
2758 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2759 }
2760
2761 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2762 if (src != const0_rtx)
2763 {
2764 val = aarch64_force_temporary (mode, temp1, val);
2765 val = gen_rtx_fmt_ee (code, mode, src, val);
2766 }
2767 else if (code == MINUS)
2768 {
2769 val = aarch64_force_temporary (mode, temp1, val);
2770 val = gen_rtx_NEG (mode, val);
2771 }
2772
2773 if (constant == 0 || frame_related_p)
2774 {
2775 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2776 if (frame_related_p)
2777 {
2778 RTX_FRAME_RELATED_P (insn) = true;
2779 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2780 gen_rtx_SET (dest, plus_constant (Pmode, src,
2781 poly_offset)));
2782 }
2783 src = dest;
2784 if (constant == 0)
2785 return;
2786 }
2787 else
2788 {
2789 src = aarch64_force_temporary (mode, temp1, val);
2790 temp1 = temp2;
2791 temp2 = NULL_RTX;
2792 }
2793
2794 emit_move_imm = true;
2795 }
2796
2797 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2798 frame_related_p, emit_move_imm);
2799 }
2800
2801 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2802 than a poly_int64. */
2803
2804 void
2805 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2806 rtx offset_rtx, rtx temp1, rtx temp2)
2807 {
2808 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2809 temp1, temp2, false);
2810 }
2811
2812 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2813 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2814 if TEMP1 already contains abs (DELTA). */
2815
2816 static inline void
2817 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2818 {
2819 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2820 temp1, temp2, true, emit_move_imm);
2821 }
2822
2823 /* Subtract DELTA from the stack pointer, marking the instructions
2824 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2825 if nonnull. */
2826
2827 static inline void
2828 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2829 {
2830 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2831 temp1, temp2, frame_related_p);
2832 }
2833
2834 /* Set DEST to (vec_series BASE STEP). */
2835
2836 static void
2837 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2838 {
2839 machine_mode mode = GET_MODE (dest);
2840 scalar_mode inner = GET_MODE_INNER (mode);
2841
2842 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2843 if (!aarch64_sve_index_immediate_p (base))
2844 base = force_reg (inner, base);
2845 if (!aarch64_sve_index_immediate_p (step))
2846 step = force_reg (inner, step);
2847
2848 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2849 }
2850
2851 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2852 integer of mode INT_MODE. Return true on success. */
2853
2854 static bool
2855 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2856 rtx src)
2857 {
2858 /* If the constant is smaller than 128 bits, we can do the move
2859 using a vector of SRC_MODEs. */
2860 if (src_mode != TImode)
2861 {
2862 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2863 GET_MODE_SIZE (src_mode));
2864 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2865 emit_move_insn (gen_lowpart (dup_mode, dest),
2866 gen_const_vec_duplicate (dup_mode, src));
2867 return true;
2868 }
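
  /* For example, replicating a 64-bit integer across a VNx16QI destination
     is handled above by viewing the destination as VNx2DI and duplicating
     the DImode value directly.  */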
2869
2870 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2871 src = force_const_mem (src_mode, src);
2872 if (!src)
2873 return false;
2874
2875 /* Make sure that the address is legitimate. */
2876 if (!aarch64_sve_ld1r_operand_p (src))
2877 {
2878 rtx addr = force_reg (Pmode, XEXP (src, 0));
2879 src = replace_equiv_address (src, addr);
2880 }
2881
2882 machine_mode mode = GET_MODE (dest);
2883 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2884 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2885 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2886 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2887 emit_insn (gen_rtx_SET (dest, src));
2888 return true;
2889 }
2890
2891 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2892 isn't a simple duplicate or series. */
2893
2894 static void
2895 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2896 {
2897 machine_mode mode = GET_MODE (src);
2898 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2899 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2900 gcc_assert (npatterns > 1);
2901
2902 if (nelts_per_pattern == 1)
2903 {
2904       /* The constant is a repeating sequence of at least two elements,
2905 where the repeating elements occupy no more than 128 bits.
2906 Get an integer representation of the replicated value. */
2907 scalar_int_mode int_mode;
2908 if (BYTES_BIG_ENDIAN)
2909 /* For now, always use LD1RQ to load the value on big-endian
2910 targets, since the handling of smaller integers includes a
2911 subreg that is semantically an element reverse. */
2912 int_mode = TImode;
2913 else
2914 {
2915 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2916 gcc_assert (int_bits <= 128);
2917 int_mode = int_mode_for_size (int_bits, 0).require ();
2918 }
2919 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2920 if (int_value
2921 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2922 return;
2923 }
2924
2925 /* Expand each pattern individually. */
2926 rtx_vector_builder builder;
2927 auto_vec<rtx, 16> vectors (npatterns);
2928 for (unsigned int i = 0; i < npatterns; ++i)
2929 {
2930 builder.new_vector (mode, 1, nelts_per_pattern);
2931 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2932 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2933 vectors.quick_push (force_reg (mode, builder.build ()));
2934 }
2935
2936 /* Use permutes to interleave the separate vectors. */
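  /* For example, with four patterns P0, P1, P2 and P3 the loop below
     computes ZIP1 (ZIP1 (P0, P2), ZIP1 (P1, P3)), which restores the
     required element order P0[0], P1[0], P2[0], P3[0], P0[1], ...  */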
2937 while (npatterns > 1)
2938 {
2939 npatterns /= 2;
2940 for (unsigned int i = 0; i < npatterns; ++i)
2941 {
2942 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2943 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2944 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2945 vectors[i] = tmp;
2946 }
2947 }
2948 gcc_assert (vectors[0] == dest);
2949 }
2950
2951 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2952 is a pattern that can be used to set DEST to a replicated scalar
2953 element. */
2954
2955 void
2956 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2957 rtx (*gen_vec_duplicate) (rtx, rtx))
2958 {
2959 machine_mode mode = GET_MODE (dest);
2960
2961 /* Check on what type of symbol it is. */
2962 scalar_int_mode int_mode;
2963 if ((GET_CODE (imm) == SYMBOL_REF
2964 || GET_CODE (imm) == LABEL_REF
2965 || GET_CODE (imm) == CONST
2966 || GET_CODE (imm) == CONST_POLY_INT)
2967 && is_a <scalar_int_mode> (mode, &int_mode))
2968 {
2969 rtx mem;
2970 poly_int64 offset;
2971 HOST_WIDE_INT const_offset;
2972 enum aarch64_symbol_type sty;
2973
2974 /* If we have (const (plus symbol offset)), separate out the offset
2975 before we start classifying the symbol. */
2976 rtx base = strip_offset (imm, &offset);
2977
2978 /* We must always add an offset involving VL separately, rather than
2979 folding it into the relocation. */
2980 if (!offset.is_constant (&const_offset))
2981 {
2982 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2983 emit_insn (gen_rtx_SET (dest, imm));
2984 else
2985 {
2986 /* Do arithmetic on 32-bit values if the result is smaller
2987 than that. */
2988 if (partial_subreg_p (int_mode, SImode))
2989 {
2990 /* It is invalid to do symbol calculations in modes
2991 narrower than SImode. */
2992 gcc_assert (base == const0_rtx);
2993 dest = gen_lowpart (SImode, dest);
2994 int_mode = SImode;
2995 }
2996 if (base != const0_rtx)
2997 {
2998 base = aarch64_force_temporary (int_mode, dest, base);
2999 aarch64_add_offset (int_mode, dest, base, offset,
3000 NULL_RTX, NULL_RTX, false);
3001 }
3002 else
3003 aarch64_add_offset (int_mode, dest, base, offset,
3004 dest, NULL_RTX, false);
3005 }
3006 return;
3007 }
3008
3009 sty = aarch64_classify_symbol (base, const_offset);
3010 switch (sty)
3011 {
3012 case SYMBOL_FORCE_TO_MEM:
3013 if (const_offset != 0
3014 && targetm.cannot_force_const_mem (int_mode, imm))
3015 {
3016 gcc_assert (can_create_pseudo_p ());
3017 base = aarch64_force_temporary (int_mode, dest, base);
3018 aarch64_add_offset (int_mode, dest, base, const_offset,
3019 NULL_RTX, NULL_RTX, false);
3020 return;
3021 }
3022
3023 mem = force_const_mem (ptr_mode, imm);
3024 gcc_assert (mem);
3025
3026 /* If we aren't generating PC relative literals, then
3027 we need to expand the literal pool access carefully.
3028 This is something that needs to be done in a number
3029 of places, so could well live as a separate function. */
3030 if (!aarch64_pcrelative_literal_loads)
3031 {
3032 gcc_assert (can_create_pseudo_p ());
3033 base = gen_reg_rtx (ptr_mode);
3034 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3035 if (ptr_mode != Pmode)
3036 base = convert_memory_address (Pmode, base);
3037 mem = gen_rtx_MEM (ptr_mode, base);
3038 }
3039
3040 if (int_mode != ptr_mode)
3041 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3042
3043 emit_insn (gen_rtx_SET (dest, mem));
3044
3045 return;
3046
3047 case SYMBOL_SMALL_TLSGD:
3048 case SYMBOL_SMALL_TLSDESC:
3049 case SYMBOL_SMALL_TLSIE:
3050 case SYMBOL_SMALL_GOT_28K:
3051 case SYMBOL_SMALL_GOT_4G:
3052 case SYMBOL_TINY_GOT:
3053 case SYMBOL_TINY_TLSIE:
3054 if (const_offset != 0)
3055 {
3056 	    gcc_assert (can_create_pseudo_p ());
3057 base = aarch64_force_temporary (int_mode, dest, base);
3058 aarch64_add_offset (int_mode, dest, base, const_offset,
3059 NULL_RTX, NULL_RTX, false);
3060 return;
3061 }
3062 /* FALLTHRU */
3063
3064 case SYMBOL_SMALL_ABSOLUTE:
3065 case SYMBOL_TINY_ABSOLUTE:
3066 case SYMBOL_TLSLE12:
3067 case SYMBOL_TLSLE24:
3068 case SYMBOL_TLSLE32:
3069 case SYMBOL_TLSLE48:
3070 aarch64_load_symref_appropriately (dest, imm, sty);
3071 return;
3072
3073 default:
3074 gcc_unreachable ();
3075 }
3076 }
3077
3078 if (!CONST_INT_P (imm))
3079 {
3080 rtx base, step, value;
3081 if (GET_CODE (imm) == HIGH
3082 || aarch64_simd_valid_immediate (imm, NULL))
3083 emit_insn (gen_rtx_SET (dest, imm));
3084 else if (const_vec_series_p (imm, &base, &step))
3085 aarch64_expand_vec_series (dest, base, step);
3086 else if (const_vec_duplicate_p (imm, &value))
3087 {
3088 /* If the constant is out of range of an SVE vector move,
3089 load it from memory if we can, otherwise move it into
3090 a register and use a DUP. */
3091 scalar_mode inner_mode = GET_MODE_INNER (mode);
3092 rtx op = force_const_mem (inner_mode, value);
3093 if (!op)
3094 op = force_reg (inner_mode, value);
3095 else if (!aarch64_sve_ld1r_operand_p (op))
3096 {
3097 rtx addr = force_reg (Pmode, XEXP (op, 0));
3098 op = replace_equiv_address (op, addr);
3099 }
3100 emit_insn (gen_vec_duplicate (dest, op));
3101 }
3102 else if (GET_CODE (imm) == CONST_VECTOR
3103 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3104 aarch64_expand_sve_const_vector (dest, imm);
3105 else
3106 {
3107 rtx mem = force_const_mem (mode, imm);
3108 gcc_assert (mem);
3109 emit_move_insn (dest, mem);
3110 }
3111
3112 return;
3113 }
3114
3115 aarch64_internal_mov_immediate (dest, imm, true,
3116 as_a <scalar_int_mode> (mode));
3117 }
3118
3119 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3120 that is known to contain PTRUE. */
3121
3122 void
3123 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3124 {
3125 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3126 gen_rtvec (2, pred, src),
3127 UNSPEC_MERGE_PTRUE)));
3128 }
3129
3130 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3131 operand is in memory. In this case we need to use the predicated LD1
3132 and ST1 instead of LDR and STR, both for correctness on big-endian
3133 targets and because LD1 and ST1 support a wider range of addressing modes.
3134 PRED_MODE is the mode of the predicate.
3135
3136 See the comment at the head of aarch64-sve.md for details about the
3137 big-endian handling. */
3138
3139 void
3140 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3141 {
3142 machine_mode mode = GET_MODE (dest);
3143 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3144 if (!register_operand (src, mode)
3145 && !register_operand (dest, mode))
3146 {
3147 rtx tmp = gen_reg_rtx (mode);
3148 if (MEM_P (src))
3149 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3150 else
3151 emit_move_insn (tmp, src);
3152 src = tmp;
3153 }
3154 aarch64_emit_sve_pred_move (dest, ptrue, src);
3155 }
3156
3157 /* Called only on big-endian targets. See whether an SVE vector move
3158 from SRC to DEST is effectively a REV[BHW] instruction, because at
3159 least one operand is a subreg of an SVE vector that has wider or
3160 narrower elements. Return true and emit the instruction if so.
3161
3162 For example:
3163
3164 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3165
3166 represents a VIEW_CONVERT between the following vectors, viewed
3167 in memory order:
3168
3169 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3170 R1: { [0], [1], [2], [3], ... }
3171
3172 The high part of lane X in R2 should therefore correspond to lane X*2
3173 of R1, but the register representations are:
3174
3175 msb lsb
3176 R2: ...... [1].high [1].low [0].high [0].low
3177 R1: ...... [3] [2] [1] [0]
3178
3179 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3180 We therefore need a reverse operation to swap the high and low values
3181 around.
3182
3183 This is purely an optimization. Without it we would spill the
3184 subreg operand to the stack in one mode and reload it in the
3185 other mode, which has the same effect as the REV. */
3186
3187 bool
3188 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3189 {
3190 gcc_assert (BYTES_BIG_ENDIAN);
3191 if (GET_CODE (dest) == SUBREG)
3192 dest = SUBREG_REG (dest);
3193 if (GET_CODE (src) == SUBREG)
3194 src = SUBREG_REG (src);
3195
3196 /* The optimization handles two single SVE REGs with different element
3197 sizes. */
3198 if (!REG_P (dest)
3199 || !REG_P (src)
3200 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3201 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3202 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3203 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3204 return false;
3205
3206 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3207 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3208 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3209 UNSPEC_REV_SUBREG);
3210 emit_insn (gen_rtx_SET (dest, unspec));
3211 return true;
3212 }
3213
3214 /* Return a copy of X with mode MODE, without changing its other
3215 attributes. Unlike gen_lowpart, this doesn't care whether the
3216 mode change is valid. */
3217
3218 static rtx
3219 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3220 {
3221 if (GET_MODE (x) == mode)
3222 return x;
3223
3224 x = shallow_copy_rtx (x);
3225 set_mode_and_regno (x, mode, REGNO (x));
3226 return x;
3227 }
3228
3229 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3230 operands. */
3231
3232 void
3233 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3234 {
3235 /* Decide which REV operation we need. The mode with narrower elements
3236 determines the mode of the operands and the mode with the wider
3237 elements determines the reverse width. */
3238 machine_mode mode_with_wider_elts = GET_MODE (dest);
3239 machine_mode mode_with_narrower_elts = GET_MODE (src);
3240 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3241 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3242 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3243
3244 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3245 unsigned int unspec;
3246 if (wider_bytes == 8)
3247 unspec = UNSPEC_REV64;
3248 else if (wider_bytes == 4)
3249 unspec = UNSPEC_REV32;
3250 else if (wider_bytes == 2)
3251 unspec = UNSPEC_REV16;
3252 else
3253 gcc_unreachable ();
3254 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3255
3256 /* Emit:
3257
3258 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3259 UNSPEC_MERGE_PTRUE))
3260
3261 with the appropriate modes. */
3262 ptrue = gen_lowpart (pred_mode, ptrue);
3263 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3264 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3265 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3266 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3267 UNSPEC_MERGE_PTRUE);
3268 emit_insn (gen_rtx_SET (dest, src));
3269 }
3270
3271 static bool
3272 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3273 tree exp ATTRIBUTE_UNUSED)
3274 {
3275 /* Currently, always true. */
3276 return true;
3277 }
3278
3279 /* Implement TARGET_PASS_BY_REFERENCE. */
3280
3281 static bool
3282 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3283 machine_mode mode,
3284 const_tree type,
3285 bool named ATTRIBUTE_UNUSED)
3286 {
3287 HOST_WIDE_INT size;
3288 machine_mode dummymode;
3289 int nregs;
3290
3291 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3292 if (mode == BLKmode && type)
3293 size = int_size_in_bytes (type);
3294 else
3295 /* No frontends can create types with variable-sized modes, so we
3296 shouldn't be asked to pass or return them. */
3297 size = GET_MODE_SIZE (mode).to_constant ();
3298
3299 /* Aggregates are passed by reference based on their size. */
3300 if (type && AGGREGATE_TYPE_P (type))
3301 {
3302 size = int_size_in_bytes (type);
3303 }
3304
3305 /* Variable sized arguments are always returned by reference. */
3306 if (size < 0)
3307 return true;
3308
3309 /* Can this be a candidate to be passed in fp/simd register(s)? */
3310 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3311 &dummymode, &nregs,
3312 NULL))
3313 return false;
3314
3315 /* Arguments which are variable sized or larger than 2 registers are
3316      passed by reference unless they are a homogeneous floating-point
3317 aggregate. */
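  /* For example, a plain 24-byte structure is passed by reference, whereas
     a 24-byte aggregate of three doubles is caught by the HFA check above
     and so is passed by value (in SIMD/FP registers or on the stack).  */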
3318 return size > 2 * UNITS_PER_WORD;
3319 }
3320
3321 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3322 static bool
3323 aarch64_return_in_msb (const_tree valtype)
3324 {
3325 machine_mode dummy_mode;
3326 int dummy_int;
3327
3328 /* Never happens in little-endian mode. */
3329 if (!BYTES_BIG_ENDIAN)
3330 return false;
3331
3332 /* Only composite types smaller than or equal to 16 bytes can
3333 be potentially returned in registers. */
3334 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3335 || int_size_in_bytes (valtype) <= 0
3336 || int_size_in_bytes (valtype) > 16)
3337 return false;
3338
3339 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3340 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3341 is always passed/returned in the least significant bits of fp/simd
3342 register(s). */
3343 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3344 &dummy_mode, &dummy_int, NULL))
3345 return false;
3346
3347 return true;
3348 }
3349
3350 /* Implement TARGET_FUNCTION_VALUE.
3351 Define how to find the value returned by a function. */
3352
3353 static rtx
3354 aarch64_function_value (const_tree type, const_tree func,
3355 bool outgoing ATTRIBUTE_UNUSED)
3356 {
3357 machine_mode mode;
3358 int unsignedp;
3359 int count;
3360 machine_mode ag_mode;
3361
3362 mode = TYPE_MODE (type);
3363 if (INTEGRAL_TYPE_P (type))
3364 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3365
3366 if (aarch64_return_in_msb (type))
3367 {
3368 HOST_WIDE_INT size = int_size_in_bytes (type);
3369
3370 if (size % UNITS_PER_WORD != 0)
3371 {
3372 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3373 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3374 }
3375 }
3376
3377 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3378 &ag_mode, &count, NULL))
3379 {
3380 if (!aarch64_composite_type_p (type, mode))
3381 {
3382 gcc_assert (count == 1 && mode == ag_mode);
3383 return gen_rtx_REG (mode, V0_REGNUM);
3384 }
3385 else
3386 {
3387 int i;
3388 rtx par;
3389
3390 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3391 for (i = 0; i < count; i++)
3392 {
3393 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3394 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3395 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3396 XVECEXP (par, 0, i) = tmp;
3397 }
3398 return par;
3399 }
3400 }
3401 else
3402 return gen_rtx_REG (mode, R0_REGNUM);
3403 }
3404
3405 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3406 Return true if REGNO is the number of a hard register in which the values
3407 of called function may come back. */
3408
3409 static bool
3410 aarch64_function_value_regno_p (const unsigned int regno)
3411 {
3412   /* At most 16 bytes can be returned in the general registers.  Examples
3413 of 16-byte return values are: 128-bit integers and 16-byte small
3414 structures (excluding homogeneous floating-point aggregates). */
3415 if (regno == R0_REGNUM || regno == R1_REGNUM)
3416 return true;
3417
3418 /* Up to four fp/simd registers can return a function value, e.g. a
3419 homogeneous floating-point aggregate having four members. */
3420 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3421 return TARGET_FLOAT;
3422
3423 return false;
3424 }
3425
3426 /* Implement TARGET_RETURN_IN_MEMORY.
3427
3428 If the type T of the result of a function is such that
3429 void func (T arg)
3430 would require that arg be passed as a value in a register (or set of
3431 registers) according to the parameter passing rules, then the result
3432 is returned in the same registers as would be used for such an
3433 argument. */
3434
3435 static bool
3436 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3437 {
3438 HOST_WIDE_INT size;
3439 machine_mode ag_mode;
3440 int count;
3441
3442 if (!AGGREGATE_TYPE_P (type)
3443 && TREE_CODE (type) != COMPLEX_TYPE
3444 && TREE_CODE (type) != VECTOR_TYPE)
3445     /* Simple scalar types are always returned in registers.  */
3446 return false;
3447
3448 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3449 type,
3450 &ag_mode,
3451 &count,
3452 NULL))
3453 return false;
3454
3455   /* Types larger than 2 registers are returned in memory.  */
3456 size = int_size_in_bytes (type);
3457 return (size < 0 || size > 2 * UNITS_PER_WORD);
3458 }
3459
3460 static bool
3461 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3462 const_tree type, int *nregs)
3463 {
3464 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3465 return aarch64_vfp_is_call_or_return_candidate (mode,
3466 type,
3467 &pcum->aapcs_vfp_rmode,
3468 nregs,
3469 NULL);
3470 }
3471
3472 /* Given MODE and TYPE of a function argument, return the alignment in
3473 bits. The idea is to suppress any stronger alignment requested by
3474 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3475 This is a helper function for local use only. */
3476
3477 static unsigned int
3478 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3479 {
3480 if (!type)
3481 return GET_MODE_ALIGNMENT (mode);
3482
3483 if (integer_zerop (TYPE_SIZE (type)))
3484 return 0;
3485
3486 gcc_assert (TYPE_MODE (type) == mode);
3487
3488 if (!AGGREGATE_TYPE_P (type))
3489 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3490
3491 if (TREE_CODE (type) == ARRAY_TYPE)
3492 return TYPE_ALIGN (TREE_TYPE (type));
3493
3494 unsigned int alignment = 0;
3495 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3496 if (TREE_CODE (field) == FIELD_DECL)
3497 alignment = std::max (alignment, DECL_ALIGN (field));
3498
3499 return alignment;
3500 }
3501
3502 /* Layout a function argument according to the AAPCS64 rules. The rule
3503 numbers refer to the rule numbers in the AAPCS64. */
3504
3505 static void
3506 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3507 const_tree type,
3508 bool named ATTRIBUTE_UNUSED)
3509 {
3510 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3511 int ncrn, nvrn, nregs;
3512 bool allocate_ncrn, allocate_nvrn;
3513 HOST_WIDE_INT size;
3514
3515 /* We need to do this once per argument. */
3516 if (pcum->aapcs_arg_processed)
3517 return;
3518
3519 pcum->aapcs_arg_processed = true;
3520
3521   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
3522 if (type)
3523 size = int_size_in_bytes (type);
3524 else
3525 /* No frontends can create types with variable-sized modes, so we
3526 shouldn't be asked to pass or return them. */
3527 size = GET_MODE_SIZE (mode).to_constant ();
3528 size = ROUND_UP (size, UNITS_PER_WORD);
3529
3530 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3531 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3532 mode,
3533 type,
3534 &nregs);
3535
3536   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3537 The following code thus handles passing by SIMD/FP registers first. */
3538
3539 nvrn = pcum->aapcs_nvrn;
3540
3541   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3542      and homogeneous short-vector aggregates (HVA).  */
3543 if (allocate_nvrn)
3544 {
3545 if (!TARGET_FLOAT)
3546 aarch64_err_no_fpadvsimd (mode);
3547
3548 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3549 {
3550 pcum->aapcs_nextnvrn = nvrn + nregs;
3551 if (!aarch64_composite_type_p (type, mode))
3552 {
3553 gcc_assert (nregs == 1);
3554 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3555 }
3556 else
3557 {
3558 rtx par;
3559 int i;
3560 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3561 for (i = 0; i < nregs; i++)
3562 {
3563 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3564 V0_REGNUM + nvrn + i);
3565 rtx offset = gen_int_mode
3566 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3567 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3568 XVECEXP (par, 0, i) = tmp;
3569 }
3570 pcum->aapcs_reg = par;
3571 }
3572 return;
3573 }
3574 else
3575 {
3576 /* C.3 NSRN is set to 8. */
3577 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3578 goto on_stack;
3579 }
3580 }
3581
3582 ncrn = pcum->aapcs_ncrn;
3583 nregs = size / UNITS_PER_WORD;
3584
3585   /* C6 - C9, though the sign and zero extension semantics are
3586      handled elsewhere.  This is the case where the argument fits
3587      entirely in general registers.  */
3588 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3589 {
3590
3591 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3592
3593       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3594 	 rounded up to the next even number.  */
3595 if (nregs == 2
3596 && ncrn % 2
3597 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3598 comparison is there because for > 16 * BITS_PER_UNIT
3599 alignment nregs should be > 2 and therefore it should be
3600 	     passed by reference rather than by value.  */
3601 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3602 {
3603 ++ncrn;
3604 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3605 }
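
      /* For example, a 16-byte, 16-byte-aligned argument such as __int128
	 that would otherwise start at an odd-numbered register is moved up
	 to the next even register pair, as required by rule C.8.  */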
3606
3607 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3608 A reg is still generated for it, but the caller should be smart
3609 enough not to use it. */
3610 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3611 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3612 else
3613 {
3614 rtx par;
3615 int i;
3616
3617 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3618 for (i = 0; i < nregs; i++)
3619 {
3620 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3621 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3622 GEN_INT (i * UNITS_PER_WORD));
3623 XVECEXP (par, 0, i) = tmp;
3624 }
3625 pcum->aapcs_reg = par;
3626 }
3627
3628 pcum->aapcs_nextncrn = ncrn + nregs;
3629 return;
3630 }
3631
3632 /* C.11 */
3633 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3634
3635   /* The argument is passed on the stack; record the needed number of words for
3636 this argument and align the total size if necessary. */
3637 on_stack:
3638 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3639
3640 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3641 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3642 16 / UNITS_PER_WORD);
3643 return;
3644 }
3645
3646 /* Implement TARGET_FUNCTION_ARG. */
3647
3648 static rtx
3649 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3650 const_tree type, bool named)
3651 {
3652 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3653 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3654
3655 if (mode == VOIDmode)
3656 return NULL_RTX;
3657
3658 aarch64_layout_arg (pcum_v, mode, type, named);
3659 return pcum->aapcs_reg;
3660 }
3661
3662 void
3663 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3664 const_tree fntype ATTRIBUTE_UNUSED,
3665 rtx libname ATTRIBUTE_UNUSED,
3666 const_tree fndecl ATTRIBUTE_UNUSED,
3667 unsigned n_named ATTRIBUTE_UNUSED)
3668 {
3669 pcum->aapcs_ncrn = 0;
3670 pcum->aapcs_nvrn = 0;
3671 pcum->aapcs_nextncrn = 0;
3672 pcum->aapcs_nextnvrn = 0;
3673 pcum->pcs_variant = ARM_PCS_AAPCS64;
3674 pcum->aapcs_reg = NULL_RTX;
3675 pcum->aapcs_arg_processed = false;
3676 pcum->aapcs_stack_words = 0;
3677 pcum->aapcs_stack_size = 0;
3678
3679 if (!TARGET_FLOAT
3680 && fndecl && TREE_PUBLIC (fndecl)
3681 && fntype && fntype != error_mark_node)
3682 {
3683 const_tree type = TREE_TYPE (fntype);
3684 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3685 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3686 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3687 &mode, &nregs, NULL))
3688 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3689 }
3690 return;
3691 }
3692
3693 static void
3694 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3695 machine_mode mode,
3696 const_tree type,
3697 bool named)
3698 {
3699 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3700 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3701 {
3702 aarch64_layout_arg (pcum_v, mode, type, named);
3703 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3704 != (pcum->aapcs_stack_words != 0));
3705 pcum->aapcs_arg_processed = false;
3706 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3707 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3708 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3709 pcum->aapcs_stack_words = 0;
3710 pcum->aapcs_reg = NULL_RTX;
3711 }
3712 }
3713
3714 bool
3715 aarch64_function_arg_regno_p (unsigned regno)
3716 {
3717 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3718 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3719 }
3720
3721 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3722 PARM_BOUNDARY bits of alignment, but will be given anything up
3723 to STACK_BOUNDARY bits if the type requires it. This makes sure
3724 that both before and after the layout of each argument, the Next
3725 Stacked Argument Address (NSAA) will have a minimum alignment of
3726 8 bytes. */
3727
3728 static unsigned int
3729 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3730 {
3731 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3732 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3733 }
3734
3735 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3736
3737 static fixed_size_mode
3738 aarch64_get_reg_raw_mode (int regno)
3739 {
3740 if (TARGET_SVE && FP_REGNUM_P (regno))
3741 /* Don't use the SVE part of the register for __builtin_apply and
3742 __builtin_return. The SVE registers aren't used by the normal PCS,
3743 so using them there would be a waste of time. The PCS extensions
3744 for SVE types are fundamentally incompatible with the
3745 __builtin_return/__builtin_apply interface. */
3746 return as_a <fixed_size_mode> (V16QImode);
3747 return default_get_reg_raw_mode (regno);
3748 }
3749
3750 /* Implement TARGET_FUNCTION_ARG_PADDING.
3751
3752 Small aggregate types are placed in the lowest memory address.
3753
3754 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3755
3756 static pad_direction
3757 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3758 {
3759 /* On little-endian targets, the least significant byte of every stack
3760 argument is passed at the lowest byte address of the stack slot. */
3761 if (!BYTES_BIG_ENDIAN)
3762 return PAD_UPWARD;
3763
3764 /* Otherwise, integral, floating-point and pointer types are padded downward:
3765 the least significant byte of a stack argument is passed at the highest
3766 byte address of the stack slot. */
3767 if (type
3768 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3769 || POINTER_TYPE_P (type))
3770 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3771 return PAD_DOWNWARD;
3772
3773 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3774 return PAD_UPWARD;
3775 }
3776
3777 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3778
3779    It specifies the padding for the last (and possibly the only)
3780    element of a block move between registers and memory.  Assuming
3781    the block is in memory, padding upward means that the last
3782    element is padded after its most significant byte, while with
3783    downward padding the last element is padded at its least
3784    significant byte side.
3785
3786 Small aggregates and small complex types are always padded
3787 upwards.
3788
3789 We don't need to worry about homogeneous floating-point or
3790 short-vector aggregates; their move is not affected by the
3791 padding direction determined here. Regardless of endianness,
3792 each element of such an aggregate is put in the least
3793 significant bits of a fp/simd register.
3794
3795 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3796 register has useful data, and return the opposite if the most
3797 significant byte does. */
3798
3799 bool
3800 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3801 bool first ATTRIBUTE_UNUSED)
3802 {
3803
3804 /* Small composite types are always padded upward. */
3805 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3806 {
3807 HOST_WIDE_INT size;
3808 if (type)
3809 size = int_size_in_bytes (type);
3810 else
3811 /* No frontends can create types with variable-sized modes, so we
3812 shouldn't be asked to pass or return them. */
3813 size = GET_MODE_SIZE (mode).to_constant ();
3814 if (size < 2 * UNITS_PER_WORD)
3815 return true;
3816 }
3817
3818 /* Otherwise, use the default padding. */
3819 return !BYTES_BIG_ENDIAN;
3820 }
3821
3822 static scalar_int_mode
3823 aarch64_libgcc_cmp_return_mode (void)
3824 {
3825 return SImode;
3826 }
3827
3828 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3829
3830 /* We use the 12-bit shifted immediate arithmetic instructions so values
3831    must be a multiple of (1 << 12), i.e. 4096.  */
3832 #define ARITH_FACTOR 4096
3833
3834 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3835 #error Cannot use simple address calculation for stack probing
3836 #endif
3837
3838 /* The pair of scratch registers used for stack probing. */
3839 #define PROBE_STACK_FIRST_REG 9
3840 #define PROBE_STACK_SECOND_REG 10
3841
3842 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3843 inclusive. These are offsets from the current stack pointer. */
3844
3845 static void
3846 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3847 {
3848 HOST_WIDE_INT size;
3849 if (!poly_size.is_constant (&size))
3850 {
3851 sorry ("stack probes for SVE frames");
3852 return;
3853 }
3854
3855 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3856
3857 /* See the same assertion on PROBE_INTERVAL above. */
3858 gcc_assert ((first % ARITH_FACTOR) == 0);
3859
3860 /* See if we have a constant small number of probes to generate. If so,
3861 that's the easy case. */
3862 if (size <= PROBE_INTERVAL)
3863 {
3864 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3865
3866 emit_set_insn (reg1,
3867 plus_constant (Pmode,
3868 stack_pointer_rtx, -(first + base)));
3869 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3870 }
3871
3872   /* The run-time loop is made up of 8 insns in the generic case while the
3873      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
3874 else if (size <= 4 * PROBE_INTERVAL)
3875 {
3876 HOST_WIDE_INT i, rem;
3877
3878 emit_set_insn (reg1,
3879 plus_constant (Pmode,
3880 stack_pointer_rtx,
3881 -(first + PROBE_INTERVAL)));
3882 emit_stack_probe (reg1);
3883
3884 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3885 it exceeds SIZE. If only two probes are needed, this will not
3886 generate any code. Then probe at FIRST + SIZE. */
3887 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3888 {
3889 emit_set_insn (reg1,
3890 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3891 emit_stack_probe (reg1);
3892 }
3893
3894 rem = size - (i - PROBE_INTERVAL);
3895 if (rem > 256)
3896 {
3897 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3898
3899 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3900 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3901 }
3902 else
3903 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3904 }
3905
3906 /* Otherwise, do the same as above, but in a loop. Note that we must be
3907 extra careful with variables wrapping around because we might be at
3908 the very top (or the very bottom) of the address space and we have
3909 to be able to handle this case properly; in particular, we use an
3910 equality test for the loop condition. */
3911 else
3912 {
3913 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3914
3915 /* Step 1: round SIZE to the previous multiple of the interval. */
3916
3917 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3918
3919
3920 /* Step 2: compute initial and final value of the loop counter. */
3921
3922 /* TEST_ADDR = SP + FIRST. */
3923 emit_set_insn (reg1,
3924 plus_constant (Pmode, stack_pointer_rtx, -first));
3925
3926 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3927 HOST_WIDE_INT adjustment = - (first + rounded_size);
3928 if (! aarch64_uimm12_shift (adjustment))
3929 {
3930 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3931 true, Pmode);
3932 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3933 }
3934 else
3935 emit_set_insn (reg2,
3936 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3937
3938 /* Step 3: the loop
3939
3940 do
3941 {
3942 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3943 probe at TEST_ADDR
3944 }
3945 while (TEST_ADDR != LAST_ADDR)
3946
3947 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3948 until it is equal to ROUNDED_SIZE. */
3949
3950 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3951
3952
3953 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3954 that SIZE is equal to ROUNDED_SIZE. */
3955
3956 if (size != rounded_size)
3957 {
3958 HOST_WIDE_INT rem = size - rounded_size;
3959
3960 if (rem > 256)
3961 {
3962 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3963
3964 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3965 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3966 }
3967 else
3968 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3969 }
3970 }
3971
3972 /* Make sure nothing is scheduled before we are done. */
3973 emit_insn (gen_blockage ());
3974 }
3975
3976 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3977 absolute addresses. */
3978
3979 const char *
3980 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3981 {
3982 static int labelno = 0;
3983 char loop_lab[32];
3984 rtx xops[2];
3985
3986 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3987
3988 /* Loop. */
3989 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3990
3991 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3992 xops[0] = reg1;
3993 xops[1] = GEN_INT (PROBE_INTERVAL);
3994 output_asm_insn ("sub\t%0, %0, %1", xops);
3995
3996 /* Probe at TEST_ADDR. */
3997 output_asm_insn ("str\txzr, [%0]", xops);
3998
3999 /* Test if TEST_ADDR == LAST_ADDR. */
4000 xops[1] = reg2;
4001 output_asm_insn ("cmp\t%0, %1", xops);
4002
4003 /* Branch. */
4004 fputs ("\tb.ne\t", asm_out_file);
4005 assemble_name_raw (asm_out_file, loop_lab);
4006 fputc ('\n', asm_out_file);
4007
4008 return "";
4009 }
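/* For reference (illustrative only, not emitted verbatim): with the usual
   PROBE_INTERVAL of 4096 bytes (1 << 12) and the scratch registers x9/x10
   chosen above, the sequence printed by aarch64_output_probe_stack_range
   is roughly:

	.LPSRL0:
		sub	x9, x9, #4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/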
4010
4011 /* Determine whether a frame chain needs to be generated. */
4012 static bool
4013 aarch64_needs_frame_chain (void)
4014 {
4015 /* Force a frame chain for EH returns so the return address is at FP+8. */
4016 if (frame_pointer_needed || crtl->calls_eh_return)
4017 return true;
4018
4019 /* A leaf function cannot have calls or write LR. */
4020 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4021
4022 /* Don't use a frame chain in leaf functions if leaf frame pointers
4023 are disabled. */
4024 if (flag_omit_leaf_frame_pointer && is_leaf)
4025 return false;
4026
4027 return aarch64_use_frame_pointer;
4028 }
4029
4030 /* Mark the registers that need to be saved by the callee and calculate
4031 the size of the callee-saved registers area and frame record (both FP
4032 and LR may be omitted). */
4033 static void
4034 aarch64_layout_frame (void)
4035 {
4036 HOST_WIDE_INT offset = 0;
4037 int regno, last_fp_reg = INVALID_REGNUM;
4038
4039 if (reload_completed && cfun->machine->frame.laid_out)
4040 return;
4041
4042 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4043
4044 #define SLOT_NOT_REQUIRED (-2)
4045 #define SLOT_REQUIRED (-1)
4046
4047 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4048 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4049
4050 /* First mark all the registers that really need to be saved... */
4051 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4052 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4053
4054 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4055 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4056
4057 /* ... that includes the eh data registers (if needed)... */
4058 if (crtl->calls_eh_return)
4059 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4060 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4061 = SLOT_REQUIRED;
4062
4063 /* ... and any callee saved register that dataflow says is live. */
4064 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4065 if (df_regs_ever_live_p (regno)
4066 && (regno == R30_REGNUM
4067 || !call_used_regs[regno]))
4068 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4069
4070 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4071 if (df_regs_ever_live_p (regno)
4072 && !call_used_regs[regno])
4073 {
4074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4075 last_fp_reg = regno;
4076 }
4077
4078 if (cfun->machine->frame.emit_frame_chain)
4079 {
4080 /* FP and LR are placed in the linkage record. */
4081 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4082 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4083 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4084 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4085 offset = 2 * UNITS_PER_WORD;
4086 }
4087
4088 /* Now assign stack slots for them. */
4089 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4090 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4091 {
4092 cfun->machine->frame.reg_offset[regno] = offset;
4093 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4094 cfun->machine->frame.wb_candidate1 = regno;
4095 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4096 cfun->machine->frame.wb_candidate2 = regno;
4097 offset += UNITS_PER_WORD;
4098 }
4099
4100 HOST_WIDE_INT max_int_offset = offset;
4101 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4102 bool has_align_gap = offset != max_int_offset;
4103
4104 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4105 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4106 {
4107 /* If there is an alignment gap between integer and fp callee-saves,
4108 allocate the last fp register to it if possible. */
4109 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4110 {
4111 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4112 break;
4113 }
4114
4115 cfun->machine->frame.reg_offset[regno] = offset;
4116 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4117 cfun->machine->frame.wb_candidate1 = regno;
4118 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4119 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4120 cfun->machine->frame.wb_candidate2 = regno;
4121 offset += UNITS_PER_WORD;
4122 }
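  /* Illustrative example (added commentary): with a frame chain plus one
     integer callee-save (say x19), the integer area ends at offset 24 and
     is padded to 32, leaving an 8-byte gap.  If exactly one FP register
     (say d8) still needs a slot, the code above drops it into that gap at
     offset 24 instead of growing the save area by another 16 bytes.  */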
4123
4124 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4125
4126 cfun->machine->frame.saved_regs_size = offset;
4127
4128 HOST_WIDE_INT varargs_and_saved_regs_size
4129 = offset + cfun->machine->frame.saved_varargs_size;
4130
4131 cfun->machine->frame.hard_fp_offset
4132 = aligned_upper_bound (varargs_and_saved_regs_size
4133 + get_frame_size (),
4134 STACK_BOUNDARY / BITS_PER_UNIT);
4135
4136 /* Both these values are already aligned. */
4137 gcc_assert (multiple_p (crtl->outgoing_args_size,
4138 STACK_BOUNDARY / BITS_PER_UNIT));
4139 cfun->machine->frame.frame_size
4140 = (cfun->machine->frame.hard_fp_offset
4141 + crtl->outgoing_args_size);
4142
4143 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4144
4145 cfun->machine->frame.initial_adjust = 0;
4146 cfun->machine->frame.final_adjust = 0;
4147 cfun->machine->frame.callee_adjust = 0;
4148 cfun->machine->frame.callee_offset = 0;
4149
4150 HOST_WIDE_INT max_push_offset = 0;
4151 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4152 max_push_offset = 512;
4153 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4154 max_push_offset = 256;
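  /* Added note (an assumption about the encodings, not original text):
     512 and 256 roughly correspond to the pre-index writeback ranges of
     an STP of two X/D registers (7-bit signed immediate scaled by 8) and
     of a single STR with writeback (9-bit signed immediate).  */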
4155
4156 HOST_WIDE_INT const_size, const_fp_offset;
4157 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4158 && const_size < max_push_offset
4159 && known_eq (crtl->outgoing_args_size, 0))
4160 {
4161 /* Simple, small frame with no outgoing arguments:
4162 stp reg1, reg2, [sp, -frame_size]!
4163 stp reg3, reg4, [sp, 16] */
4164 cfun->machine->frame.callee_adjust = const_size;
4165 }
4166 else if (known_lt (crtl->outgoing_args_size
4167 + cfun->machine->frame.saved_regs_size, 512)
4168 && !(cfun->calls_alloca
4169 && known_lt (cfun->machine->frame.hard_fp_offset,
4170 max_push_offset)))
4171 {
4172 /* Frame with small outgoing arguments:
4173 sub sp, sp, frame_size
4174 stp reg1, reg2, [sp, outgoing_args_size]
4175 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4176 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4177 cfun->machine->frame.callee_offset
4178 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4179 }
4180 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4181 && const_fp_offset < max_push_offset)
4182 {
4183 /* Frame with large outgoing arguments but a small local area:
4184 stp reg1, reg2, [sp, -hard_fp_offset]!
4185 stp reg3, reg4, [sp, 16]
4186 sub sp, sp, outgoing_args_size */
4187 cfun->machine->frame.callee_adjust = const_fp_offset;
4188 cfun->machine->frame.final_adjust
4189 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4190 }
4191 else
4192 {
4193 /* Frame with large local area and outgoing arguments using frame pointer:
4194 sub sp, sp, hard_fp_offset
4195 stp x29, x30, [sp, 0]
4196 add x29, sp, 0
4197 stp reg3, reg4, [sp, 16]
4198 sub sp, sp, outgoing_args_size */
4199 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4200 cfun->machine->frame.final_adjust
4201 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4202 }
4203
4204 cfun->machine->frame.laid_out = true;
4205 }
4206
4207 /* Return true if the register REGNO is saved on entry to
4208 the current function. */
4209
4210 static bool
4211 aarch64_register_saved_on_entry (int regno)
4212 {
4213 return cfun->machine->frame.reg_offset[regno] >= 0;
4214 }
4215
4216 /* Return the next register, from REGNO up to LIMIT, that the callee
4217    needs to save.  */
4218
4219 static unsigned
4220 aarch64_next_callee_save (unsigned regno, unsigned limit)
4221 {
4222 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4223 regno ++;
4224 return regno;
4225 }
4226
4227 /* Push the register number REGNO of mode MODE to the stack with write-back
4228 adjusting the stack by ADJUSTMENT. */
4229
4230 static void
4231 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4232 HOST_WIDE_INT adjustment)
4233 {
4234 rtx base_rtx = stack_pointer_rtx;
4235 rtx insn, reg, mem;
4236
4237 reg = gen_rtx_REG (mode, regno);
4238 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4239 plus_constant (Pmode, base_rtx, -adjustment));
4240 mem = gen_frame_mem (mode, mem);
4241
4242 insn = emit_move_insn (mem, reg);
4243 RTX_FRAME_RELATED_P (insn) = 1;
4244 }
4245
4246 /* Generate and return an instruction to store the pair of registers
4247 REG and REG2 of mode MODE to location BASE with write-back adjusting
4248 the stack location BASE by ADJUSTMENT. */
4249
4250 static rtx
4251 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4252 HOST_WIDE_INT adjustment)
4253 {
4254 switch (mode)
4255 {
4256 case E_DImode:
4257 return gen_storewb_pairdi_di (base, base, reg, reg2,
4258 GEN_INT (-adjustment),
4259 GEN_INT (UNITS_PER_WORD - adjustment));
4260 case E_DFmode:
4261 return gen_storewb_pairdf_di (base, base, reg, reg2,
4262 GEN_INT (-adjustment),
4263 GEN_INT (UNITS_PER_WORD - adjustment));
4264 default:
4265 gcc_unreachable ();
4266 }
4267 }
4268
4269 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4270 stack pointer by ADJUSTMENT. */
4271
4272 static void
4273 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4274 {
4275 rtx_insn *insn;
4276 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4277
4278 if (regno2 == INVALID_REGNUM)
4279 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4280
4281 rtx reg1 = gen_rtx_REG (mode, regno1);
4282 rtx reg2 = gen_rtx_REG (mode, regno2);
4283
4284 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4285 reg2, adjustment));
4286 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4287 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4288 RTX_FRAME_RELATED_P (insn) = 1;
4289 }
4290
4291 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4292 adjusting it by ADJUSTMENT afterwards. */
4293
4294 static rtx
4295 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4296 HOST_WIDE_INT adjustment)
4297 {
4298 switch (mode)
4299 {
4300 case E_DImode:
4301 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4302 GEN_INT (UNITS_PER_WORD));
4303 case E_DFmode:
4304 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4305 GEN_INT (UNITS_PER_WORD));
4306 default:
4307 gcc_unreachable ();
4308 }
4309 }
4310
4311 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4312 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4313 into CFI_OPS. */
4314
4315 static void
4316 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4317 rtx *cfi_ops)
4318 {
4319 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4320 rtx reg1 = gen_rtx_REG (mode, regno1);
4321
4322 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4323
4324 if (regno2 == INVALID_REGNUM)
4325 {
4326 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4327 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4328 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4329 }
4330 else
4331 {
4332 rtx reg2 = gen_rtx_REG (mode, regno2);
4333 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4334 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4335 reg2, adjustment));
4336 }
4337 }
4338
4339 /* Generate and return a store pair instruction of mode MODE to store
4340 register REG1 to MEM1 and register REG2 to MEM2. */
4341
4342 static rtx
4343 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4344 rtx reg2)
4345 {
4346 switch (mode)
4347 {
4348 case E_DImode:
4349 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4350
4351 case E_DFmode:
4352 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4353
4354 default:
4355 gcc_unreachable ();
4356 }
4357 }
4358
4359 /* Generate and return a load pair instruction of mode MODE to load register
4360 REG1 from MEM1 and register REG2 from MEM2. */
4361
4362 static rtx
4363 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4364 rtx mem2)
4365 {
4366 switch (mode)
4367 {
4368 case E_DImode:
4369 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4370
4371 case E_DFmode:
4372 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4373
4374 default:
4375 gcc_unreachable ();
4376 }
4377 }
4378
4379 /* Return TRUE if return address signing should be enabled for the current
4380 function, otherwise return FALSE. */
4381
4382 bool
4383 aarch64_return_address_signing_enabled (void)
4384 {
4385   /* This function should only be called after the frame is laid out.  */
4386 gcc_assert (cfun->machine->frame.laid_out);
4387
4388   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4389      function if its LR is pushed onto the stack.  */
4390 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4391 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4392 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4393 }
4394
4395 /* Emit code to save the callee-saved registers from register number START
4396 to LIMIT to the stack at the location starting at offset START_OFFSET,
4397 skipping any write-back candidates if SKIP_WB is true. */
4398
4399 static void
4400 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4401 unsigned start, unsigned limit, bool skip_wb)
4402 {
4403 rtx_insn *insn;
4404 unsigned regno;
4405 unsigned regno2;
4406
4407 for (regno = aarch64_next_callee_save (start, limit);
4408 regno <= limit;
4409 regno = aarch64_next_callee_save (regno + 1, limit))
4410 {
4411 rtx reg, mem;
4412 poly_int64 offset;
4413
4414 if (skip_wb
4415 && (regno == cfun->machine->frame.wb_candidate1
4416 || regno == cfun->machine->frame.wb_candidate2))
4417 continue;
4418
4419 if (cfun->machine->reg_is_wrapped_separately[regno])
4420 continue;
4421
4422 reg = gen_rtx_REG (mode, regno);
4423 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4424 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4425 offset));
4426
4427 regno2 = aarch64_next_callee_save (regno + 1, limit);
4428
4429 if (regno2 <= limit
4430 && !cfun->machine->reg_is_wrapped_separately[regno2]
4431 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4432 == cfun->machine->frame.reg_offset[regno2]))
4433
4434 {
4435 rtx reg2 = gen_rtx_REG (mode, regno2);
4436 rtx mem2;
4437
4438 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4439 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4440 offset));
4441 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4442 reg2));
4443
4444 /* The first part of a frame-related parallel insn is
4445 always assumed to be relevant to the frame
4446 	     calculations; subsequent parts are only
4447 frame-related if explicitly marked. */
4448 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4449 regno = regno2;
4450 }
4451 else
4452 insn = emit_move_insn (mem, reg);
4453
4454 RTX_FRAME_RELATED_P (insn) = 1;
4455 }
4456 }
4457
4458 /* Emit code to restore the callee registers of mode MODE from register
4459 number START up to and including LIMIT. Restore from the stack offset
4460 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4461 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4462
4463 static void
4464 aarch64_restore_callee_saves (machine_mode mode,
4465 poly_int64 start_offset, unsigned start,
4466 unsigned limit, bool skip_wb, rtx *cfi_ops)
4467 {
4468 rtx base_rtx = stack_pointer_rtx;
4469 unsigned regno;
4470 unsigned regno2;
4471 poly_int64 offset;
4472
4473 for (regno = aarch64_next_callee_save (start, limit);
4474 regno <= limit;
4475 regno = aarch64_next_callee_save (regno + 1, limit))
4476 {
4477 if (cfun->machine->reg_is_wrapped_separately[regno])
4478 continue;
4479
4480 rtx reg, mem;
4481
4482 if (skip_wb
4483 && (regno == cfun->machine->frame.wb_candidate1
4484 || regno == cfun->machine->frame.wb_candidate2))
4485 continue;
4486
4487 reg = gen_rtx_REG (mode, regno);
4488 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4489 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4490
4491 regno2 = aarch64_next_callee_save (regno + 1, limit);
4492
4493 if (regno2 <= limit
4494 && !cfun->machine->reg_is_wrapped_separately[regno2]
4495 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4496 == cfun->machine->frame.reg_offset[regno2]))
4497 {
4498 rtx reg2 = gen_rtx_REG (mode, regno2);
4499 rtx mem2;
4500
4501 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4502 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4503 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4504
4505 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4506 regno = regno2;
4507 }
4508 else
4509 emit_move_insn (reg, mem);
4510 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4511 }
4512 }
4513
4514 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4515 of MODE. */
4516
4517 static inline bool
4518 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4519 {
4520 HOST_WIDE_INT multiple;
4521 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4522 && IN_RANGE (multiple, -8, 7));
4523 }
4524
4525 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4526 of MODE. */
4527
4528 static inline bool
4529 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4530 {
4531 HOST_WIDE_INT multiple;
4532 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4533 && IN_RANGE (multiple, 0, 63));
4534 }
4535
4536 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4537 of MODE. */
4538
4539 bool
4540 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4541 {
4542 HOST_WIDE_INT multiple;
4543 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4544 && IN_RANGE (multiple, -64, 63));
4545 }
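/* For instance (illustrative): with MODE == DImode the accepted offsets are
   the multiples of 8 in [-512, 504], so 504 (63 * 8) is accepted while 508
   (not a multiple of 8) and 512 (multiple 64, out of range) are not.  */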
4546
4547 /* Return true if OFFSET is a signed 9-bit value. */
4548
4549 static inline bool
4550 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4551 poly_int64 offset)
4552 {
4553 HOST_WIDE_INT const_offset;
4554 return (offset.is_constant (&const_offset)
4555 && IN_RANGE (const_offset, -256, 255));
4556 }
4557
4558 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4559 of MODE. */
4560
4561 static inline bool
4562 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4563 {
4564 HOST_WIDE_INT multiple;
4565 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4566 && IN_RANGE (multiple, -256, 255));
4567 }
4568
4569 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4570 of MODE. */
4571
4572 static inline bool
4573 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4574 {
4575 HOST_WIDE_INT multiple;
4576 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4577 && IN_RANGE (multiple, 0, 4095));
4578 }
4579
4580 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4581
4582 static sbitmap
4583 aarch64_get_separate_components (void)
4584 {
4585 aarch64_layout_frame ();
4586
4587 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4588 bitmap_clear (components);
4589
4590 /* The registers we need saved to the frame. */
4591 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4592 if (aarch64_register_saved_on_entry (regno))
4593 {
4594 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4595 if (!frame_pointer_needed)
4596 offset += cfun->machine->frame.frame_size
4597 - cfun->machine->frame.hard_fp_offset;
4598 /* Check that we can access the stack slot of the register with one
4599 direct load with no adjustments needed. */
4600 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4601 bitmap_set_bit (components, regno);
4602 }
4603
4604 /* Don't mess with the hard frame pointer. */
4605 if (frame_pointer_needed)
4606 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4607
4608 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4609 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4610 /* If aarch64_layout_frame has chosen registers to store/restore with
4611 writeback don't interfere with them to avoid having to output explicit
4612 stack adjustment instructions. */
4613 if (reg2 != INVALID_REGNUM)
4614 bitmap_clear_bit (components, reg2);
4615 if (reg1 != INVALID_REGNUM)
4616 bitmap_clear_bit (components, reg1);
4617
4618 bitmap_clear_bit (components, LR_REGNUM);
4619 bitmap_clear_bit (components, SP_REGNUM);
4620
4621 return components;
4622 }
4623
4624 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4625
4626 static sbitmap
4627 aarch64_components_for_bb (basic_block bb)
4628 {
4629 bitmap in = DF_LIVE_IN (bb);
4630 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4631 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4632
4633 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4634 bitmap_clear (components);
4635
4636 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4637 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4638 if ((!call_used_regs[regno])
4639 && (bitmap_bit_p (in, regno)
4640 || bitmap_bit_p (gen, regno)
4641 || bitmap_bit_p (kill, regno)))
4642 {
4643 unsigned regno2, offset, offset2;
4644 bitmap_set_bit (components, regno);
4645
4646 	/* If there is a callee-save at an adjacent offset, add it as well
4647 	   to increase the use of LDP/STP.  */
4648 offset = cfun->machine->frame.reg_offset[regno];
4649 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
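	/* Illustrative example (added commentary): if x24 was assigned slot
	   offset 16 above, its candidate partner is x25; when x25 sits at
	   offset 24 the two offsets differ only in bit 3, so both registers
	   are marked and can later be handled by a single LDP/STP.  */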
4650
4651 if (regno2 <= LAST_SAVED_REGNUM)
4652 {
4653 offset2 = cfun->machine->frame.reg_offset[regno2];
4654 if ((offset & ~8) == (offset2 & ~8))
4655 bitmap_set_bit (components, regno2);
4656 }
4657 }
4658
4659 return components;
4660 }
4661
4662 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4663 Nothing to do for aarch64. */
4664
4665 static void
4666 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4667 {
4668 }
4669
4670 /* Return the next set bit in BMP from START onwards. Return the total number
4671 of bits in BMP if no set bit is found at or after START. */
4672
4673 static unsigned int
4674 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4675 {
4676 unsigned int nbits = SBITMAP_SIZE (bmp);
4677 if (start == nbits)
4678 return start;
4679
4680 gcc_assert (start < nbits);
4681 for (unsigned int i = start; i < nbits; i++)
4682 if (bitmap_bit_p (bmp, i))
4683 return i;
4684
4685 return nbits;
4686 }
4687
4688 /* Do the work for aarch64_emit_prologue_components and
4689 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4690 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4691 for these components or the epilogue sequence. That is, it determines
4692 whether we should emit stores or loads and what kind of CFA notes to attach
4693 to the insns. Otherwise the logic for the two sequences is very
4694 similar. */
4695
4696 static void
4697 aarch64_process_components (sbitmap components, bool prologue_p)
4698 {
4699 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4700 ? HARD_FRAME_POINTER_REGNUM
4701 : STACK_POINTER_REGNUM);
4702
4703 unsigned last_regno = SBITMAP_SIZE (components);
4704 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4705 rtx_insn *insn = NULL;
4706
4707 while (regno != last_regno)
4708 {
4709 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4710 so DFmode for the vector registers is enough. */
4711 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4712 rtx reg = gen_rtx_REG (mode, regno);
4713 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4714 if (!frame_pointer_needed)
4715 offset += cfun->machine->frame.frame_size
4716 - cfun->machine->frame.hard_fp_offset;
4717 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4718 rtx mem = gen_frame_mem (mode, addr);
4719
4720 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4721 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4722 /* No more registers to handle after REGNO.
4723 Emit a single save/restore and exit. */
4724 if (regno2 == last_regno)
4725 {
4726 insn = emit_insn (set);
4727 RTX_FRAME_RELATED_P (insn) = 1;
4728 if (prologue_p)
4729 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4730 else
4731 add_reg_note (insn, REG_CFA_RESTORE, reg);
4732 break;
4733 }
4734
4735 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4736 /* The next register is not of the same class or its offset is not
4737 mergeable with the current one into a pair. */
4738 if (!satisfies_constraint_Ump (mem)
4739 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4740 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4741 GET_MODE_SIZE (mode)))
4742 {
4743 insn = emit_insn (set);
4744 RTX_FRAME_RELATED_P (insn) = 1;
4745 if (prologue_p)
4746 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4747 else
4748 add_reg_note (insn, REG_CFA_RESTORE, reg);
4749
4750 regno = regno2;
4751 continue;
4752 }
4753
4754 /* REGNO2 can be saved/restored in a pair with REGNO. */
4755 rtx reg2 = gen_rtx_REG (mode, regno2);
4756 if (!frame_pointer_needed)
4757 offset2 += cfun->machine->frame.frame_size
4758 - cfun->machine->frame.hard_fp_offset;
4759 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4760 rtx mem2 = gen_frame_mem (mode, addr2);
4761 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4762 : gen_rtx_SET (reg2, mem2);
4763
4764 if (prologue_p)
4765 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4766 else
4767 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4768
4769 RTX_FRAME_RELATED_P (insn) = 1;
4770 if (prologue_p)
4771 {
4772 add_reg_note (insn, REG_CFA_OFFSET, set);
4773 add_reg_note (insn, REG_CFA_OFFSET, set2);
4774 }
4775 else
4776 {
4777 add_reg_note (insn, REG_CFA_RESTORE, reg);
4778 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4779 }
4780
4781 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4782 }
4783 }
4784
4785 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4786
4787 static void
4788 aarch64_emit_prologue_components (sbitmap components)
4789 {
4790 aarch64_process_components (components, true);
4791 }
4792
4793 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4794
4795 static void
4796 aarch64_emit_epilogue_components (sbitmap components)
4797 {
4798 aarch64_process_components (components, false);
4799 }
4800
4801 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4802
4803 static void
4804 aarch64_set_handled_components (sbitmap components)
4805 {
4806 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4807 if (bitmap_bit_p (components, regno))
4808 cfun->machine->reg_is_wrapped_separately[regno] = true;
4809 }
4810
4811 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4812 is saved at BASE + OFFSET. */
4813
4814 static void
4815 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4816 rtx base, poly_int64 offset)
4817 {
4818 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4819 add_reg_note (insn, REG_CFA_EXPRESSION,
4820 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4821 }
4822
4823 /* AArch64 stack frames generated by this compiler look like:
4824
4825 +-------------------------------+
4826 | |
4827 | incoming stack arguments |
4828 | |
4829 +-------------------------------+
4830 | | <-- incoming stack pointer (aligned)
4831 | callee-allocated save area |
4832 | for register varargs |
4833 | |
4834 +-------------------------------+
4835 | local variables | <-- frame_pointer_rtx
4836 | |
4837 +-------------------------------+
4838 | padding0 | \
4839 +-------------------------------+ |
4840 | callee-saved registers | | frame.saved_regs_size
4841 +-------------------------------+ |
4842 | LR' | |
4843 +-------------------------------+ |
4844 | FP' | / <- hard_frame_pointer_rtx (aligned)
4845 +-------------------------------+
4846 | dynamic allocation |
4847 +-------------------------------+
4848 | padding |
4849 +-------------------------------+
4850 | outgoing stack arguments | <-- arg_pointer
4851 | |
4852 +-------------------------------+
4853 | | <-- stack_pointer_rtx (aligned)
4854
4855 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4856 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4857 unchanged. */
4858
4859 /* Generate the prologue instructions for entry into a function.
4860 Establish the stack frame by decreasing the stack pointer with a
4861 properly calculated size and, if necessary, create a frame record
4862 filled with the values of LR and previous frame pointer. The
4863 current FP is also set up if it is in use. */
4864
4865 void
4866 aarch64_expand_prologue (void)
4867 {
4868 aarch64_layout_frame ();
4869
4870 poly_int64 frame_size = cfun->machine->frame.frame_size;
4871 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4872 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4873 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4874 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4875 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4876 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4877 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4878 rtx_insn *insn;
4879
4880 /* Sign return address for functions. */
4881 if (aarch64_return_address_signing_enabled ())
4882 {
4883 insn = emit_insn (gen_pacisp ());
4884 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4885 RTX_FRAME_RELATED_P (insn) = 1;
4886 }
4887
4888 if (flag_stack_usage_info)
4889 current_function_static_stack_size = constant_lower_bound (frame_size);
4890
4891 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4892 {
4893 if (crtl->is_leaf && !cfun->calls_alloca)
4894 {
4895 if (maybe_gt (frame_size, PROBE_INTERVAL)
4896 && maybe_gt (frame_size, get_stack_check_protect ()))
4897 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4898 (frame_size
4899 - get_stack_check_protect ()));
4900 }
4901 else if (maybe_gt (frame_size, 0))
4902 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4903 }
4904
4905 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4906 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4907
4908 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4909
4910 if (callee_adjust != 0)
4911 aarch64_push_regs (reg1, reg2, callee_adjust);
4912
4913 if (emit_frame_chain)
4914 {
4915 poly_int64 reg_offset = callee_adjust;
4916 if (callee_adjust == 0)
4917 {
4918 reg1 = R29_REGNUM;
4919 reg2 = R30_REGNUM;
4920 reg_offset = callee_offset;
4921 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4922 }
4923 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4924 stack_pointer_rtx, callee_offset,
4925 ip1_rtx, ip0_rtx, frame_pointer_needed);
4926 if (frame_pointer_needed && !frame_size.is_constant ())
4927 {
4928 /* Variable-sized frames need to describe the save slot
4929 address using DW_CFA_expression rather than DW_CFA_offset.
4930 This means that, without taking further action, the
4931 locations of the registers that we've already saved would
4932 remain based on the stack pointer even after we redefine
4933 the CFA based on the frame pointer. We therefore need new
4934 DW_CFA_expressions to re-express the save slots with addresses
4935 based on the frame pointer. */
4936 rtx_insn *insn = get_last_insn ();
4937 gcc_assert (RTX_FRAME_RELATED_P (insn));
4938
4939 /* Add an explicit CFA definition if this was previously
4940 implicit. */
4941 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4942 {
4943 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4944 callee_offset);
4945 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4946 gen_rtx_SET (hard_frame_pointer_rtx, src));
4947 }
4948
4949 /* Change the save slot expressions for the registers that
4950 we've already saved. */
4951 reg_offset -= callee_offset;
4952 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4953 reg_offset + UNITS_PER_WORD);
4954 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4955 reg_offset);
4956 }
4957 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4958 }
4959
4960 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4961 callee_adjust != 0 || emit_frame_chain);
4962 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4963 callee_adjust != 0 || emit_frame_chain);
4964 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4965 }
4966
4967 /* Return TRUE if we can use a simple_return insn.
4968
4969    This function checks whether the callee-saved stack is empty, which
4970    means no restore actions are needed.  The pro_and_epilogue pass uses
4971    this to check whether the shrink-wrapping optimization is feasible.  */
4972
4973 bool
4974 aarch64_use_return_insn_p (void)
4975 {
4976 if (!reload_completed)
4977 return false;
4978
4979 if (crtl->profile)
4980 return false;
4981
4982 aarch64_layout_frame ();
4983
4984 return known_eq (cfun->machine->frame.frame_size, 0);
4985 }
4986
4987 /* Generate the epilogue instructions for returning from a function.
4988 This is almost exactly the reverse of the prolog sequence, except
4989 that we need to insert barriers to avoid scheduling loads that read
4990 from a deallocated stack, and we optimize the unwind records by
4991 emitting them all together if possible. */
4992 void
4993 aarch64_expand_epilogue (bool for_sibcall)
4994 {
4995 aarch64_layout_frame ();
4996
4997 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4998 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4999 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5000 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5001 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5002 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5003 rtx cfi_ops = NULL;
5004 rtx_insn *insn;
5005 /* A stack clash protection prologue may not have left IP0_REGNUM or
5006 IP1_REGNUM in a usable state. The same is true for allocations
5007 with an SVE component, since we then need both temporary registers
5008 for each allocation. */
5009 bool can_inherit_p = (initial_adjust.is_constant ()
5010 && final_adjust.is_constant ()
5011 && !flag_stack_clash_protection);
5012
5013   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
5014 bool need_barrier_p
5015 = maybe_ne (get_frame_size ()
5016 + cfun->machine->frame.saved_varargs_size, 0);
5017
5018 /* Emit a barrier to prevent loads from a deallocated stack. */
5019 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5020 || cfun->calls_alloca
5021 || crtl->calls_eh_return)
5022 {
5023 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5024 need_barrier_p = false;
5025 }
5026
5027 /* Restore the stack pointer from the frame pointer if it may not
5028 be the same as the stack pointer. */
5029 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5030 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5031 if (frame_pointer_needed
5032 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5033 /* If writeback is used when restoring callee-saves, the CFA
5034 is restored on the instruction doing the writeback. */
5035 aarch64_add_offset (Pmode, stack_pointer_rtx,
5036 hard_frame_pointer_rtx, -callee_offset,
5037 ip1_rtx, ip0_rtx, callee_adjust == 0);
5038 else
5039 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5040 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5041
5042 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5043 callee_adjust != 0, &cfi_ops);
5044 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5045 callee_adjust != 0, &cfi_ops);
5046
5047 if (need_barrier_p)
5048 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5049
5050 if (callee_adjust != 0)
5051 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5052
5053 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5054 {
5055 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5056 insn = get_last_insn ();
5057 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5058 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5059 RTX_FRAME_RELATED_P (insn) = 1;
5060 cfi_ops = NULL;
5061 }
5062
5063 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5064 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5065
5066 if (cfi_ops)
5067 {
5068 /* Emit delayed restores and reset the CFA to be SP. */
5069 insn = get_last_insn ();
5070 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5071 REG_NOTES (insn) = cfi_ops;
5072 RTX_FRAME_RELATED_P (insn) = 1;
5073 }
5074
5075   /* We prefer to emit the combined return/authenticate instruction RETAA;
5076      however, there are three cases in which we must instead emit an explicit
5077 authentication instruction.
5078
5079 1) Sibcalls don't return in a normal way, so if we're about to call one
5080 we must authenticate.
5081
5082 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5083 generating code for !TARGET_ARMV8_3 we can't use it and must
5084 explicitly authenticate.
5085
5086 3) On an eh_return path we make extra stack adjustments to update the
5087 canonical frame address to be the exception handler's CFA. We want
5088 to authenticate using the CFA of the function which calls eh_return.
5089 */
5090 if (aarch64_return_address_signing_enabled ()
5091 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5092 {
5093 insn = emit_insn (gen_autisp ());
5094 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5095 RTX_FRAME_RELATED_P (insn) = 1;
5096 }
5097
5098 /* Stack adjustment for exception handler. */
5099 if (crtl->calls_eh_return)
5100 {
5101 /* We need to unwind the stack by the offset computed by
5102 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5103 to be SP; letting the CFA move during this adjustment
5104 is just as correct as retaining the CFA from the body
5105 of the function. Therefore, do nothing special. */
5106 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5107 }
5108
5109 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5110 if (!for_sibcall)
5111 emit_jump_insn (ret_rtx);
5112 }
5113
5114 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5115 normally or return to a previous frame after unwinding.
5116
5117 An EH return uses a single shared return sequence. The epilogue is
5118 exactly like a normal epilogue except that it has an extra input
5119 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5120 that must be applied after the frame has been destroyed. An extra label
5121 is inserted before the epilogue which initializes this register to zero,
5122 and this is the entry point for a normal return.
5123
5124 An actual EH return updates the return address, initializes the stack
5125 adjustment and jumps directly into the epilogue (bypassing the zeroing
5126 of the adjustment). Since the return address is typically saved on the
5127 stack when a function makes a call, the saved LR must be updated outside
5128 the epilogue.
5129
5130 This poses problems as the store is generated well before the epilogue,
5131 so the offset of LR is not known yet. Also optimizations will remove the
5132 store as it appears dead, even after the epilogue is generated (as the
5133 base or offset for loading LR is different in many cases).
5134
5135 To avoid these problems this implementation forces the frame pointer
5136 in eh_return functions so that the location of LR is fixed and known early.
5137 It also marks the store volatile, so no optimization is permitted to
5138 remove the store. */
5139 rtx
5140 aarch64_eh_return_handler_rtx (void)
5141 {
5142 rtx tmp = gen_frame_mem (Pmode,
5143 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5144
5145 /* Mark the store volatile, so no optimization is permitted to remove it. */
5146 MEM_VOLATILE_P (tmp) = true;
5147 return tmp;
5148 }
5149
5150 /* Output code to add DELTA to the first argument, and then jump
5151 to FUNCTION. Used for C++ multiple inheritance. */
5152 static void
5153 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5154 HOST_WIDE_INT delta,
5155 HOST_WIDE_INT vcall_offset,
5156 tree function)
5157 {
5158 /* The this pointer is always in x0. Note that this differs from
5159      Arm, where the this pointer may be bumped to r1 if r0 is required
5160 to return a pointer to an aggregate. On AArch64 a result value
5161 pointer will be in x8. */
5162 int this_regno = R0_REGNUM;
5163 rtx this_rtx, temp0, temp1, addr, funexp;
5164 rtx_insn *insn;
5165
5166 reload_completed = 1;
5167 emit_note (NOTE_INSN_PROLOGUE_END);
5168
5169 this_rtx = gen_rtx_REG (Pmode, this_regno);
5170 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5171 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5172
5173 if (vcall_offset == 0)
5174 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5175 else
5176 {
5177 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5178
5179 addr = this_rtx;
5180 if (delta != 0)
5181 {
5182 if (delta >= -256 && delta < 256)
5183 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5184 plus_constant (Pmode, this_rtx, delta));
5185 else
5186 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5187 temp1, temp0, false);
5188 }
5189
5190 if (Pmode == ptr_mode)
5191 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5192 else
5193 aarch64_emit_move (temp0,
5194 gen_rtx_ZERO_EXTEND (Pmode,
5195 gen_rtx_MEM (ptr_mode, addr)));
5196
5197 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5198 addr = plus_constant (Pmode, temp0, vcall_offset);
5199 else
5200 {
5201 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5202 Pmode);
5203 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5204 }
5205
5206 if (Pmode == ptr_mode)
5207 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5208 else
5209 aarch64_emit_move (temp1,
5210 gen_rtx_SIGN_EXTEND (Pmode,
5211 gen_rtx_MEM (ptr_mode, addr)));
5212
5213 emit_insn (gen_add2_insn (this_rtx, temp1));
5214 }
5215
5216 /* Generate a tail call to the target function. */
5217 if (!TREE_USED (function))
5218 {
5219 assemble_external (function);
5220 TREE_USED (function) = 1;
5221 }
5222 funexp = XEXP (DECL_RTL (function), 0);
5223 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5224 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5225 SIBLING_CALL_P (insn) = 1;
5226
5227 insn = get_insns ();
5228 shorten_branches (insn);
5229 final_start_function (insn, file, 1);
5230 final (insn, file, 1);
5231 final_end_function ();
5232
5233 /* Stop pretending to be a post-reload pass. */
5234 reload_completed = 0;
5235 }
5236
5237 static bool
5238 aarch64_tls_referenced_p (rtx x)
5239 {
5240 if (!TARGET_HAVE_TLS)
5241 return false;
5242 subrtx_iterator::array_type array;
5243 FOR_EACH_SUBRTX (iter, array, x, ALL)
5244 {
5245 const_rtx x = *iter;
5246 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5247 return true;
5248 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5249 TLS offsets, not real symbol references. */
5250 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5251 iter.skip_subrtxes ();
5252 }
5253 return false;
5254 }
5255
5256
5257 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5258 a left shift of 0 or 12 bits. */
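/* For example, 0xabc (shift 0) and 0xabc000 (shift 12) can be encoded,
   whereas 0xabc001 cannot, since its nonzero bits do not fit in a single
   12-bit field at either shift. */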
5259 bool
5260 aarch64_uimm12_shift (HOST_WIDE_INT val)
5261 {
5262 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5263 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5264 );
5265 }
5266
5267
5268 /* Return true if val is an immediate that can be loaded into a
5269 register by a MOVZ instruction. */
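/* For example, 0x12340000 has its only nonzero 16-bit chunk at bit 16 and
   so can be materialized by a single MOVZ, whereas 0x12340001 cannot. */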
5270 static bool
5271 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5272 {
5273 if (GET_MODE_SIZE (mode) > 4)
5274 {
5275 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5276 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5277 return true;
5278 }
5279 else
5280 {
5281 /* Ignore sign extension. */
5282 val &= (HOST_WIDE_INT) 0xffffffff;
5283 }
5284 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5285 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5286 }
5287
5288 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5289 64-bit (DImode) integer. */
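/* For example, a QImode value of 0xab is replicated to
   0xabababababababab. */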
5290
5291 static unsigned HOST_WIDE_INT
5292 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5293 {
5294 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5295 while (size < 64)
5296 {
5297 val &= (HOST_WIDE_INT_1U << size) - 1;
5298 val |= val << size;
5299 size *= 2;
5300 }
5301 return val;
5302 }
5303
5304 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5305
5306 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5307 {
5308 0x0000000100000001ull,
5309 0x0001000100010001ull,
5310 0x0101010101010101ull,
5311 0x1111111111111111ull,
5312 0x5555555555555555ull,
5313 };
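/* With a 32-bit int, __builtin_clz (32) == 26 and __builtin_clz (2) == 30,
   so the index __builtin_clz (bits) - 26 used in aarch64_bitmask_imm below
   maps the widths 32, 16, 8, 4 and 2 to entries 0 through 4 of this table. */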
5314
5315
5316 /* Return true if val is a valid bitmask immediate. */
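/* A bitmask immediate is a run of consecutive ones, rotated and then
   replicated across the register in elements of 2, 4, 8, 16, 32 or 64 bits,
   excluding all-zeros and all-ones.  For example, 0x00ff00ff00ff00ff is a
   valid bitmask immediate, while 0x1234 is not. */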
5317
5318 bool
5319 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5320 {
5321 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5322 int bits;
5323
5324 /* Check for a single sequence of one bits and return quickly if so.
5325 The special cases of all ones and all zeroes return false. */
5326 val = aarch64_replicate_bitmask_imm (val_in, mode);
5327 tmp = val + (val & -val);
5328
5329 if (tmp == (tmp & -tmp))
5330 return (val + 1) > 1;
5331
5332 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5333 if (mode == SImode)
5334 val = (val << 32) | (val & 0xffffffff);
5335
5336 /* Invert if the immediate doesn't start with a zero bit - this means we
5337 only need to search for sequences of one bits. */
5338 if (val & 1)
5339 val = ~val;
5340
5341 /* Find the first set bit and set tmp to val with the first sequence of one
5342 bits removed. Return success if there is a single sequence of ones. */
5343 first_one = val & -val;
5344 tmp = val & (val + first_one);
5345
5346 if (tmp == 0)
5347 return true;
5348
5349 /* Find the next set bit and compute the difference in bit position. */
5350 next_one = tmp & -tmp;
5351 bits = clz_hwi (first_one) - clz_hwi (next_one);
5352 mask = val ^ tmp;
5353
5354 /* Check that the bit position difference is a power of 2, and that the
5355 first sequence of one bits fits within 'bits' bits. */
5356 if ((mask >> bits) != 0 || bits != (bits & -bits))
5357 return false;
5358
5359 /* Check that the sequence of one bits is repeated 64/bits times. */
5360 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5361 }
5362
5363 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5364 Assumed precondition: VAL_IN is not zero. */
5365
5366 unsigned HOST_WIDE_INT
5367 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5368 {
5369 int lowest_bit_set = ctz_hwi (val_in);
5370 int highest_bit_set = floor_log2 (val_in);
5371 gcc_assert (val_in != 0);
5372
5373 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5374 (HOST_WIDE_INT_1U << lowest_bit_set));
5375 }
5376
5377 /* Create a constant in which all bits outside the range from the lowest
5378 set bit to the highest set bit of VAL_IN are set to 1. */
5379
5380 unsigned HOST_WIDE_INT
5381 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5382 {
5383 return val_in | ~aarch64_and_split_imm1 (val_in);
5384 }
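/* Note that aarch64_and_split_imm1 (v) & aarch64_and_split_imm2 (v) == v,
   so an AND with a constant that is not itself a bitmask immediate can be
   performed as two successive ANDs when both halves are valid bitmask
   immediates; aarch64_and_bitmask_imm below tests for that case. */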
5385
5386 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5387
5388 bool
5389 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5390 {
5391 scalar_int_mode int_mode;
5392 if (!is_a <scalar_int_mode> (mode, &int_mode))
5393 return false;
5394
5395 if (aarch64_bitmask_imm (val_in, int_mode))
5396 return false;
5397
5398 if (aarch64_move_imm (val_in, int_mode))
5399 return false;
5400
5401 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5402
5403 return aarch64_bitmask_imm (imm2, int_mode);
5404 }
5405
5406 /* Return true if val is an immediate that can be loaded into a
5407 register in a single instruction. */
5408 bool
5409 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5410 {
5411 scalar_int_mode int_mode;
5412 if (!is_a <scalar_int_mode> (mode, &int_mode))
5413 return false;
5414
5415 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5416 return true;
5417 return aarch64_bitmask_imm (val, int_mode);
5418 }
5419
5420 static bool
5421 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5422 {
5423 rtx base, offset;
5424
5425 if (GET_CODE (x) == HIGH)
5426 return true;
5427
5428 /* There's no way to calculate VL-based values using relocations. */
5429 subrtx_iterator::array_type array;
5430 FOR_EACH_SUBRTX (iter, array, x, ALL)
5431 if (GET_CODE (*iter) == CONST_POLY_INT)
5432 return true;
5433
5434 split_const (x, &base, &offset);
5435 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5436 {
5437 if (aarch64_classify_symbol (base, INTVAL (offset))
5438 != SYMBOL_FORCE_TO_MEM)
5439 return true;
5440 else
5441 /* Avoid generating a 64-bit relocation in ILP32; leave it
5442 to aarch64_expand_mov_immediate to handle properly. */
5443 return mode != ptr_mode;
5444 }
5445
5446 return aarch64_tls_referenced_p (x);
5447 }
5448
5449 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5450 The expansion for a table switch is quite expensive due to the number
5451 of instructions, the table lookup and the hard-to-predict indirect jump.
5452 When optimizing for speed at -O3 or higher, use the per-core tuning if
5453 set; otherwise use tables for > 16 cases as a trade-off between size and
5454 performance. When optimizing for size, use the default setting. */
5455
5456 static unsigned int
5457 aarch64_case_values_threshold (void)
5458 {
5459 /* Use the specified limit for the number of cases before using jump
5460 tables at higher optimization levels. */
5461 if (optimize > 2
5462 && selected_cpu->tune->max_case_values != 0)
5463 return selected_cpu->tune->max_case_values;
5464 else
5465 return optimize_size ? default_case_values_threshold () : 17;
5466 }
5467
5468 /* Return true if register REGNO is a valid index register.
5469 STRICT_P is true if REG_OK_STRICT is in effect. */
5470
5471 bool
5472 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5473 {
5474 if (!HARD_REGISTER_NUM_P (regno))
5475 {
5476 if (!strict_p)
5477 return true;
5478
5479 if (!reg_renumber)
5480 return false;
5481
5482 regno = reg_renumber[regno];
5483 }
5484 return GP_REGNUM_P (regno);
5485 }
5486
5487 /* Return true if register REGNO is a valid base register.
5488 STRICT_P is true if REG_OK_STRICT is in effect. */
5489
5490 bool
5491 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5492 {
5493 if (!HARD_REGISTER_NUM_P (regno))
5494 {
5495 if (!strict_p)
5496 return true;
5497
5498 if (!reg_renumber)
5499 return false;
5500
5501 regno = reg_renumber[regno];
5502 }
5503
5504 /* The fake registers will be eliminated to either the stack or
5505 hard frame pointer, both of which are usually valid base registers.
5506 Reload deals with the cases where the eliminated form isn't valid. */
5507 return (GP_REGNUM_P (regno)
5508 || regno == SP_REGNUM
5509 || regno == FRAME_POINTER_REGNUM
5510 || regno == ARG_POINTER_REGNUM);
5511 }
5512
5513 /* Return true if X is a valid base register.
5514 STRICT_P is true if REG_OK_STRICT is in effect. */
5515
5516 static bool
5517 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5518 {
5519 if (!strict_p
5520 && GET_CODE (x) == SUBREG
5521 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5522 x = SUBREG_REG (x);
5523
5524 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5525 }
5526
5527 /* Return true if the address offset X is a valid index for mode MODE. If so,
5528 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5529
5530 static bool
5531 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5532 machine_mode mode, bool strict_p)
5533 {
5534 enum aarch64_address_type type;
5535 rtx index;
5536 int shift;
5537
5538 /* (reg:P) */
5539 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5540 && GET_MODE (x) == Pmode)
5541 {
5542 type = ADDRESS_REG_REG;
5543 index = x;
5544 shift = 0;
5545 }
5546 /* (sign_extend:DI (reg:SI)) */
5547 else if ((GET_CODE (x) == SIGN_EXTEND
5548 || GET_CODE (x) == ZERO_EXTEND)
5549 && GET_MODE (x) == DImode
5550 && GET_MODE (XEXP (x, 0)) == SImode)
5551 {
5552 type = (GET_CODE (x) == SIGN_EXTEND)
5553 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5554 index = XEXP (x, 0);
5555 shift = 0;
5556 }
5557 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5558 else if (GET_CODE (x) == MULT
5559 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5560 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5561 && GET_MODE (XEXP (x, 0)) == DImode
5562 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5563 && CONST_INT_P (XEXP (x, 1)))
5564 {
5565 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5566 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5567 index = XEXP (XEXP (x, 0), 0);
5568 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5569 }
5570 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5571 else if (GET_CODE (x) == ASHIFT
5572 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5573 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5574 && GET_MODE (XEXP (x, 0)) == DImode
5575 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5576 && CONST_INT_P (XEXP (x, 1)))
5577 {
5578 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5579 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5580 index = XEXP (XEXP (x, 0), 0);
5581 shift = INTVAL (XEXP (x, 1));
5582 }
5583 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5584 else if ((GET_CODE (x) == SIGN_EXTRACT
5585 || GET_CODE (x) == ZERO_EXTRACT)
5586 && GET_MODE (x) == DImode
5587 && GET_CODE (XEXP (x, 0)) == MULT
5588 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5589 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5590 {
5591 type = (GET_CODE (x) == SIGN_EXTRACT)
5592 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5593 index = XEXP (XEXP (x, 0), 0);
5594 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5595 if (INTVAL (XEXP (x, 1)) != 32 + shift
5596 || INTVAL (XEXP (x, 2)) != 0)
5597 shift = -1;
5598 }
5599 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5600 (const_int 0xffffffff<<shift)) */
5601 else if (GET_CODE (x) == AND
5602 && GET_MODE (x) == DImode
5603 && GET_CODE (XEXP (x, 0)) == MULT
5604 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5605 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5606 && CONST_INT_P (XEXP (x, 1)))
5607 {
5608 type = ADDRESS_REG_UXTW;
5609 index = XEXP (XEXP (x, 0), 0);
5610 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5611 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5612 shift = -1;
5613 }
5614 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5615 else if ((GET_CODE (x) == SIGN_EXTRACT
5616 || GET_CODE (x) == ZERO_EXTRACT)
5617 && GET_MODE (x) == DImode
5618 && GET_CODE (XEXP (x, 0)) == ASHIFT
5619 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5620 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5621 {
5622 type = (GET_CODE (x) == SIGN_EXTRACT)
5623 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5624 index = XEXP (XEXP (x, 0), 0);
5625 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5626 if (INTVAL (XEXP (x, 1)) != 32 + shift
5627 || INTVAL (XEXP (x, 2)) != 0)
5628 shift = -1;
5629 }
5630 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5631 (const_int 0xffffffff<<shift)) */
5632 else if (GET_CODE (x) == AND
5633 && GET_MODE (x) == DImode
5634 && GET_CODE (XEXP (x, 0)) == ASHIFT
5635 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5636 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5637 && CONST_INT_P (XEXP (x, 1)))
5638 {
5639 type = ADDRESS_REG_UXTW;
5640 index = XEXP (XEXP (x, 0), 0);
5641 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5642 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5643 shift = -1;
5644 }
5645 /* (mult:P (reg:P) (const_int scale)) */
5646 else if (GET_CODE (x) == MULT
5647 && GET_MODE (x) == Pmode
5648 && GET_MODE (XEXP (x, 0)) == Pmode
5649 && CONST_INT_P (XEXP (x, 1)))
5650 {
5651 type = ADDRESS_REG_REG;
5652 index = XEXP (x, 0);
5653 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5654 }
5655 /* (ashift:P (reg:P) (const_int shift)) */
5656 else if (GET_CODE (x) == ASHIFT
5657 && GET_MODE (x) == Pmode
5658 && GET_MODE (XEXP (x, 0)) == Pmode
5659 && CONST_INT_P (XEXP (x, 1)))
5660 {
5661 type = ADDRESS_REG_REG;
5662 index = XEXP (x, 0);
5663 shift = INTVAL (XEXP (x, 1));
5664 }
5665 else
5666 return false;
5667
5668 if (!strict_p
5669 && GET_CODE (index) == SUBREG
5670 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5671 index = SUBREG_REG (index);
5672
5673 if (aarch64_sve_data_mode_p (mode))
5674 {
5675 if (type != ADDRESS_REG_REG
5676 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5677 return false;
5678 }
5679 else
5680 {
5681 if (shift != 0
5682 && !(IN_RANGE (shift, 1, 3)
5683 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5684 return false;
5685 }
5686
5687 if (REG_P (index)
5688 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5689 {
5690 info->type = type;
5691 info->offset = index;
5692 info->shift = shift;
5693 return true;
5694 }
5695
5696 return false;
5697 }
5698
5699 /* Return true if MODE is one of the modes for which we
5700 support LDP/STP operations. */
5701
5702 static bool
5703 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5704 {
5705 return mode == SImode || mode == DImode
5706 || mode == SFmode || mode == DFmode
5707 || (aarch64_vector_mode_supported_p (mode)
5708 && (known_eq (GET_MODE_SIZE (mode), 8)
5709 || (known_eq (GET_MODE_SIZE (mode), 16)
5710 && (aarch64_tune_params.extra_tuning_flags
5711 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5712 }
5713
5714 /* Return true if REGNO is a virtual pointer register, or an eliminable
5715 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5716 include stack_pointer or hard_frame_pointer. */
5717 static bool
5718 virt_or_elim_regno_p (unsigned regno)
5719 {
5720 return ((regno >= FIRST_VIRTUAL_REGISTER
5721 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5722 || regno == FRAME_POINTER_REGNUM
5723 || regno == ARG_POINTER_REGNUM);
5724 }
5725
5726 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5727 If it is, fill in INFO appropriately. STRICT_P is true if
5728 REG_OK_STRICT is in effect. */
5729
5730 static bool
5731 aarch64_classify_address (struct aarch64_address_info *info,
5732 rtx x, machine_mode mode, bool strict_p,
5733 aarch64_addr_query_type type = ADDR_QUERY_M)
5734 {
5735 enum rtx_code code = GET_CODE (x);
5736 rtx op0, op1;
5737 poly_int64 offset;
5738
5739 HOST_WIDE_INT const_size;
5740
5741 /* On BE, we use load/store pair for all large int mode load/stores.
5742 TI/TFmode may also use a load/store pair. */
5743 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5744 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5745 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5746 || type == ADDR_QUERY_LDP_STP_N
5747 || mode == TImode
5748 || mode == TFmode
5749 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5750
5751 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
5752 to the actual size of the memory being loaded/stored, while the mode used
5753 for the address calculation is half of that size. */
5754 if (type == ADDR_QUERY_LDP_STP_N
5755 && known_eq (GET_MODE_SIZE (mode), 16))
5756 mode = DFmode;
5757
5758 bool allow_reg_index_p = (!load_store_pair_p
5759 && (known_lt (GET_MODE_SIZE (mode), 16)
5760 || vec_flags == VEC_ADVSIMD
5761 || vec_flags == VEC_SVE_DATA));
5762
5763 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5764 [Rn, #offset, MUL VL]. */
5765 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5766 && (code != REG && code != PLUS))
5767 return false;
5768
5769 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5770 REG addressing. */
5771 if (advsimd_struct_p
5772 && !BYTES_BIG_ENDIAN
5773 && (code != POST_INC && code != REG))
5774 return false;
5775
5776 gcc_checking_assert (GET_MODE (x) == VOIDmode
5777 || SCALAR_INT_MODE_P (GET_MODE (x)));
5778
5779 switch (code)
5780 {
5781 case REG:
5782 case SUBREG:
5783 info->type = ADDRESS_REG_IMM;
5784 info->base = x;
5785 info->offset = const0_rtx;
5786 info->const_offset = 0;
5787 return aarch64_base_register_rtx_p (x, strict_p);
5788
5789 case PLUS:
5790 op0 = XEXP (x, 0);
5791 op1 = XEXP (x, 1);
5792
5793 if (! strict_p
5794 && REG_P (op0)
5795 && virt_or_elim_regno_p (REGNO (op0))
5796 && poly_int_rtx_p (op1, &offset))
5797 {
5798 info->type = ADDRESS_REG_IMM;
5799 info->base = op0;
5800 info->offset = op1;
5801 info->const_offset = offset;
5802
5803 return true;
5804 }
5805
5806 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5807 && aarch64_base_register_rtx_p (op0, strict_p)
5808 && poly_int_rtx_p (op1, &offset))
5809 {
5810 info->type = ADDRESS_REG_IMM;
5811 info->base = op0;
5812 info->offset = op1;
5813 info->const_offset = offset;
5814
5815 /* TImode and TFmode values are allowed in both pairs of X
5816 registers and individual Q registers. The available
5817 address modes are:
5818 X,X: 7-bit signed scaled offset
5819 Q: 9-bit signed offset
5820 We conservatively require an offset representable in either mode.
5821 When performing the check for pairs of X registers, i.e. LDP/STP,
5822 pass down DImode since that is the natural size of the LDP/STP
5823 instruction's memory accesses.
5824 if (mode == TImode || mode == TFmode)
5825 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5826 && (offset_9bit_signed_unscaled_p (mode, offset)
5827 || offset_12bit_unsigned_scaled_p (mode, offset)));
5828
5829 /* A 7-bit offset check because OImode will emit an ldp/stp
5830 instruction (only big endian will get here).
5831 For ldp/stp instructions, the offset is scaled for the size of a
5832 single element of the pair. */
5833 if (mode == OImode)
5834 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5835
5836 /* Three 9/12-bit offset checks because CImode will emit three
5837 ldr/str instructions (only big endian will get here). */
5838 if (mode == CImode)
5839 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5840 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5841 || offset_12bit_unsigned_scaled_p (V16QImode,
5842 offset + 32)));
5843
5844 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5845 instructions (only big endian will get here). */
5846 if (mode == XImode)
5847 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5848 && aarch64_offset_7bit_signed_scaled_p (TImode,
5849 offset + 32));
5850
5851 /* Make "m" use the LD1 offset range for SVE data modes, so
5852 that pre-RTL optimizers like ivopts will work to that
5853 instead of the wider LDR/STR range. */
5854 if (vec_flags == VEC_SVE_DATA)
5855 return (type == ADDR_QUERY_M
5856 ? offset_4bit_signed_scaled_p (mode, offset)
5857 : offset_9bit_signed_scaled_p (mode, offset));
5858
5859 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5860 {
5861 poly_int64 end_offset = (offset
5862 + GET_MODE_SIZE (mode)
5863 - BYTES_PER_SVE_VECTOR);
5864 return (type == ADDR_QUERY_M
5865 ? offset_4bit_signed_scaled_p (mode, offset)
5866 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5867 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5868 end_offset)));
5869 }
5870
5871 if (vec_flags == VEC_SVE_PRED)
5872 return offset_9bit_signed_scaled_p (mode, offset);
5873
5874 if (load_store_pair_p)
5875 return ((known_eq (GET_MODE_SIZE (mode), 4)
5876 || known_eq (GET_MODE_SIZE (mode), 8)
5877 || known_eq (GET_MODE_SIZE (mode), 16))
5878 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5879 else
5880 return (offset_9bit_signed_unscaled_p (mode, offset)
5881 || offset_12bit_unsigned_scaled_p (mode, offset));
5882 }
5883
5884 if (allow_reg_index_p)
5885 {
5886 /* Look for base + (scaled/extended) index register. */
5887 if (aarch64_base_register_rtx_p (op0, strict_p)
5888 && aarch64_classify_index (info, op1, mode, strict_p))
5889 {
5890 info->base = op0;
5891 return true;
5892 }
5893 if (aarch64_base_register_rtx_p (op1, strict_p)
5894 && aarch64_classify_index (info, op0, mode, strict_p))
5895 {
5896 info->base = op1;
5897 return true;
5898 }
5899 }
5900
5901 return false;
5902
5903 case POST_INC:
5904 case POST_DEC:
5905 case PRE_INC:
5906 case PRE_DEC:
5907 info->type = ADDRESS_REG_WB;
5908 info->base = XEXP (x, 0);
5909 info->offset = NULL_RTX;
5910 return aarch64_base_register_rtx_p (info->base, strict_p);
5911
5912 case POST_MODIFY:
5913 case PRE_MODIFY:
5914 info->type = ADDRESS_REG_WB;
5915 info->base = XEXP (x, 0);
5916 if (GET_CODE (XEXP (x, 1)) == PLUS
5917 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5918 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5919 && aarch64_base_register_rtx_p (info->base, strict_p))
5920 {
5921 info->offset = XEXP (XEXP (x, 1), 1);
5922 info->const_offset = offset;
5923
5924 /* TImode and TFmode values are allowed in both pairs of X
5925 registers and individual Q registers. The available
5926 address modes are:
5927 X,X: 7-bit signed scaled offset
5928 Q: 9-bit signed offset
5929 We conservatively require an offset representable in either mode.
5930 */
5931 if (mode == TImode || mode == TFmode)
5932 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5933 && offset_9bit_signed_unscaled_p (mode, offset));
5934
5935 if (load_store_pair_p)
5936 return ((known_eq (GET_MODE_SIZE (mode), 4)
5937 || known_eq (GET_MODE_SIZE (mode), 8)
5938 || known_eq (GET_MODE_SIZE (mode), 16))
5939 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5940 else
5941 return offset_9bit_signed_unscaled_p (mode, offset);
5942 }
5943 return false;
5944
5945 case CONST:
5946 case SYMBOL_REF:
5947 case LABEL_REF:
5948 /* load literal: pc-relative constant pool entry. Only supported
5949 for SI mode or larger. */
5950 info->type = ADDRESS_SYMBOLIC;
5951
5952 if (!load_store_pair_p
5953 && GET_MODE_SIZE (mode).is_constant (&const_size)
5954 && const_size >= 4)
5955 {
5956 rtx sym, addend;
5957
5958 split_const (x, &sym, &addend);
5959 return ((GET_CODE (sym) == LABEL_REF
5960 || (GET_CODE (sym) == SYMBOL_REF
5961 && CONSTANT_POOL_ADDRESS_P (sym)
5962 && aarch64_pcrelative_literal_loads)));
5963 }
5964 return false;
5965
5966 case LO_SUM:
5967 info->type = ADDRESS_LO_SUM;
5968 info->base = XEXP (x, 0);
5969 info->offset = XEXP (x, 1);
5970 if (allow_reg_index_p
5971 && aarch64_base_register_rtx_p (info->base, strict_p))
5972 {
5973 rtx sym, offs;
5974 split_const (info->offset, &sym, &offs);
5975 if (GET_CODE (sym) == SYMBOL_REF
5976 && (aarch64_classify_symbol (sym, INTVAL (offs))
5977 == SYMBOL_SMALL_ABSOLUTE))
5978 {
5979 /* The symbol and offset must be aligned to the access size. */
5980 unsigned int align;
5981
5982 if (CONSTANT_POOL_ADDRESS_P (sym))
5983 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5984 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5985 {
5986 tree exp = SYMBOL_REF_DECL (sym);
5987 align = TYPE_ALIGN (TREE_TYPE (exp));
5988 align = aarch64_constant_alignment (exp, align);
5989 }
5990 else if (SYMBOL_REF_DECL (sym))
5991 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5992 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5993 && SYMBOL_REF_BLOCK (sym) != NULL)
5994 align = SYMBOL_REF_BLOCK (sym)->alignment;
5995 else
5996 align = BITS_PER_UNIT;
5997
5998 poly_int64 ref_size = GET_MODE_SIZE (mode);
5999 if (known_eq (ref_size, 0))
6000 ref_size = GET_MODE_SIZE (DImode);
6001
6002 return (multiple_p (INTVAL (offs), ref_size)
6003 && multiple_p (align / BITS_PER_UNIT, ref_size));
6004 }
6005 }
6006 return false;
6007
6008 default:
6009 return false;
6010 }
6011 }
6012
6013 /* Return true if the address X is valid for a PRFM instruction.
6014 STRICT_P is true if we should do strict checking with
6015 aarch64_classify_address. */
6016
6017 bool
6018 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6019 {
6020 struct aarch64_address_info addr;
6021
6022 /* PRFM accepts the same addresses as DImode... */
6023 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6024 if (!res)
6025 return false;
6026
6027 /* ... except writeback forms. */
6028 return addr.type != ADDRESS_REG_WB;
6029 }
6030
6031 bool
6032 aarch64_symbolic_address_p (rtx x)
6033 {
6034 rtx offset;
6035
6036 split_const (x, &x, &offset);
6037 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6038 }
6039
6040 /* Classify the base of symbolic expression X. */
6041
6042 enum aarch64_symbol_type
6043 aarch64_classify_symbolic_expression (rtx x)
6044 {
6045 rtx offset;
6046
6047 split_const (x, &x, &offset);
6048 return aarch64_classify_symbol (x, INTVAL (offset));
6049 }
6050
6051
6052 /* Return TRUE if X is a legitimate address for accessing memory in
6053 mode MODE. */
6054 static bool
6055 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6056 {
6057 struct aarch64_address_info addr;
6058
6059 return aarch64_classify_address (&addr, x, mode, strict_p);
6060 }
6061
6062 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6063 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6064 bool
6065 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6066 aarch64_addr_query_type type)
6067 {
6068 struct aarch64_address_info addr;
6069
6070 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6071 }
6072
6073 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6074
6075 static bool
6076 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6077 poly_int64 orig_offset,
6078 machine_mode mode)
6079 {
6080 HOST_WIDE_INT size;
6081 if (GET_MODE_SIZE (mode).is_constant (&size))
6082 {
6083 HOST_WIDE_INT const_offset, second_offset;
6084
6085 /* A general SVE offset is A * VQ + B. Remove the A component from
6086 coefficient 0 in order to get the constant B. */
6087 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6088
6089 /* Split an out-of-range address displacement into a base and
6090 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6091 range otherwise to increase opportunities for sharing the base
6092 address of different sizes. Unaligned accesses use the signed
6093 9-bit range, TImode/TFmode use the intersection of signed
6094 scaled 7-bit and signed 9-bit offset. */
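/* For example, with MODE == DImode an aligned ORIG_OFFSET of 0x8008
   gives second_offset = 0x8008 & 0x3ffc = 8, so the offset is split
   into an anchor adjustment of 0x8000 plus a load/store offset of 8. */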
6095 if (mode == TImode || mode == TFmode)
6096 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6097 else if ((const_offset & (size - 1)) != 0)
6098 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6099 else
6100 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6101
6102 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6103 return false;
6104
6105 /* Split the offset into second_offset and the rest. */
6106 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6107 *offset2 = gen_int_mode (second_offset, Pmode);
6108 return true;
6109 }
6110 else
6111 {
6112 /* Get the mode we should use as the basis of the range. For structure
6113 modes this is the mode of one vector. */
6114 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6115 machine_mode step_mode
6116 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6117
6118 /* Get the "mul vl" multiplier we'd like to use. */
6119 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6120 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6121 if (vec_flags & VEC_SVE_DATA)
6122 /* LDR supports a 9-bit range, but the move patterns for
6123 structure modes require all vectors to be in range of the
6124 same base. The simplest way of accommodating that while still
6125 promoting reuse of anchor points between different modes is
6126 to use an 8-bit range unconditionally. */
6127 vnum = ((vnum + 128) & 255) - 128;
6128 else
6129 /* Predicates are only handled singly, so we might as well use
6130 the full range. */
6131 vnum = ((vnum + 256) & 511) - 256;
6132 if (vnum == 0)
6133 return false;
6134
6135 /* Convert the "mul vl" multiplier into a byte offset. */
6136 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6137 if (known_eq (second_offset, orig_offset))
6138 return false;
6139
6140 /* Split the offset into second_offset and the rest. */
6141 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6142 *offset2 = gen_int_mode (second_offset, Pmode);
6143 return true;
6144 }
6145 }
6146
6147 /* Return the binary representation of floating point constant VALUE in INTVAL.
6148 If the value cannot be converted, return false without setting INTVAL.
6149 The conversion is done in the mode of VALUE. */
6150 bool
6151 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6152 {
6153
6154 /* We make a general exception for 0. */
6155 if (aarch64_float_const_zero_rtx_p (value))
6156 {
6157 *intval = 0;
6158 return true;
6159 }
6160
6161 scalar_float_mode mode;
6162 if (GET_CODE (value) != CONST_DOUBLE
6163 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6164 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6165 /* Only support up to DF mode. */
6166 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6167 return false;
6168
6169 unsigned HOST_WIDE_INT ival = 0;
6170
6171 long res[2];
6172 real_to_target (res,
6173 CONST_DOUBLE_REAL_VALUE (value),
6174 REAL_MODE_FORMAT (mode));
6175
6176 if (mode == DFmode)
6177 {
6178 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6179 ival = zext_hwi (res[order], 32);
6180 ival |= (zext_hwi (res[1 - order], 32) << 32);
6181 }
6182 else
6183 ival = zext_hwi (res[0], 32);
6184
6185 *intval = ival;
6186 return true;
6187 }
6188
6189 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6190 single MOV(+MOVK) followed by an FMOV. */
6191 bool
6192 aarch64_float_const_rtx_p (rtx x)
6193 {
6194 machine_mode mode = GET_MODE (x);
6195 if (mode == VOIDmode)
6196 return false;
6197
6198 /* Determine whether it's cheaper to write float constants as
6199 mov/movk pairs rather than as ldr/adrp pairs. */
6200 unsigned HOST_WIDE_INT ival;
6201
6202 if (GET_CODE (x) == CONST_DOUBLE
6203 && SCALAR_FLOAT_MODE_P (mode)
6204 && aarch64_reinterpret_float_as_int (x, &ival))
6205 {
6206 scalar_int_mode imode = (mode == HFmode
6207 ? SImode
6208 : int_mode_for_mode (mode).require ());
6209 int num_instr = aarch64_internal_mov_immediate
6210 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6211 return num_instr < 3;
6212 }
6213
6214 return false;
6215 }
6216
6217 /* Return TRUE if rtx X is the immediate constant 0.0. */
6218 bool
6219 aarch64_float_const_zero_rtx_p (rtx x)
6220 {
6221 if (GET_MODE (x) == VOIDmode)
6222 return false;
6223
6224 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6225 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6226 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6227 }
6228
6229 /* Return TRUE if rtx X is an immediate constant that fits in a single
6230 MOVI immediate operation. */
6231 bool
6232 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6233 {
6234 if (!TARGET_SIMD)
6235 return false;
6236
6237 machine_mode vmode;
6238 scalar_int_mode imode;
6239 unsigned HOST_WIDE_INT ival;
6240
6241 if (GET_CODE (x) == CONST_DOUBLE
6242 && SCALAR_FLOAT_MODE_P (mode))
6243 {
6244 if (!aarch64_reinterpret_float_as_int (x, &ival))
6245 return false;
6246
6247 /* We make a general exception for 0. */
6248 if (aarch64_float_const_zero_rtx_p (x))
6249 return true;
6250
6251 imode = int_mode_for_mode (mode).require ();
6252 }
6253 else if (GET_CODE (x) == CONST_INT
6254 && is_a <scalar_int_mode> (mode, &imode))
6255 ival = INTVAL (x);
6256 else
6257 return false;
6258
6259 /* Use a 64-bit container mode for everything except DI/DF mode, where we
6260 use a 128-bit vector mode. */
6261 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6262
6263 vmode = aarch64_simd_container_mode (imode, width);
6264 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6265
6266 return aarch64_simd_valid_immediate (v_op, NULL);
6267 }
6268
6269
6270 /* Return the fixed registers used for condition codes. */
6271
6272 static bool
6273 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6274 {
6275 *p1 = CC_REGNUM;
6276 *p2 = INVALID_REGNUM;
6277 return true;
6278 }
6279
6280 /* This function is used by the call expanders of the machine description.
6281 RESULT is the register in which the result is returned. It's NULL for
6282 "call" and "sibcall".
6283 MEM is the location of the function call.
6284 SIBCALL indicates whether this function call is a normal call or a sibling
6285 call. A different pattern is generated accordingly. */
6286
6287 void
6288 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6289 {
6290 rtx call, callee, tmp;
6291 rtvec vec;
6292 machine_mode mode;
6293
6294 gcc_assert (MEM_P (mem));
6295 callee = XEXP (mem, 0);
6296 mode = GET_MODE (callee);
6297 gcc_assert (mode == Pmode);
6298
6299 /* Decide if we should generate indirect calls by loading the
6300 address of the callee into a register before performing
6301 the branch-and-link. */
6302 if (SYMBOL_REF_P (callee)
6303 ? (aarch64_is_long_call_p (callee)
6304 || aarch64_is_noplt_call_p (callee))
6305 : !REG_P (callee))
6306 XEXP (mem, 0) = force_reg (mode, callee);
6307
6308 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6309
6310 if (result != NULL_RTX)
6311 call = gen_rtx_SET (result, call);
6312
6313 if (sibcall)
6314 tmp = ret_rtx;
6315 else
6316 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6317
6318 vec = gen_rtvec (2, call, tmp);
6319 call = gen_rtx_PARALLEL (VOIDmode, vec);
6320
6321 aarch64_emit_call_insn (call);
6322 }
6323
6324 /* Emit call insn with PAT and do aarch64-specific handling. */
6325
6326 void
6327 aarch64_emit_call_insn (rtx pat)
6328 {
6329 rtx insn = emit_call_insn (pat);
6330
6331 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6332 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6333 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6334 }
6335
6336 machine_mode
6337 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6338 {
6339 /* All floating point compares return CCFP if it is an equality
6340 comparison, and CCFPE otherwise. */
6341 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6342 {
6343 switch (code)
6344 {
6345 case EQ:
6346 case NE:
6347 case UNORDERED:
6348 case ORDERED:
6349 case UNLT:
6350 case UNLE:
6351 case UNGT:
6352 case UNGE:
6353 case UNEQ:
6354 return CCFPmode;
6355
6356 case LT:
6357 case LE:
6358 case GT:
6359 case GE:
6360 case LTGT:
6361 return CCFPEmode;
6362
6363 default:
6364 gcc_unreachable ();
6365 }
6366 }
6367
6368 /* Equality comparisons of short modes against zero can be performed
6369 using the TST instruction with the appropriate bitmask. */
6370 if (y == const0_rtx && REG_P (x)
6371 && (code == EQ || code == NE)
6372 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6373 return CC_NZmode;
6374
6375 /* Similarly, comparisons of zero_extends from shorter modes can
6376 be performed using an ANDS with an immediate mask. */
6377 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6378 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6379 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6380 && (code == EQ || code == NE))
6381 return CC_NZmode;
6382
6383 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6384 && y == const0_rtx
6385 && (code == EQ || code == NE || code == LT || code == GE)
6386 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6387 || GET_CODE (x) == NEG
6388 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6389 && CONST_INT_P (XEXP (x, 2)))))
6390 return CC_NZmode;
6391
6392 /* A compare with a shifted operand. Because of canonicalization,
6393 the comparison will have to be swapped when we emit the assembly
6394 code. */
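/* For example, (compare (ashift x 2) y) is output as "cmp y, x, lsl 2",
   since only the second source operand of CMP can carry a shift, so a GT
   test must be emitted as LT, and so on (see the CC_SWPmode handling in
   aarch64_get_condition_code_1). */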
6395 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6396 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6397 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6398 || GET_CODE (x) == LSHIFTRT
6399 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6400 return CC_SWPmode;
6401
6402 /* Similarly for a negated operand, but we can only do this for
6403 equalities. */
6404 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6405 && (REG_P (y) || GET_CODE (y) == SUBREG)
6406 && (code == EQ || code == NE)
6407 && GET_CODE (x) == NEG)
6408 return CC_Zmode;
6409
6410 /* A test for unsigned overflow. */
6411 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6412 && code == NE
6413 && GET_CODE (x) == PLUS
6414 && GET_CODE (y) == ZERO_EXTEND)
6415 return CC_Cmode;
6416
6417 /* For everything else, return CCmode. */
6418 return CCmode;
6419 }
6420
6421 static int
6422 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6423
6424 int
6425 aarch64_get_condition_code (rtx x)
6426 {
6427 machine_mode mode = GET_MODE (XEXP (x, 0));
6428 enum rtx_code comp_code = GET_CODE (x);
6429
6430 if (GET_MODE_CLASS (mode) != MODE_CC)
6431 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6432 return aarch64_get_condition_code_1 (mode, comp_code);
6433 }
6434
6435 static int
6436 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6437 {
6438 switch (mode)
6439 {
6440 case E_CCFPmode:
6441 case E_CCFPEmode:
6442 switch (comp_code)
6443 {
6444 case GE: return AARCH64_GE;
6445 case GT: return AARCH64_GT;
6446 case LE: return AARCH64_LS;
6447 case LT: return AARCH64_MI;
6448 case NE: return AARCH64_NE;
6449 case EQ: return AARCH64_EQ;
6450 case ORDERED: return AARCH64_VC;
6451 case UNORDERED: return AARCH64_VS;
6452 case UNLT: return AARCH64_LT;
6453 case UNLE: return AARCH64_LE;
6454 case UNGT: return AARCH64_HI;
6455 case UNGE: return AARCH64_PL;
6456 default: return -1;
6457 }
6458 break;
6459
6460 case E_CCmode:
6461 switch (comp_code)
6462 {
6463 case NE: return AARCH64_NE;
6464 case EQ: return AARCH64_EQ;
6465 case GE: return AARCH64_GE;
6466 case GT: return AARCH64_GT;
6467 case LE: return AARCH64_LE;
6468 case LT: return AARCH64_LT;
6469 case GEU: return AARCH64_CS;
6470 case GTU: return AARCH64_HI;
6471 case LEU: return AARCH64_LS;
6472 case LTU: return AARCH64_CC;
6473 default: return -1;
6474 }
6475 break;
6476
6477 case E_CC_SWPmode:
6478 switch (comp_code)
6479 {
6480 case NE: return AARCH64_NE;
6481 case EQ: return AARCH64_EQ;
6482 case GE: return AARCH64_LE;
6483 case GT: return AARCH64_LT;
6484 case LE: return AARCH64_GE;
6485 case LT: return AARCH64_GT;
6486 case GEU: return AARCH64_LS;
6487 case GTU: return AARCH64_CC;
6488 case LEU: return AARCH64_CS;
6489 case LTU: return AARCH64_HI;
6490 default: return -1;
6491 }
6492 break;
6493
6494 case E_CC_NZmode:
6495 switch (comp_code)
6496 {
6497 case NE: return AARCH64_NE;
6498 case EQ: return AARCH64_EQ;
6499 case GE: return AARCH64_PL;
6500 case LT: return AARCH64_MI;
6501 default: return -1;
6502 }
6503 break;
6504
6505 case E_CC_Zmode:
6506 switch (comp_code)
6507 {
6508 case NE: return AARCH64_NE;
6509 case EQ: return AARCH64_EQ;
6510 default: return -1;
6511 }
6512 break;
6513
6514 case E_CC_Cmode:
6515 switch (comp_code)
6516 {
6517 case NE: return AARCH64_CS;
6518 case EQ: return AARCH64_CC;
6519 default: return -1;
6520 }
6521 break;
6522
6523 default:
6524 return -1;
6525 }
6526
6527 return -1;
6528 }
6529
6530 bool
6531 aarch64_const_vec_all_same_in_range_p (rtx x,
6532 HOST_WIDE_INT minval,
6533 HOST_WIDE_INT maxval)
6534 {
6535 rtx elt;
6536 return (const_vec_duplicate_p (x, &elt)
6537 && CONST_INT_P (elt)
6538 && IN_RANGE (INTVAL (elt), minval, maxval));
6539 }
6540
6541 bool
6542 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6543 {
6544 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6545 }
6546
6547 /* Return true if VEC is a constant in which every element is in the range
6548 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6549
6550 static bool
6551 aarch64_const_vec_all_in_range_p (rtx vec,
6552 HOST_WIDE_INT minval,
6553 HOST_WIDE_INT maxval)
6554 {
6555 if (GET_CODE (vec) != CONST_VECTOR
6556 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6557 return false;
6558
6559 int nunits;
6560 if (!CONST_VECTOR_STEPPED_P (vec))
6561 nunits = const_vector_encoded_nelts (vec);
6562 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6563 return false;
6564
6565 for (int i = 0; i < nunits; i++)
6566 {
6567 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6568 if (!CONST_INT_P (vec_elem)
6569 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6570 return false;
6571 }
6572 return true;
6573 }
6574
6575 /* N Z C V. */
6576 #define AARCH64_CC_V 1
6577 #define AARCH64_CC_C (1 << 1)
6578 #define AARCH64_CC_Z (1 << 2)
6579 #define AARCH64_CC_N (1 << 3)
6580
6581 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6582 static const int aarch64_nzcv_codes[] =
6583 {
6584 0, /* EQ, Z == 1. */
6585 AARCH64_CC_Z, /* NE, Z == 0. */
6586 0, /* CS, C == 1. */
6587 AARCH64_CC_C, /* CC, C == 0. */
6588 0, /* MI, N == 1. */
6589 AARCH64_CC_N, /* PL, N == 0. */
6590 0, /* VS, V == 1. */
6591 AARCH64_CC_V, /* VC, V == 0. */
6592 0, /* HI, C == 1 && Z == 0. */
6593 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6594 AARCH64_CC_V, /* GE, N == V. */
6595 0, /* LT, N != V. */
6596 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6597 0, /* LE, !(Z == 0 && N == V). */
6598 0, /* AL, Any. */
6599 0 /* NV, Any. */
6600 };
6601
6602 /* Print floating-point vector immediate operand X to F, negating it
6603 first if NEGATE is true. Return true on success, false if it isn't
6604 a constant we can handle. */
6605
6606 static bool
6607 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6608 {
6609 rtx elt;
6610
6611 if (!const_vec_duplicate_p (x, &elt))
6612 return false;
6613
6614 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6615 if (negate)
6616 r = real_value_negate (&r);
6617
6618 /* We only handle the SVE single-bit immediates here. */
6619 if (real_equal (&r, &dconst0))
6620 asm_fprintf (f, "0.0");
6621 else if (real_equal (&r, &dconst1))
6622 asm_fprintf (f, "1.0");
6623 else if (real_equal (&r, &dconsthalf))
6624 asm_fprintf (f, "0.5");
6625 else
6626 return false;
6627
6628 return true;
6629 }
6630
6631 /* Return the equivalent letter for size. */
6632 static char
6633 sizetochar (int size)
6634 {
6635 switch (size)
6636 {
6637 case 64: return 'd';
6638 case 32: return 's';
6639 case 16: return 'h';
6640 case 8 : return 'b';
6641 default: gcc_unreachable ();
6642 }
6643 }
6644
6645 /* Print operand X to file F in a target specific manner according to CODE.
6646 The acceptable formatting commands given by CODE are:
6647 'c': An integer or symbol address without a preceding #
6648 sign.
6649 'C': Take the duplicated element in a vector constant
6650 and print it in hex.
6651 'D': Take the duplicated element in a vector constant
6652 and print it as an unsigned integer, in decimal.
6653 'e': Print the sign/zero-extend size as a character 8->b,
6654 16->h, 32->w.
6655 'p': Prints N such that 2^N == X (X must be power of 2 and
6656 const int).
6657 'P': Print the number of non-zero bits in X (a const_int).
6658 'H': Print the higher numbered register of a pair (TImode)
6659 of regs.
6660 'm': Print a condition (eq, ne, etc).
6661 'M': Same as 'm', but invert condition.
6662 'N': Take the duplicated element in a vector constant
6663 and print the negative of it in decimal.
6664 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6665 'S/T/U/V': Print a FP/SIMD register name for a register list.
6666 The register printed is the FP/SIMD register name
6667 of X + 0/1/2/3 for S/T/U/V.
6668 'R': Print a scalar FP/SIMD register name + 1.
6669 'X': Print bottom 16 bits of integer constant in hex.
6670 'w/x': Print a general register name or the zero register
6671 (32-bit or 64-bit).
6672 '0': Print a normal operand; if it's a general register,
6673 then we assume DImode.
6674 'k': Print NZCV for conditional compare instructions.
6675 'A': Output address constant representing the first
6676 argument of X, specifying a relocation offset
6677 if appropriate.
6678 'L': Output constant address specified by X
6679 with a relocation offset if appropriate.
6680 'G': Prints address of X, specifying a PC relative
6681 relocation mode if appropriate.
6682 'y': Output address of LDP or STP - this is used for
6683 some LDP/STPs which don't use a PARALLEL in their
6684 pattern (so the mode needs to be adjusted).
6685 'z': Output address of a typical LDP or STP. */
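/* As an illustrative example, an output template along the lines of
   "add\t%x0, %x1, %2" would print operands 0 and 1 as 64-bit general
   register names and operand 2 as a normal operand. */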
6686
6687 static void
6688 aarch64_print_operand (FILE *f, rtx x, int code)
6689 {
6690 rtx elt;
6691 switch (code)
6692 {
6693 case 'c':
6694 switch (GET_CODE (x))
6695 {
6696 case CONST_INT:
6697 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6698 break;
6699
6700 case SYMBOL_REF:
6701 output_addr_const (f, x);
6702 break;
6703
6704 case CONST:
6705 if (GET_CODE (XEXP (x, 0)) == PLUS
6706 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6707 {
6708 output_addr_const (f, x);
6709 break;
6710 }
6711 /* Fall through. */
6712
6713 default:
6714 output_operand_lossage ("unsupported operand for code '%c'", code);
6715 }
6716 break;
6717
6718 case 'e':
6719 {
6720 int n;
6721
6722 if (!CONST_INT_P (x)
6723 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6724 {
6725 output_operand_lossage ("invalid operand for '%%%c'", code);
6726 return;
6727 }
6728
6729 switch (n)
6730 {
6731 case 3:
6732 fputc ('b', f);
6733 break;
6734 case 4:
6735 fputc ('h', f);
6736 break;
6737 case 5:
6738 fputc ('w', f);
6739 break;
6740 default:
6741 output_operand_lossage ("invalid operand for '%%%c'", code);
6742 return;
6743 }
6744 }
6745 break;
6746
6747 case 'p':
6748 {
6749 int n;
6750
6751 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6752 {
6753 output_operand_lossage ("invalid operand for '%%%c'", code);
6754 return;
6755 }
6756
6757 asm_fprintf (f, "%d", n);
6758 }
6759 break;
6760
6761 case 'P':
6762 if (!CONST_INT_P (x))
6763 {
6764 output_operand_lossage ("invalid operand for '%%%c'", code);
6765 return;
6766 }
6767
6768 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6769 break;
6770
6771 case 'H':
6772 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6773 {
6774 output_operand_lossage ("invalid operand for '%%%c'", code);
6775 return;
6776 }
6777
6778 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6779 break;
6780
6781 case 'M':
6782 case 'm':
6783 {
6784 int cond_code;
6785 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6786 if (x == const_true_rtx)
6787 {
6788 if (code == 'M')
6789 fputs ("nv", f);
6790 return;
6791 }
6792
6793 if (!COMPARISON_P (x))
6794 {
6795 output_operand_lossage ("invalid operand for '%%%c'", code);
6796 return;
6797 }
6798
6799 cond_code = aarch64_get_condition_code (x);
6800 gcc_assert (cond_code >= 0);
6801 if (code == 'M')
6802 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6803 fputs (aarch64_condition_codes[cond_code], f);
6804 }
6805 break;
6806
6807 case 'N':
6808 if (!const_vec_duplicate_p (x, &elt))
6809 {
6810 output_operand_lossage ("invalid vector constant");
6811 return;
6812 }
6813
6814 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6815 asm_fprintf (f, "%wd", -INTVAL (elt));
6816 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6817 && aarch64_print_vector_float_operand (f, x, true))
6818 ;
6819 else
6820 {
6821 output_operand_lossage ("invalid vector constant");
6822 return;
6823 }
6824 break;
6825
6826 case 'b':
6827 case 'h':
6828 case 's':
6829 case 'd':
6830 case 'q':
6831 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6832 {
6833 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6834 return;
6835 }
6836 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6837 break;
6838
6839 case 'S':
6840 case 'T':
6841 case 'U':
6842 case 'V':
6843 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6844 {
6845 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6846 return;
6847 }
6848 asm_fprintf (f, "%c%d",
6849 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6850 REGNO (x) - V0_REGNUM + (code - 'S'));
6851 break;
6852
6853 case 'R':
6854 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6855 {
6856 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6857 return;
6858 }
6859 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6860 break;
6861
6862 case 'X':
6863 if (!CONST_INT_P (x))
6864 {
6865 output_operand_lossage ("invalid operand for '%%%c'", code);
6866 return;
6867 }
6868 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6869 break;
6870
6871 case 'C':
6872 {
6873 /* Print a replicated constant in hex. */
6874 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6875 {
6876 output_operand_lossage ("invalid operand for '%%%c'", code);
6877 return;
6878 }
6879 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6880 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6881 }
6882 break;
6883
6884 case 'D':
6885 {
6886 /* Print a replicated constant in decimal, treating it as
6887 unsigned. */
6888 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6889 {
6890 output_operand_lossage ("invalid operand for '%%%c'", code);
6891 return;
6892 }
6893 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6894 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6895 }
6896 break;
6897
6898 case 'w':
6899 case 'x':
6900 if (x == const0_rtx
6901 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6902 {
6903 asm_fprintf (f, "%czr", code);
6904 break;
6905 }
6906
6907 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6908 {
6909 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6910 break;
6911 }
6912
6913 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6914 {
6915 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6916 break;
6917 }
6918
6919 /* Fall through */
6920
6921 case 0:
6922 if (x == NULL)
6923 {
6924 output_operand_lossage ("missing operand");
6925 return;
6926 }
6927
6928 switch (GET_CODE (x))
6929 {
6930 case REG:
6931 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6932 {
6933 if (REG_NREGS (x) == 1)
6934 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6935 else
6936 {
6937 char suffix
6938 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6939 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6940 REGNO (x) - V0_REGNUM, suffix,
6941 END_REGNO (x) - V0_REGNUM - 1, suffix);
6942 }
6943 }
6944 else
6945 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6946 break;
6947
6948 case MEM:
6949 output_address (GET_MODE (x), XEXP (x, 0));
6950 break;
6951
6952 case LABEL_REF:
6953 case SYMBOL_REF:
6954 output_addr_const (asm_out_file, x);
6955 break;
6956
6957 case CONST_INT:
6958 asm_fprintf (f, "%wd", INTVAL (x));
6959 break;
6960
6961 case CONST:
6962 if (!VECTOR_MODE_P (GET_MODE (x)))
6963 {
6964 output_addr_const (asm_out_file, x);
6965 break;
6966 }
6967 /* fall through */
6968
6969 case CONST_VECTOR:
6970 if (!const_vec_duplicate_p (x, &elt))
6971 {
6972 output_operand_lossage ("invalid vector constant");
6973 return;
6974 }
6975
6976 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6977 asm_fprintf (f, "%wd", INTVAL (elt));
6978 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6979 && aarch64_print_vector_float_operand (f, x, false))
6980 ;
6981 else
6982 {
6983 output_operand_lossage ("invalid vector constant");
6984 return;
6985 }
6986 break;
6987
6988 case CONST_DOUBLE:
6989 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6990 be getting CONST_DOUBLEs holding integers. */
6991 gcc_assert (GET_MODE (x) != VOIDmode);
6992 if (aarch64_float_const_zero_rtx_p (x))
6993 {
6994 fputc ('0', f);
6995 break;
6996 }
6997 else if (aarch64_float_const_representable_p (x))
6998 {
6999 #define buf_size 20
7000 char float_buf[buf_size] = {'\0'};
7001 real_to_decimal_for_mode (float_buf,
7002 CONST_DOUBLE_REAL_VALUE (x),
7003 buf_size, buf_size,
7004 1, GET_MODE (x));
7005 asm_fprintf (asm_out_file, "%s", float_buf);
7006 break;
7007 #undef buf_size
7008 }
7009 output_operand_lossage ("invalid constant");
7010 return;
7011 default:
7012 output_operand_lossage ("invalid operand");
7013 return;
7014 }
7015 break;
7016
7017 case 'A':
7018 if (GET_CODE (x) == HIGH)
7019 x = XEXP (x, 0);
7020
7021 switch (aarch64_classify_symbolic_expression (x))
7022 {
7023 case SYMBOL_SMALL_GOT_4G:
7024 asm_fprintf (asm_out_file, ":got:");
7025 break;
7026
7027 case SYMBOL_SMALL_TLSGD:
7028 asm_fprintf (asm_out_file, ":tlsgd:");
7029 break;
7030
7031 case SYMBOL_SMALL_TLSDESC:
7032 asm_fprintf (asm_out_file, ":tlsdesc:");
7033 break;
7034
7035 case SYMBOL_SMALL_TLSIE:
7036 asm_fprintf (asm_out_file, ":gottprel:");
7037 break;
7038
7039 case SYMBOL_TLSLE24:
7040 asm_fprintf (asm_out_file, ":tprel:");
7041 break;
7042
7043 case SYMBOL_TINY_GOT:
7044 gcc_unreachable ();
7045 break;
7046
7047 default:
7048 break;
7049 }
7050 output_addr_const (asm_out_file, x);
7051 break;
7052
7053 case 'L':
7054 switch (aarch64_classify_symbolic_expression (x))
7055 {
7056 case SYMBOL_SMALL_GOT_4G:
7057 asm_fprintf (asm_out_file, ":lo12:");
7058 break;
7059
7060 case SYMBOL_SMALL_TLSGD:
7061 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7062 break;
7063
7064 case SYMBOL_SMALL_TLSDESC:
7065 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7066 break;
7067
7068 case SYMBOL_SMALL_TLSIE:
7069 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7070 break;
7071
7072 case SYMBOL_TLSLE12:
7073 asm_fprintf (asm_out_file, ":tprel_lo12:");
7074 break;
7075
7076 case SYMBOL_TLSLE24:
7077 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7078 break;
7079
7080 case SYMBOL_TINY_GOT:
7081 asm_fprintf (asm_out_file, ":got:");
7082 break;
7083
7084 case SYMBOL_TINY_TLSIE:
7085 asm_fprintf (asm_out_file, ":gottprel:");
7086 break;
7087
7088 default:
7089 break;
7090 }
7091 output_addr_const (asm_out_file, x);
7092 break;
7093
7094 case 'G':
7095 switch (aarch64_classify_symbolic_expression (x))
7096 {
7097 case SYMBOL_TLSLE24:
7098 asm_fprintf (asm_out_file, ":tprel_hi12:");
7099 break;
7100 default:
7101 break;
7102 }
7103 output_addr_const (asm_out_file, x);
7104 break;
7105
7106 case 'k':
7107 {
7108 HOST_WIDE_INT cond_code;
7109
7110 if (!CONST_INT_P (x))
7111 {
7112 output_operand_lossage ("invalid operand for '%%%c'", code);
7113 return;
7114 }
7115
7116 cond_code = INTVAL (x);
7117 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7118 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7119 }
7120 break;
7121
7122 case 'y':
7123 case 'z':
7124 {
7125 machine_mode mode = GET_MODE (x);
7126
7127 if (GET_CODE (x) != MEM
7128 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7129 {
7130 output_operand_lossage ("invalid operand for '%%%c'", code);
7131 return;
7132 }
7133
7134 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7135 code == 'y'
7136 ? ADDR_QUERY_LDP_STP_N
7137 : ADDR_QUERY_LDP_STP))
7138 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7139 }
7140 break;
7141
7142 default:
7143 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7144 return;
7145 }
7146 }
7147
7148 /* Print address 'x' of a memory access with mode 'mode'.
7149 TYPE is the aarch64_addr_query_type context required by
7150 aarch64_classify_address, e.g. ADDR_QUERY_M for a normal memory access
or ADDR_QUERY_LDP_STP for an LDP/STP address. */
7151 static bool
7152 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7153 aarch64_addr_query_type type)
7154 {
7155 struct aarch64_address_info addr;
7156 unsigned int size;
7157
7158 /* Check all addresses are Pmode - including ILP32. */
7159 if (GET_MODE (x) != Pmode)
7160 output_operand_lossage ("invalid address mode");
7161
7162 if (aarch64_classify_address (&addr, x, mode, true, type))
7163 switch (addr.type)
7164 {
7165 case ADDRESS_REG_IMM:
7166 if (known_eq (addr.const_offset, 0))
7167 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7168 else if (aarch64_sve_data_mode_p (mode))
7169 {
7170 HOST_WIDE_INT vnum
7171 = exact_div (addr.const_offset,
7172 BYTES_PER_SVE_VECTOR).to_constant ();
7173 asm_fprintf (f, "[%s, #%wd, mul vl]",
7174 reg_names[REGNO (addr.base)], vnum);
7175 }
7176 else if (aarch64_sve_pred_mode_p (mode))
7177 {
7178 HOST_WIDE_INT vnum
7179 = exact_div (addr.const_offset,
7180 BYTES_PER_SVE_PRED).to_constant ();
7181 asm_fprintf (f, "[%s, #%wd, mul vl]",
7182 reg_names[REGNO (addr.base)], vnum);
7183 }
7184 else
7185 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7186 INTVAL (addr.offset));
7187 return true;
7188
7189 case ADDRESS_REG_REG:
7190 if (addr.shift == 0)
7191 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7192 reg_names [REGNO (addr.offset)]);
7193 else
7194 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7195 reg_names [REGNO (addr.offset)], addr.shift);
7196 return true;
7197
7198 case ADDRESS_REG_UXTW:
7199 if (addr.shift == 0)
7200 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7201 REGNO (addr.offset) - R0_REGNUM);
7202 else
7203 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7204 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7205 return true;
7206
7207 case ADDRESS_REG_SXTW:
7208 if (addr.shift == 0)
7209 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7210 REGNO (addr.offset) - R0_REGNUM);
7211 else
7212 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7213 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7214 return true;
7215
7216 case ADDRESS_REG_WB:
7217 /* Writeback is only supported for fixed-width modes. */
7218 size = GET_MODE_SIZE (mode).to_constant ();
7219 switch (GET_CODE (x))
7220 {
7221 case PRE_INC:
7222 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7223 return true;
7224 case POST_INC:
7225 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7226 return true;
7227 case PRE_DEC:
7228 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7229 return true;
7230 case POST_DEC:
7231 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7232 return true;
7233 case PRE_MODIFY:
7234 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7235 INTVAL (addr.offset));
7236 return true;
7237 case POST_MODIFY:
7238 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7239 INTVAL (addr.offset));
7240 return true;
7241 default:
7242 break;
7243 }
7244 break;
7245
7246 case ADDRESS_LO_SUM:
7247 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7248 output_addr_const (f, addr.offset);
7249 asm_fprintf (f, "]");
7250 return true;
7251
7252 case ADDRESS_SYMBOLIC:
7253 output_addr_const (f, x);
7254 return true;
7255 }
7256
7257 return false;
7258 }
7259
7260 /* Print address 'x' of a memory access with mode 'mode'. */
7261 static void
7262 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7263 {
7264 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7265 output_addr_const (f, x);
7266 }
7267
7268 bool
7269 aarch64_label_mentioned_p (rtx x)
7270 {
7271 const char *fmt;
7272 int i;
7273
7274 if (GET_CODE (x) == LABEL_REF)
7275 return true;
7276
7277 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7278 referencing instruction, but they are constant offsets, not
7279 symbols. */
7280 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7281 return false;
7282
7283 fmt = GET_RTX_FORMAT (GET_CODE (x));
7284 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7285 {
7286 if (fmt[i] == 'E')
7287 {
7288 int j;
7289
7290 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7291 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7292 return 1;
7293 }
7294 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7295 return 1;
7296 }
7297
7298 return 0;
7299 }
7300
7301 /* Implement REGNO_REG_CLASS. */
7302
7303 enum reg_class
7304 aarch64_regno_regclass (unsigned regno)
7305 {
7306 if (GP_REGNUM_P (regno))
7307 return GENERAL_REGS;
7308
7309 if (regno == SP_REGNUM)
7310 return STACK_REG;
7311
7312 if (regno == FRAME_POINTER_REGNUM
7313 || regno == ARG_POINTER_REGNUM)
7314 return POINTER_REGS;
7315
7316 if (FP_REGNUM_P (regno))
7317 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7318
7319 if (PR_REGNUM_P (regno))
7320 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7321
7322 return NO_REGS;
7323 }
7324
7325 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7326 If OFFSET is out of range, return an offset of an anchor point
7327 that is in range. Return 0 otherwise. */
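/* Worked example (illustration only, derived from the cases below): for a
   4-byte access at offset 0x5004, the offset is a multiple of the access
   size but too large for the scaled 12-bit range, so the final case
   applies and 0x5004 & (~0xfff * 4) == 0x4000 is returned as the anchor,
   leaving a residual offset of 0x1004 that a normal LDR/STR immediate
   can encode.  */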
7328
7329 static HOST_WIDE_INT
7330 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7331 machine_mode mode)
7332 {
7333 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7334 if (size > 16)
7335 return (offset + 0x400) & ~0x7f0;
7336
7337 /* For offsets that aren't a multiple of the access size, the limit is
7338 -256...255. */
7339 if (offset & (size - 1))
7340 {
7341 /* BLKmode typically uses LDP of X-registers. */
7342 if (mode == BLKmode)
7343 return (offset + 512) & ~0x3ff;
7344 return (offset + 0x100) & ~0x1ff;
7345 }
7346
7347 /* Small negative offsets are supported. */
7348 if (IN_RANGE (offset, -256, 0))
7349 return 0;
7350
7351 if (mode == TImode || mode == TFmode)
7352 return (offset + 0x100) & ~0x1ff;
7353
7354 /* Otherwise use the unsigned 12-bit offset scaled by the access size. */
7355 return offset & (~0xfff * size);
7356 }
7357
7358 static rtx
7359 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7360 {
7361 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7362 where mask is selected by alignment and size of the offset.
7363 We try to pick as large a range for the offset as possible to
7364 maximize the chance of a CSE. However, for aligned addresses
7365 we limit the range to 4k so that structures with different sized
7366 elements are likely to use the same base. We need to be careful
7367 not to split a CONST for some forms of address expression, otherwise
7368 it will generate sub-optimal code. */
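/* For instance (a sketch, not part of the original comment): an SImode
   access to reg + 0x5004 is split into tmp = reg + 0x4000 followed by a
   memory access at tmp + 0x1004, so a neighbouring access such as
   reg + 0x5008 can CSE the same tmp.  */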
7369
7370 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7371 {
7372 rtx base = XEXP (x, 0);
7373 rtx offset_rtx = XEXP (x, 1);
7374 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7375
7376 if (GET_CODE (base) == PLUS)
7377 {
7378 rtx op0 = XEXP (base, 0);
7379 rtx op1 = XEXP (base, 1);
7380
7381 /* Force any scaling into a temp for CSE. */
7382 op0 = force_reg (Pmode, op0);
7383 op1 = force_reg (Pmode, op1);
7384
7385 /* Let the pointer register be in op0. */
7386 if (REG_POINTER (op1))
7387 std::swap (op0, op1);
7388
7389 /* If the pointer is virtual or frame related, then we know that
7390 virtual register instantiation or register elimination is going
7391 to apply a second constant. We want the two constants folded
7392 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7393 if (virt_or_elim_regno_p (REGNO (op0)))
7394 {
7395 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7396 NULL_RTX, true, OPTAB_DIRECT);
7397 return gen_rtx_PLUS (Pmode, base, op1);
7398 }
7399
7400 /* Otherwise, in order to encourage CSE (and thence loop strength
7401 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7402 base = expand_binop (Pmode, add_optab, op0, op1,
7403 NULL_RTX, true, OPTAB_DIRECT);
7404 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7405 }
7406
7407 HOST_WIDE_INT size;
7408 if (GET_MODE_SIZE (mode).is_constant (&size))
7409 {
7410 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7411 mode);
7412 if (base_offset != 0)
7413 {
7414 base = plus_constant (Pmode, base, base_offset);
7415 base = force_operand (base, NULL_RTX);
7416 return plus_constant (Pmode, base, offset - base_offset);
7417 }
7418 }
7419 }
7420
7421 return x;
7422 }
7423
7424 /* Return the reload icode required for a constant pool entry in mode MODE. */
7425 static enum insn_code
7426 aarch64_constant_pool_reload_icode (machine_mode mode)
7427 {
7428 switch (mode)
7429 {
7430 case E_SFmode:
7431 return CODE_FOR_aarch64_reload_movcpsfdi;
7432
7433 case E_DFmode:
7434 return CODE_FOR_aarch64_reload_movcpdfdi;
7435
7436 case E_TFmode:
7437 return CODE_FOR_aarch64_reload_movcptfdi;
7438
7439 case E_V8QImode:
7440 return CODE_FOR_aarch64_reload_movcpv8qidi;
7441
7442 case E_V16QImode:
7443 return CODE_FOR_aarch64_reload_movcpv16qidi;
7444
7445 case E_V4HImode:
7446 return CODE_FOR_aarch64_reload_movcpv4hidi;
7447
7448 case E_V8HImode:
7449 return CODE_FOR_aarch64_reload_movcpv8hidi;
7450
7451 case E_V2SImode:
7452 return CODE_FOR_aarch64_reload_movcpv2sidi;
7453
7454 case E_V4SImode:
7455 return CODE_FOR_aarch64_reload_movcpv4sidi;
7456
7457 case E_V2DImode:
7458 return CODE_FOR_aarch64_reload_movcpv2didi;
7459
7460 case E_V2DFmode:
7461 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7462
7463 default:
7464 gcc_unreachable ();
7465 }
7466
7467 gcc_unreachable ();
7468 }
7469 static reg_class_t
7470 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7471 reg_class_t rclass,
7472 machine_mode mode,
7473 secondary_reload_info *sri)
7474 {
7475 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7476 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7477 comment at the head of aarch64-sve.md for more details about the
7478 big-endian handling. */
7479 if (BYTES_BIG_ENDIAN
7480 && reg_class_subset_p (rclass, FP_REGS)
7481 && !((REG_P (x) && HARD_REGISTER_P (x))
7482 || aarch64_simd_valid_immediate (x, NULL))
7483 && aarch64_sve_data_mode_p (mode))
7484 {
7485 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7486 return NO_REGS;
7487 }
7488
7489 /* If we have to disable direct literal pool loads and stores because the
7490 function is too big, then we need a scratch register. */
7491 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7492 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7493 || targetm.vector_mode_supported_p (GET_MODE (x)))
7494 && !aarch64_pcrelative_literal_loads)
7495 {
7496 sri->icode = aarch64_constant_pool_reload_icode (mode);
7497 return NO_REGS;
7498 }
7499
7500 /* Without the TARGET_SIMD instructions we cannot move a Q register
7501 to a Q register directly. We need a scratch. */
7502 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7503 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7504 && reg_class_subset_p (rclass, FP_REGS))
7505 {
7506 if (mode == TFmode)
7507 sri->icode = CODE_FOR_aarch64_reload_movtf;
7508 else if (mode == TImode)
7509 sri->icode = CODE_FOR_aarch64_reload_movti;
7510 return NO_REGS;
7511 }
7512
7513 /* A TFmode or TImode memory access should be handled via FP_REGS
7514 because AArch64 has richer addressing modes for LDR/STR instructions
7515 than LDP/STP instructions. */
7516 if (TARGET_FLOAT && rclass == GENERAL_REGS
7517 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7518 return FP_REGS;
7519
7520 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7521 return GENERAL_REGS;
7522
7523 return NO_REGS;
7524 }
7525
7526 static bool
7527 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7528 {
7529 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7530
7531 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7532 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7533 if (frame_pointer_needed)
7534 return to == HARD_FRAME_POINTER_REGNUM;
7535 return true;
7536 }
7537
7538 poly_int64
7539 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7540 {
7541 aarch64_layout_frame ();
7542
7543 if (to == HARD_FRAME_POINTER_REGNUM)
7544 {
7545 if (from == ARG_POINTER_REGNUM)
7546 return cfun->machine->frame.hard_fp_offset;
7547
7548 if (from == FRAME_POINTER_REGNUM)
7549 return cfun->machine->frame.hard_fp_offset
7550 - cfun->machine->frame.locals_offset;
7551 }
7552
7553 if (to == STACK_POINTER_REGNUM)
7554 {
7555 if (from == FRAME_POINTER_REGNUM)
7556 return cfun->machine->frame.frame_size
7557 - cfun->machine->frame.locals_offset;
7558 }
7559
7560 return cfun->machine->frame.frame_size;
7561 }
7562
7563 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7564 previous frame. */
7565
7566 rtx
7567 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7568 {
7569 if (count != 0)
7570 return const0_rtx;
7571 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7572 }
7573
7574
7575 static void
7576 aarch64_asm_trampoline_template (FILE *f)
7577 {
7578 if (TARGET_ILP32)
7579 {
7580 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7581 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7582 }
7583 else
7584 {
7585 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7586 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7587 }
7588 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7589 assemble_aligned_integer (4, const0_rtx);
7590 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7591 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7592 }
7593
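/* Initialise the trampoline described by the template above: 16 bytes of
   code (including padding) followed by two pointer-sized data slots.  Copy
   the code part and then fill in the target function address and the
   static chain value.  */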
7594 static void
7595 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7596 {
7597 rtx fnaddr, mem, a_tramp;
7598 const int tramp_code_sz = 16;
7599
7600 /* Don't need to copy the trailing D-words; we fill those in below. */
7601 emit_block_move (m_tramp, assemble_trampoline_template (),
7602 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7603 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7604 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7605 if (GET_MODE (fnaddr) != ptr_mode)
7606 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7607 emit_move_insn (mem, fnaddr);
7608
7609 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7610 emit_move_insn (mem, chain_value);
7611
7612 /* XXX We should really define a "clear_cache" pattern and use
7613 gen_clear_cache(). */
7614 a_tramp = XEXP (m_tramp, 0);
7615 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7616 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7617 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7618 ptr_mode);
7619 }
7620
7621 static unsigned char
7622 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7623 {
7624 /* ??? Logically we should only need to provide a value when
7625 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7626 can hold MODE, but at the moment we need to handle all modes.
7627 Just ignore any runtime parts for registers that can't store them. */
7628 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7629 unsigned int nregs;
7630 switch (regclass)
7631 {
7632 case TAILCALL_ADDR_REGS:
7633 case POINTER_REGS:
7634 case GENERAL_REGS:
7635 case ALL_REGS:
7636 case POINTER_AND_FP_REGS:
7637 case FP_REGS:
7638 case FP_LO_REGS:
7639 if (aarch64_sve_data_mode_p (mode)
7640 && constant_multiple_p (GET_MODE_SIZE (mode),
7641 BYTES_PER_SVE_VECTOR, &nregs))
7642 return nregs;
7643 return (aarch64_vector_data_mode_p (mode)
7644 ? CEIL (lowest_size, UNITS_PER_VREG)
7645 : CEIL (lowest_size, UNITS_PER_WORD));
7646 case STACK_REG:
7647 case PR_REGS:
7648 case PR_LO_REGS:
7649 case PR_HI_REGS:
7650 return 1;
7651
7652 case NO_REGS:
7653 return 0;
7654
7655 default:
7656 break;
7657 }
7658 gcc_unreachable ();
7659 }
7660
7661 static reg_class_t
7662 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7663 {
7664 if (regclass == POINTER_REGS)
7665 return GENERAL_REGS;
7666
7667 if (regclass == STACK_REG)
7668 {
7669 if (REG_P(x)
7670 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7671 return regclass;
7672
7673 return NO_REGS;
7674 }
7675
7676 /* Register elimination can result in a request for
7677 SP+constant->FP_REGS. We cannot support such operations, which
7678 use SP as source and an FP_REG as destination, so reject them
7679 right now. */
7680 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7681 {
7682 rtx lhs = XEXP (x, 0);
7683
7684 /* Look through a possible SUBREG introduced by ILP32. */
7685 if (GET_CODE (lhs) == SUBREG)
7686 lhs = SUBREG_REG (lhs);
7687
7688 gcc_assert (REG_P (lhs));
7689 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7690 POINTER_REGS));
7691 return NO_REGS;
7692 }
7693
7694 return regclass;
7695 }
7696
7697 void
7698 aarch64_asm_output_labelref (FILE* f, const char *name)
7699 {
7700 asm_fprintf (f, "%U%s", name);
7701 }
7702
7703 static void
7704 aarch64_elf_asm_constructor (rtx symbol, int priority)
7705 {
7706 if (priority == DEFAULT_INIT_PRIORITY)
7707 default_ctor_section_asm_out_constructor (symbol, priority);
7708 else
7709 {
7710 section *s;
7711 /* Priority is known to be in the range [0, 65535], so 18 bytes
7712 would be enough, but the compiler might not know that. To avoid
7713 a -Wformat-truncation false positive, use a larger size. */
7714 char buf[23];
7715 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7716 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7717 switch_to_section (s);
7718 assemble_align (POINTER_SIZE);
7719 assemble_aligned_integer (POINTER_BYTES, symbol);
7720 }
7721 }
7722
7723 static void
7724 aarch64_elf_asm_destructor (rtx symbol, int priority)
7725 {
7726 if (priority == DEFAULT_INIT_PRIORITY)
7727 default_dtor_section_asm_out_destructor (symbol, priority);
7728 else
7729 {
7730 section *s;
7731 /* Priority is known to be in the range [0, 65535], so 18 bytes
7732 would be enough, but the compiler might not know that. To avoid
7733 a -Wformat-truncation false positive, use a larger size. */
7734 char buf[23];
7735 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7736 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7737 switch_to_section (s);
7738 assemble_align (POINTER_SIZE);
7739 assemble_aligned_integer (POINTER_BYTES, symbol);
7740 }
7741 }
7742
7743 const char*
7744 aarch64_output_casesi (rtx *operands)
7745 {
7746 char buf[100];
7747 char label[100];
7748 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7749 int index;
7750 static const char *const patterns[4][2] =
7751 {
7752 {
7753 "ldrb\t%w3, [%0,%w1,uxtw]",
7754 "add\t%3, %4, %w3, sxtb #2"
7755 },
7756 {
7757 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7758 "add\t%3, %4, %w3, sxth #2"
7759 },
7760 {
7761 "ldr\t%w3, [%0,%w1,uxtw #2]",
7762 "add\t%3, %4, %w3, sxtw #2"
7763 },
7764 /* We assume that DImode is only generated when not optimizing and
7765 that we don't really need 64-bit address offsets. That would
7766 imply an object file with 8GB of code in a single function! */
7767 {
7768 "ldr\t%w3, [%0,%w1,uxtw #2]",
7769 "add\t%3, %4, %w3, sxtw #2"
7770 }
7771 };
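/* For a half-word table, for example, the code emitted below is:
	ldrh	%w3, [%0,%w1,uxtw #1]
	adr	%4, .Lrtx<N>
	add	%3, %4, %w3, sxth #2
	br	%3
   followed by the internal label that the ADR references.  */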
7772
7773 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7774
7775 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7776 index = exact_log2 (GET_MODE_SIZE (mode));
7777
7778 gcc_assert (index >= 0 && index <= 3);
7779
7780 /* Need to implement table size reduction by changing the code below. */
7781 output_asm_insn (patterns[index][0], operands);
7782 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7783 snprintf (buf, sizeof (buf),
7784 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7785 output_asm_insn (buf, operands);
7786 output_asm_insn (patterns[index][1], operands);
7787 output_asm_insn ("br\t%3", operands);
7788 assemble_label (asm_out_file, label);
7789 return "";
7790 }
7791
7792
7793 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7794 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7795 operator. */
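/* For example (illustrative): aarch64_uxt_size (2, 0x3fc) returns 8,
   since 0x3fc == 0xff << 2 and so matches a UXTB combined with LSL #2;
   a mask that is not a shifted 0xff/0xffff/0xffffffff yields 0.  */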
7796
7797 int
7798 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7799 {
7800 if (shift >= 0 && shift <= 3)
7801 {
7802 int size;
7803 for (size = 8; size <= 32; size *= 2)
7804 {
7805 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7806 if (mask == bits << shift)
7807 return size;
7808 }
7809 }
7810 return 0;
7811 }
7812
7813 /* Constant pools are per-function only when PC-relative
7814 literal loads are enabled or we are in the large memory
7815 model. */
7816
7817 static inline bool
7818 aarch64_can_use_per_function_literal_pools_p (void)
7819 {
7820 return (aarch64_pcrelative_literal_loads
7821 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7822 }
7823
7824 static bool
7825 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7826 {
7827 /* We can't use blocks for constants when we're using a per-function
7828 constant pool. */
7829 return !aarch64_can_use_per_function_literal_pools_p ();
7830 }
7831
7832 /* Select appropriate section for constants depending
7833 on where we place literal pools. */
7834
7835 static section *
7836 aarch64_select_rtx_section (machine_mode mode,
7837 rtx x,
7838 unsigned HOST_WIDE_INT align)
7839 {
7840 if (aarch64_can_use_per_function_literal_pools_p ())
7841 return function_section (current_function_decl);
7842
7843 return default_elf_select_rtx_section (mode, x, align);
7844 }
7845
7846 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7847 void
7848 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7849 HOST_WIDE_INT offset)
7850 {
7851 /* When using per-function literal pools, we must ensure that any code
7852 section is aligned to the minimal instruction length, lest we get
7853 errors from the assembler re "unaligned instructions". */
7854 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7855 ASM_OUTPUT_ALIGN (f, 2);
7856 }
7857
7858 /* Costs. */
7859
7860 /* Helper function for rtx cost calculation. Strip a shift expression
7861 from X. Returns the inner operand if successful, or the original
7862 expression on failure. */
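/* For example, (ashift (reg R) (const_int 3)) and (mult (reg R) (const_int 8))
   both strip down to (reg R); a MULT only qualifies when its constant is a
   power of two, since it is then equivalent to a shift.  */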
7863 static rtx
7864 aarch64_strip_shift (rtx x)
7865 {
7866 rtx op = x;
7867
7868 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7869 we can convert both to ROR during final output. */
7870 if ((GET_CODE (op) == ASHIFT
7871 || GET_CODE (op) == ASHIFTRT
7872 || GET_CODE (op) == LSHIFTRT
7873 || GET_CODE (op) == ROTATERT
7874 || GET_CODE (op) == ROTATE)
7875 && CONST_INT_P (XEXP (op, 1)))
7876 return XEXP (op, 0);
7877
7878 if (GET_CODE (op) == MULT
7879 && CONST_INT_P (XEXP (op, 1))
7880 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7881 return XEXP (op, 0);
7882
7883 return x;
7884 }
7885
7886 /* Helper function for rtx cost calculation. Strip an extend
7887 expression from X. Returns the inner operand if successful, or the
7888 original expression on failure. We deal with a number of possible
7889 canonicalization variations here. If STRIP_SHIFT is true, then
7890 we can strip off a shift also. */
7891 static rtx
7892 aarch64_strip_extend (rtx x, bool strip_shift)
7893 {
7894 scalar_int_mode mode;
7895 rtx op = x;
7896
7897 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7898 return op;
7899
7900 /* Zero and sign extraction of a widened value. */
7901 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7902 && XEXP (op, 2) == const0_rtx
7903 && GET_CODE (XEXP (op, 0)) == MULT
7904 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7905 XEXP (op, 1)))
7906 return XEXP (XEXP (op, 0), 0);
7907
7908 /* It can also be represented (for zero-extend) as an AND with an
7909 immediate. */
7910 if (GET_CODE (op) == AND
7911 && GET_CODE (XEXP (op, 0)) == MULT
7912 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7913 && CONST_INT_P (XEXP (op, 1))
7914 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7915 INTVAL (XEXP (op, 1))) != 0)
7916 return XEXP (XEXP (op, 0), 0);
7917
7918 /* Now handle extended register, as this may also have an optional
7919 left shift by 1..4. */
7920 if (strip_shift
7921 && GET_CODE (op) == ASHIFT
7922 && CONST_INT_P (XEXP (op, 1))
7923 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7924 op = XEXP (op, 0);
7925
7926 if (GET_CODE (op) == ZERO_EXTEND
7927 || GET_CODE (op) == SIGN_EXTEND)
7928 op = XEXP (op, 0);
7929
7930 if (op != x)
7931 return op;
7932
7933 return x;
7934 }
7935
7936 /* Return true iff CODE is a shift supported in combination
7937 with arithmetic instructions. */
7938
7939 static bool
7940 aarch64_shift_p (enum rtx_code code)
7941 {
7942 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7943 }
7944
7945
7946 /* Return true iff X is a cheap shift without a sign extend. */
7947
7948 static bool
7949 aarch64_cheap_mult_shift_p (rtx x)
7950 {
7951 rtx op0, op1;
7952
7953 op0 = XEXP (x, 0);
7954 op1 = XEXP (x, 1);
7955
7956 if (!(aarch64_tune_params.extra_tuning_flags
7957 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7958 return false;
7959
7960 if (GET_CODE (op0) == SIGN_EXTEND)
7961 return false;
7962
7963 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7964 && UINTVAL (op1) <= 4)
7965 return true;
7966
7967 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7968 return false;
7969
7970 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7971
7972 if (l2 > 0 && l2 <= 4)
7973 return true;
7974
7975 return false;
7976 }
7977
7978 /* Helper function for rtx cost calculation. Calculate the cost of
7979 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7980 Return the calculated cost of the expression, recursing manually in to
7981 operands where needed. */
7982
7983 static int
7984 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7985 {
7986 rtx op0, op1;
7987 const struct cpu_cost_table *extra_cost
7988 = aarch64_tune_params.insn_extra_cost;
7989 int cost = 0;
7990 bool compound_p = (outer == PLUS || outer == MINUS);
7991 machine_mode mode = GET_MODE (x);
7992
7993 gcc_checking_assert (code == MULT);
7994
7995 op0 = XEXP (x, 0);
7996 op1 = XEXP (x, 1);
7997
7998 if (VECTOR_MODE_P (mode))
7999 mode = GET_MODE_INNER (mode);
8000
8001 /* Integer multiply/fma. */
8002 if (GET_MODE_CLASS (mode) == MODE_INT)
8003 {
8004 /* The multiply will be canonicalized as a shift, cost it as such. */
8005 if (aarch64_shift_p (GET_CODE (x))
8006 || (CONST_INT_P (op1)
8007 && exact_log2 (INTVAL (op1)) > 0))
8008 {
8009 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8010 || GET_CODE (op0) == SIGN_EXTEND;
8011 if (speed)
8012 {
8013 if (compound_p)
8014 {
8015 /* If the shift is considered cheap,
8016 then don't add any cost. */
8017 if (aarch64_cheap_mult_shift_p (x))
8018 ;
8019 else if (REG_P (op1))
8020 /* ARITH + shift-by-register. */
8021 cost += extra_cost->alu.arith_shift_reg;
8022 else if (is_extend)
8023 /* ARITH + extended register. We don't have a cost field
8024 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8025 cost += extra_cost->alu.extend_arith;
8026 else
8027 /* ARITH + shift-by-immediate. */
8028 cost += extra_cost->alu.arith_shift;
8029 }
8030 else
8031 /* LSL (immediate). */
8032 cost += extra_cost->alu.shift;
8033
8034 }
8035 /* Strip extends as we will have costed them in the case above. */
8036 if (is_extend)
8037 op0 = aarch64_strip_extend (op0, true);
8038
8039 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8040
8041 return cost;
8042 }
8043
8044 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8045 compound and let the below cases handle it. After all, MNEG is a
8046 special-case alias of MSUB. */
8047 if (GET_CODE (op0) == NEG)
8048 {
8049 op0 = XEXP (op0, 0);
8050 compound_p = true;
8051 }
8052
8053 /* Integer multiplies or FMAs have zero/sign extending variants. */
8054 if ((GET_CODE (op0) == ZERO_EXTEND
8055 && GET_CODE (op1) == ZERO_EXTEND)
8056 || (GET_CODE (op0) == SIGN_EXTEND
8057 && GET_CODE (op1) == SIGN_EXTEND))
8058 {
8059 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8060 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8061
8062 if (speed)
8063 {
8064 if (compound_p)
8065 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8066 cost += extra_cost->mult[0].extend_add;
8067 else
8068 /* MUL/SMULL/UMULL. */
8069 cost += extra_cost->mult[0].extend;
8070 }
8071
8072 return cost;
8073 }
8074
8075 /* This is either an integer multiply or a MADD. In both cases
8076 we want to recurse and cost the operands. */
8077 cost += rtx_cost (op0, mode, MULT, 0, speed);
8078 cost += rtx_cost (op1, mode, MULT, 1, speed);
8079
8080 if (speed)
8081 {
8082 if (compound_p)
8083 /* MADD/MSUB. */
8084 cost += extra_cost->mult[mode == DImode].add;
8085 else
8086 /* MUL. */
8087 cost += extra_cost->mult[mode == DImode].simple;
8088 }
8089
8090 return cost;
8091 }
8092 else
8093 {
8094 if (speed)
8095 {
8096 /* Floating-point FMA/FMUL can also support negations of the
8097 operands, unless the rounding mode is upward or downward, in
8098 which case FNMUL is different from FMUL with operand negation. */
8099 bool neg0 = GET_CODE (op0) == NEG;
8100 bool neg1 = GET_CODE (op1) == NEG;
8101 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8102 {
8103 if (neg0)
8104 op0 = XEXP (op0, 0);
8105 if (neg1)
8106 op1 = XEXP (op1, 0);
8107 }
8108
8109 if (compound_p)
8110 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8111 cost += extra_cost->fp[mode == DFmode].fma;
8112 else
8113 /* FMUL/FNMUL. */
8114 cost += extra_cost->fp[mode == DFmode].mult;
8115 }
8116
8117 cost += rtx_cost (op0, mode, MULT, 0, speed);
8118 cost += rtx_cost (op1, mode, MULT, 1, speed);
8119 return cost;
8120 }
8121 }
8122
8123 static int
8124 aarch64_address_cost (rtx x,
8125 machine_mode mode,
8126 addr_space_t as ATTRIBUTE_UNUSED,
8127 bool speed)
8128 {
8129 enum rtx_code c = GET_CODE (x);
8130 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8131 struct aarch64_address_info info;
8132 int cost = 0;
8133 info.shift = 0;
8134
8135 if (!aarch64_classify_address (&info, x, mode, false))
8136 {
8137 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8138 {
8139 /* This is a CONST or SYMBOL ref which will be split
8140 in a different way depending on the code model in use.
8141 Cost it through the generic infrastructure. */
8142 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8143 /* Divide through by the cost of one instruction to
8144 bring it to the same units as the address costs. */
8145 cost_symbol_ref /= COSTS_N_INSNS (1);
8146 /* The cost is then the cost of preparing the address,
8147 followed by an immediate (possibly 0) offset. */
8148 return cost_symbol_ref + addr_cost->imm_offset;
8149 }
8150 else
8151 {
8152 /* This is most likely a jump table from a case
8153 statement. */
8154 return addr_cost->register_offset;
8155 }
8156 }
8157
8158 switch (info.type)
8159 {
8160 case ADDRESS_LO_SUM:
8161 case ADDRESS_SYMBOLIC:
8162 case ADDRESS_REG_IMM:
8163 cost += addr_cost->imm_offset;
8164 break;
8165
8166 case ADDRESS_REG_WB:
8167 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8168 cost += addr_cost->pre_modify;
8169 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8170 cost += addr_cost->post_modify;
8171 else
8172 gcc_unreachable ();
8173
8174 break;
8175
8176 case ADDRESS_REG_REG:
8177 cost += addr_cost->register_offset;
8178 break;
8179
8180 case ADDRESS_REG_SXTW:
8181 cost += addr_cost->register_sextend;
8182 break;
8183
8184 case ADDRESS_REG_UXTW:
8185 cost += addr_cost->register_zextend;
8186 break;
8187
8188 default:
8189 gcc_unreachable ();
8190 }
8191
8192
8193 if (info.shift > 0)
8194 {
8195 /* For the sake of calculating the cost of the shifted register
8196 component, we can treat same sized modes in the same way. */
8197 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8198 cost += addr_cost->addr_scale_costs.hi;
8199 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8200 cost += addr_cost->addr_scale_costs.si;
8201 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8202 cost += addr_cost->addr_scale_costs.di;
8203 else
8204 /* We can't tell, or this is a 128-bit vector. */
8205 cost += addr_cost->addr_scale_costs.ti;
8206 }
8207
8208 return cost;
8209 }
8210
8211 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8212 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
8213 to be well predicted. */
8214
8215 int
8216 aarch64_branch_cost (bool speed_p, bool predictable_p)
8217 {
8218 /* When optimizing for speed, use the cost of unpredictable branches. */
8219 const struct cpu_branch_cost *branch_costs =
8220 aarch64_tune_params.branch_costs;
8221
8222 if (!speed_p || predictable_p)
8223 return branch_costs->predictable;
8224 else
8225 return branch_costs->unpredictable;
8226 }
8227
8228 /* Return true if the RTX X in mode MODE is a zero or sign extract
8229 usable in an ADD or SUB (extended register) instruction. */
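/* The simplest accepted form (for illustration) is (sign_extend (reg R)),
   which corresponds to the SXTB/SXTH/SXTW variants of ADD/SUB (extended
   register); the SIGN_EXTRACT/ZERO_EXTRACT case below handles the
   scaled-by-shift forms.  */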
8230 static bool
8231 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8232 {
8233 /* Catch add with a sign extract.
8234 This is add_<optab><mode>_multp2. */
8235 if (GET_CODE (x) == SIGN_EXTRACT
8236 || GET_CODE (x) == ZERO_EXTRACT)
8237 {
8238 rtx op0 = XEXP (x, 0);
8239 rtx op1 = XEXP (x, 1);
8240 rtx op2 = XEXP (x, 2);
8241
8242 if (GET_CODE (op0) == MULT
8243 && CONST_INT_P (op1)
8244 && op2 == const0_rtx
8245 && CONST_INT_P (XEXP (op0, 1))
8246 && aarch64_is_extend_from_extract (mode,
8247 XEXP (op0, 1),
8248 op1))
8249 {
8250 return true;
8251 }
8252 }
8253 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8254 No shift. */
8255 else if (GET_CODE (x) == SIGN_EXTEND
8256 || GET_CODE (x) == ZERO_EXTEND)
8257 return REG_P (XEXP (x, 0));
8258
8259 return false;
8260 }
8261
8262 static bool
8263 aarch64_frint_unspec_p (unsigned int u)
8264 {
8265 switch (u)
8266 {
8267 case UNSPEC_FRINTZ:
8268 case UNSPEC_FRINTP:
8269 case UNSPEC_FRINTM:
8270 case UNSPEC_FRINTA:
8271 case UNSPEC_FRINTN:
8272 case UNSPEC_FRINTX:
8273 case UNSPEC_FRINTI:
8274 return true;
8275
8276 default:
8277 return false;
8278 }
8279 }
8280
8281 /* Return true iff X is an rtx that will match an extr instruction
8282 i.e. as described in the *extr<mode>5_insn family of patterns.
8283 OP0 and OP1 will be set to the operands of the shifts involved
8284 on success and will be NULL_RTX otherwise. */
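/* For example, in DImode (ior (ashift a (const_int 16))
   (lshiftrt b (const_int 48))) passes the checks below, with *RES_OP0 set
   to a and *RES_OP1 set to b, because the two shift amounts sum to the
   mode bitsize.  */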
8285
8286 static bool
8287 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8288 {
8289 rtx op0, op1;
8290 scalar_int_mode mode;
8291 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8292 return false;
8293
8294 *res_op0 = NULL_RTX;
8295 *res_op1 = NULL_RTX;
8296
8297 if (GET_CODE (x) != IOR)
8298 return false;
8299
8300 op0 = XEXP (x, 0);
8301 op1 = XEXP (x, 1);
8302
8303 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8304 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8305 {
8306 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8307 if (GET_CODE (op1) == ASHIFT)
8308 std::swap (op0, op1);
8309
8310 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8311 return false;
8312
8313 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8314 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8315
8316 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8317 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8318 {
8319 *res_op0 = XEXP (op0, 0);
8320 *res_op1 = XEXP (op1, 0);
8321 return true;
8322 }
8323 }
8324
8325 return false;
8326 }
8327
8328 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8329 storing it in *COST. Result is true if the total cost of the operation
8330 has now been calculated. */
8331 static bool
8332 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8333 {
8334 rtx inner;
8335 rtx comparator;
8336 enum rtx_code cmpcode;
8337
8338 if (COMPARISON_P (op0))
8339 {
8340 inner = XEXP (op0, 0);
8341 comparator = XEXP (op0, 1);
8342 cmpcode = GET_CODE (op0);
8343 }
8344 else
8345 {
8346 inner = op0;
8347 comparator = const0_rtx;
8348 cmpcode = NE;
8349 }
8350
8351 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8352 {
8353 /* Conditional branch. */
8354 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8355 return true;
8356 else
8357 {
8358 if (cmpcode == NE || cmpcode == EQ)
8359 {
8360 if (comparator == const0_rtx)
8361 {
8362 /* TBZ/TBNZ/CBZ/CBNZ. */
8363 if (GET_CODE (inner) == ZERO_EXTRACT)
8364 /* TBZ/TBNZ. */
8365 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8366 ZERO_EXTRACT, 0, speed);
8367 else
8368 /* CBZ/CBNZ. */
8369 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8370
8371 return true;
8372 }
8373 }
8374 else if (cmpcode == LT || cmpcode == GE)
8375 {
8376 /* TBZ/TBNZ. */
8377 if (comparator == const0_rtx)
8378 return true;
8379 }
8380 }
8381 }
8382 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8383 {
8384 /* CCMP. */
8385 if (GET_CODE (op1) == COMPARE)
8386 {
8387 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8388 if (XEXP (op1, 1) == const0_rtx)
8389 *cost += 1;
8390 if (speed)
8391 {
8392 machine_mode mode = GET_MODE (XEXP (op1, 0));
8393 const struct cpu_cost_table *extra_cost
8394 = aarch64_tune_params.insn_extra_cost;
8395
8396 if (GET_MODE_CLASS (mode) == MODE_INT)
8397 *cost += extra_cost->alu.arith;
8398 else
8399 *cost += extra_cost->fp[mode == DFmode].compare;
8400 }
8401 return true;
8402 }
8403
8404 /* It's a conditional operation based on the status flags,
8405 so it must be some flavor of CSEL. */
8406
8407 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8408 if (GET_CODE (op1) == NEG
8409 || GET_CODE (op1) == NOT
8410 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8411 op1 = XEXP (op1, 0);
8412 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8413 {
8414 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8415 op1 = XEXP (op1, 0);
8416 op2 = XEXP (op2, 0);
8417 }
8418
8419 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8420 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8421 return true;
8422 }
8423
8424 /* We don't know what this is, cost all operands. */
8425 return false;
8426 }
8427
8428 /* Check whether X is a bitfield operation of the form shift + extend that
8429 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8430 operand to which the bitfield operation is applied. Otherwise return
8431 NULL_RTX. */
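/* For example, (zero_extend:SI (lshiftrt:HI (reg R) (const_int 3))) returns
   (reg R): a zero-extended right shift of a narrow mode is the UBFX-style
   form matched below.  */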
8432
8433 static rtx
8434 aarch64_extend_bitfield_pattern_p (rtx x)
8435 {
8436 rtx_code outer_code = GET_CODE (x);
8437 machine_mode outer_mode = GET_MODE (x);
8438
8439 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8440 && outer_mode != SImode && outer_mode != DImode)
8441 return NULL_RTX;
8442
8443 rtx inner = XEXP (x, 0);
8444 rtx_code inner_code = GET_CODE (inner);
8445 machine_mode inner_mode = GET_MODE (inner);
8446 rtx op = NULL_RTX;
8447
8448 switch (inner_code)
8449 {
8450 case ASHIFT:
8451 if (CONST_INT_P (XEXP (inner, 1))
8452 && (inner_mode == QImode || inner_mode == HImode))
8453 op = XEXP (inner, 0);
8454 break;
8455 case LSHIFTRT:
8456 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8457 && (inner_mode == QImode || inner_mode == HImode))
8458 op = XEXP (inner, 0);
8459 break;
8460 case ASHIFTRT:
8461 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8462 && (inner_mode == QImode || inner_mode == HImode))
8463 op = XEXP (inner, 0);
8464 break;
8465 default:
8466 break;
8467 }
8468
8469 return op;
8470 }
8471
8472 /* Return true if the mask and a shift amount from an RTX of the form
8473 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8474 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
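/* Illustrative example: in SImode, MASK == 0xff0 with SHFT_AMNT == 4
   satisfies the conditions below ((0xff0 >> 4) + 1 is a power of two and no
   mask bits fall below the shift), i.e. a UBFIZ inserting an 8-bit field at
   bit position 4.  */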
8475
8476 bool
8477 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8478 rtx shft_amnt)
8479 {
8480 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8481 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8482 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8483 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8484 }
8485
8486 /* Calculate the cost of calculating X, storing it in *COST. Result
8487 is true if the total cost of the operation has now been calculated. */
8488 static bool
8489 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8490 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8491 {
8492 rtx op0, op1, op2;
8493 const struct cpu_cost_table *extra_cost
8494 = aarch64_tune_params.insn_extra_cost;
8495 int code = GET_CODE (x);
8496 scalar_int_mode int_mode;
8497
8498 /* By default, assume that everything has equivalent cost to the
8499 cheapest instruction. Any additional costs are applied as a delta
8500 above this default. */
8501 *cost = COSTS_N_INSNS (1);
8502
8503 switch (code)
8504 {
8505 case SET:
8506 /* The cost depends entirely on the operands to SET. */
8507 *cost = 0;
8508 op0 = SET_DEST (x);
8509 op1 = SET_SRC (x);
8510
8511 switch (GET_CODE (op0))
8512 {
8513 case MEM:
8514 if (speed)
8515 {
8516 rtx address = XEXP (op0, 0);
8517 if (VECTOR_MODE_P (mode))
8518 *cost += extra_cost->ldst.storev;
8519 else if (GET_MODE_CLASS (mode) == MODE_INT)
8520 *cost += extra_cost->ldst.store;
8521 else if (mode == SFmode)
8522 *cost += extra_cost->ldst.storef;
8523 else if (mode == DFmode)
8524 *cost += extra_cost->ldst.stored;
8525
8526 *cost +=
8527 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8528 0, speed));
8529 }
8530
8531 *cost += rtx_cost (op1, mode, SET, 1, speed);
8532 return true;
8533
8534 case SUBREG:
8535 if (! REG_P (SUBREG_REG (op0)))
8536 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8537
8538 /* Fall through. */
8539 case REG:
8540 /* The cost is one per vector-register copied. */
8541 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8542 {
8543 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8544 *cost = COSTS_N_INSNS (nregs);
8545 }
8546 /* const0_rtx is in general free, but we will use an
8547 instruction to set a register to 0. */
8548 else if (REG_P (op1) || op1 == const0_rtx)
8549 {
8550 /* The cost is 1 per register copied. */
8551 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8552 *cost = COSTS_N_INSNS (nregs);
8553 }
8554 else
8555 /* Cost is just the cost of the RHS of the set. */
8556 *cost += rtx_cost (op1, mode, SET, 1, speed);
8557 return true;
8558
8559 case ZERO_EXTRACT:
8560 case SIGN_EXTRACT:
8561 /* Bit-field insertion. Strip any redundant widening of
8562 the RHS to meet the width of the target. */
8563 if (GET_CODE (op1) == SUBREG)
8564 op1 = SUBREG_REG (op1);
8565 if ((GET_CODE (op1) == ZERO_EXTEND
8566 || GET_CODE (op1) == SIGN_EXTEND)
8567 && CONST_INT_P (XEXP (op0, 1))
8568 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8569 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8570 op1 = XEXP (op1, 0);
8571
8572 if (CONST_INT_P (op1))
8573 {
8574 /* MOV immediate is assumed to always be cheap. */
8575 *cost = COSTS_N_INSNS (1);
8576 }
8577 else
8578 {
8579 /* BFM. */
8580 if (speed)
8581 *cost += extra_cost->alu.bfi;
8582 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8583 }
8584
8585 return true;
8586
8587 default:
8588 /* We can't make sense of this, assume default cost. */
8589 *cost = COSTS_N_INSNS (1);
8590 return false;
8591 }
8592 return false;
8593
8594 case CONST_INT:
8595 /* If an instruction can incorporate a constant within the
8596 instruction, the instruction's expression avoids calling
8597 rtx_cost() on the constant. If rtx_cost() is called on a
8598 constant, then it is usually because the constant must be
8599 moved into a register by one or more instructions.
8600
8601 The exception is constant 0, which can be expressed
8602 as XZR/WZR and is therefore free. The exception to this is
8603 if we have (set (reg) (const0_rtx)) in which case we must cost
8604 the move. However, we can catch that when we cost the SET, so
8605 we don't need to consider that here. */
8606 if (x == const0_rtx)
8607 *cost = 0;
8608 else
8609 {
8610 /* To an approximation, building any other constant is
8611 proportionally expensive to the number of instructions
8612 required to build that constant. This is true whether we
8613 are compiling for SPEED or otherwise. */
8614 if (!is_a <scalar_int_mode> (mode, &int_mode))
8615 int_mode = word_mode;
8616 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8617 (NULL_RTX, x, false, int_mode));
8618 }
8619 return true;
8620
8621 case CONST_DOUBLE:
8622
8623 /* First determine number of instructions to do the move
8624 as an integer constant. */
8625 if (!aarch64_float_const_representable_p (x)
8626 && !aarch64_can_const_movi_rtx_p (x, mode)
8627 && aarch64_float_const_rtx_p (x))
8628 {
8629 unsigned HOST_WIDE_INT ival;
8630 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8631 gcc_assert (succeed);
8632
8633 scalar_int_mode imode = (mode == HFmode
8634 ? SImode
8635 : int_mode_for_mode (mode).require ());
8636 int ncost = aarch64_internal_mov_immediate
8637 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8638 *cost += COSTS_N_INSNS (ncost);
8639 return true;
8640 }
8641
8642 if (speed)
8643 {
8644 /* mov[df,sf]_aarch64. */
8645 if (aarch64_float_const_representable_p (x))
8646 /* FMOV (scalar immediate). */
8647 *cost += extra_cost->fp[mode == DFmode].fpconst;
8648 else if (!aarch64_float_const_zero_rtx_p (x))
8649 {
8650 /* This will be a load from memory. */
8651 if (mode == DFmode)
8652 *cost += extra_cost->ldst.loadd;
8653 else
8654 *cost += extra_cost->ldst.loadf;
8655 }
8656 else
8657 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8658 or MOV v0.s[0], wzr - neither of which is modeled by the
8659 cost tables. Just use the default cost. */
8660 {
8661 }
8662 }
8663
8664 return true;
8665
8666 case MEM:
8667 if (speed)
8668 {
8669 /* For loads we want the base cost of a load, plus an
8670 approximation for the additional cost of the addressing
8671 mode. */
8672 rtx address = XEXP (x, 0);
8673 if (VECTOR_MODE_P (mode))
8674 *cost += extra_cost->ldst.loadv;
8675 else if (GET_MODE_CLASS (mode) == MODE_INT)
8676 *cost += extra_cost->ldst.load;
8677 else if (mode == SFmode)
8678 *cost += extra_cost->ldst.loadf;
8679 else if (mode == DFmode)
8680 *cost += extra_cost->ldst.loadd;
8681
8682 *cost +=
8683 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8684 0, speed));
8685 }
8686
8687 return true;
8688
8689 case NEG:
8690 op0 = XEXP (x, 0);
8691
8692 if (VECTOR_MODE_P (mode))
8693 {
8694 if (speed)
8695 {
8696 /* FNEG. */
8697 *cost += extra_cost->vect.alu;
8698 }
8699 return false;
8700 }
8701
8702 if (GET_MODE_CLASS (mode) == MODE_INT)
8703 {
8704 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8705 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8706 {
8707 /* CSETM. */
8708 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8709 return true;
8710 }
8711
8712 /* Cost this as SUB wzr, X. */
8713 op0 = CONST0_RTX (mode);
8714 op1 = XEXP (x, 0);
8715 goto cost_minus;
8716 }
8717
8718 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8719 {
8720 /* Support (neg(fma...)) as a single instruction only if
8721 sign of zeros is unimportant. This matches the decision
8722 making in aarch64.md. */
8723 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8724 {
8725 /* FNMADD. */
8726 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8727 return true;
8728 }
8729 if (GET_CODE (op0) == MULT)
8730 {
8731 /* FNMUL. */
8732 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8733 return true;
8734 }
8735 if (speed)
8736 /* FNEG. */
8737 *cost += extra_cost->fp[mode == DFmode].neg;
8738 return false;
8739 }
8740
8741 return false;
8742
8743 case CLRSB:
8744 case CLZ:
8745 if (speed)
8746 {
8747 if (VECTOR_MODE_P (mode))
8748 *cost += extra_cost->vect.alu;
8749 else
8750 *cost += extra_cost->alu.clz;
8751 }
8752
8753 return false;
8754
8755 case COMPARE:
8756 op0 = XEXP (x, 0);
8757 op1 = XEXP (x, 1);
8758
8759 if (op1 == const0_rtx
8760 && GET_CODE (op0) == AND)
8761 {
8762 x = op0;
8763 mode = GET_MODE (op0);
8764 goto cost_logic;
8765 }
8766
8767 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8768 {
8769 /* TODO: A write to the CC flags possibly costs extra; this
8770 needs encoding in the cost tables. */
8771
8772 mode = GET_MODE (op0);
8773 /* ANDS. */
8774 if (GET_CODE (op0) == AND)
8775 {
8776 x = op0;
8777 goto cost_logic;
8778 }
8779
8780 if (GET_CODE (op0) == PLUS)
8781 {
8782 /* ADDS (and CMN alias). */
8783 x = op0;
8784 goto cost_plus;
8785 }
8786
8787 if (GET_CODE (op0) == MINUS)
8788 {
8789 /* SUBS. */
8790 x = op0;
8791 goto cost_minus;
8792 }
8793
8794 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8795 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8796 && CONST_INT_P (XEXP (op0, 2)))
8797 {
8798 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8799 Handle it here directly rather than going to cost_logic
8800 since we know the immediate generated for the TST is valid
8801 so we can avoid creating an intermediate rtx for it only
8802 for costing purposes. */
8803 if (speed)
8804 *cost += extra_cost->alu.logical;
8805
8806 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8807 ZERO_EXTRACT, 0, speed);
8808 return true;
8809 }
8810
8811 if (GET_CODE (op1) == NEG)
8812 {
8813 /* CMN. */
8814 if (speed)
8815 *cost += extra_cost->alu.arith;
8816
8817 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8818 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8819 return true;
8820 }
8821
8822 /* CMP.
8823
8824 Compare can freely swap the order of operands, and
8825 canonicalization puts the more complex operation first.
8826 But the integer MINUS logic expects the shift/extend
8827 operation in op1. */
8828 if (! (REG_P (op0)
8829 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8830 {
8831 op0 = XEXP (x, 1);
8832 op1 = XEXP (x, 0);
8833 }
8834 goto cost_minus;
8835 }
8836
8837 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8838 {
8839 /* FCMP. */
8840 if (speed)
8841 *cost += extra_cost->fp[mode == DFmode].compare;
8842
8843 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8844 {
8845 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8846 /* FCMP supports constant 0.0 for no extra cost. */
8847 return true;
8848 }
8849 return false;
8850 }
8851
8852 if (VECTOR_MODE_P (mode))
8853 {
8854 /* Vector compare. */
8855 if (speed)
8856 *cost += extra_cost->vect.alu;
8857
8858 if (aarch64_float_const_zero_rtx_p (op1))
8859 {
8860 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8861 cost. */
8862 return true;
8863 }
8864 return false;
8865 }
8866 return false;
8867
8868 case MINUS:
8869 {
8870 op0 = XEXP (x, 0);
8871 op1 = XEXP (x, 1);
8872
8873 cost_minus:
8874 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8875
8876 /* Detect valid immediates. */
8877 if ((GET_MODE_CLASS (mode) == MODE_INT
8878 || (GET_MODE_CLASS (mode) == MODE_CC
8879 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8880 && CONST_INT_P (op1)
8881 && aarch64_uimm12_shift (INTVAL (op1)))
8882 {
8883 if (speed)
8884 /* SUB(S) (immediate). */
8885 *cost += extra_cost->alu.arith;
8886 return true;
8887 }
8888
8889 /* Look for SUB (extended register). */
8890 if (is_a <scalar_int_mode> (mode, &int_mode)
8891 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8892 {
8893 if (speed)
8894 *cost += extra_cost->alu.extend_arith;
8895
8896 op1 = aarch64_strip_extend (op1, true);
8897 *cost += rtx_cost (op1, VOIDmode,
8898 (enum rtx_code) GET_CODE (op1), 0, speed);
8899 return true;
8900 }
8901
8902 rtx new_op1 = aarch64_strip_extend (op1, false);
8903
8904 /* Cost this as an FMA-alike operation. */
8905 if ((GET_CODE (new_op1) == MULT
8906 || aarch64_shift_p (GET_CODE (new_op1)))
8907 && code != COMPARE)
8908 {
8909 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8910 (enum rtx_code) code,
8911 speed);
8912 return true;
8913 }
8914
8915 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8916
8917 if (speed)
8918 {
8919 if (VECTOR_MODE_P (mode))
8920 {
8921 /* Vector SUB. */
8922 *cost += extra_cost->vect.alu;
8923 }
8924 else if (GET_MODE_CLASS (mode) == MODE_INT)
8925 {
8926 /* SUB(S). */
8927 *cost += extra_cost->alu.arith;
8928 }
8929 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8930 {
8931 /* FSUB. */
8932 *cost += extra_cost->fp[mode == DFmode].addsub;
8933 }
8934 }
8935 return true;
8936 }
8937
8938 case PLUS:
8939 {
8940 rtx new_op0;
8941
8942 op0 = XEXP (x, 0);
8943 op1 = XEXP (x, 1);
8944
8945 cost_plus:
8946 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8947 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8948 {
8949 /* CSINC. */
8950 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8951 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8952 return true;
8953 }
8954
8955 if (GET_MODE_CLASS (mode) == MODE_INT
8956 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8957 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8958 {
8959 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8960
8961 if (speed)
8962 /* ADD (immediate). */
8963 *cost += extra_cost->alu.arith;
8964 return true;
8965 }
8966
8967 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8968
8969 /* Look for ADD (extended register). */
8970 if (is_a <scalar_int_mode> (mode, &int_mode)
8971 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8972 {
8973 if (speed)
8974 *cost += extra_cost->alu.extend_arith;
8975
8976 op0 = aarch64_strip_extend (op0, true);
8977 *cost += rtx_cost (op0, VOIDmode,
8978 (enum rtx_code) GET_CODE (op0), 0, speed);
8979 return true;
8980 }
8981
8982 /* Strip any extend, leave shifts behind as we will
8983 cost them through mult_cost. */
8984 new_op0 = aarch64_strip_extend (op0, false);
8985
8986 if (GET_CODE (new_op0) == MULT
8987 || aarch64_shift_p (GET_CODE (new_op0)))
8988 {
8989 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8990 speed);
8991 return true;
8992 }
8993
8994 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8995
8996 if (speed)
8997 {
8998 if (VECTOR_MODE_P (mode))
8999 {
9000 /* Vector ADD. */
9001 *cost += extra_cost->vect.alu;
9002 }
9003 else if (GET_MODE_CLASS (mode) == MODE_INT)
9004 {
9005 /* ADD. */
9006 *cost += extra_cost->alu.arith;
9007 }
9008 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9009 {
9010 /* FADD. */
9011 *cost += extra_cost->fp[mode == DFmode].addsub;
9012 }
9013 }
9014 return true;
9015 }
9016
9017 case BSWAP:
9018 *cost = COSTS_N_INSNS (1);
9019
9020 if (speed)
9021 {
9022 if (VECTOR_MODE_P (mode))
9023 *cost += extra_cost->vect.alu;
9024 else
9025 *cost += extra_cost->alu.rev;
9026 }
9027 return false;
9028
9029 case IOR:
9030 if (aarch_rev16_p (x))
9031 {
9032 *cost = COSTS_N_INSNS (1);
9033
9034 if (speed)
9035 {
9036 if (VECTOR_MODE_P (mode))
9037 *cost += extra_cost->vect.alu;
9038 else
9039 *cost += extra_cost->alu.rev;
9040 }
9041 return true;
9042 }
9043
9044 if (aarch64_extr_rtx_p (x, &op0, &op1))
9045 {
9046 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9047 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9048 if (speed)
9049 *cost += extra_cost->alu.shift;
9050
9051 return true;
9052 }
9053 /* Fall through. */
9054 case XOR:
9055 case AND:
9056 cost_logic:
9057 op0 = XEXP (x, 0);
9058 op1 = XEXP (x, 1);
9059
9060 if (VECTOR_MODE_P (mode))
9061 {
9062 if (speed)
9063 *cost += extra_cost->vect.alu;
9064 return true;
9065 }
9066
9067 if (code == AND
9068 && GET_CODE (op0) == MULT
9069 && CONST_INT_P (XEXP (op0, 1))
9070 && CONST_INT_P (op1)
9071 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9072 INTVAL (op1)) != 0)
9073 {
9074 /* This is a UBFM/SBFM. */
9075 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9076 if (speed)
9077 *cost += extra_cost->alu.bfx;
9078 return true;
9079 }
9080
9081 if (is_int_mode (mode, &int_mode))
9082 {
9083 if (CONST_INT_P (op1))
9084 {
9085 /* We have a mask + shift version of a UBFIZ
9086 i.e. the *andim_ashift<mode>_bfiz pattern. */
9087 if (GET_CODE (op0) == ASHIFT
9088 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9089 XEXP (op0, 1)))
9090 {
9091 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9092 (enum rtx_code) code, 0, speed);
9093 if (speed)
9094 *cost += extra_cost->alu.bfx;
9095
9096 return true;
9097 }
9098 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9099 {
9100 /* We possibly get the immediate for free; this is not
9101 modelled. */
9102 *cost += rtx_cost (op0, int_mode,
9103 (enum rtx_code) code, 0, speed);
9104 if (speed)
9105 *cost += extra_cost->alu.logical;
9106
9107 return true;
9108 }
9109 }
9110 else
9111 {
9112 rtx new_op0 = op0;
9113
9114 /* Handle ORN, EON, or BIC. */
9115 if (GET_CODE (op0) == NOT)
9116 op0 = XEXP (op0, 0);
9117
9118 new_op0 = aarch64_strip_shift (op0);
9119
9120 /* If we had a shift on op0 then this is a logical-shift-
9121 by-register/immediate operation. Otherwise, this is just
9122 a logical operation. */
9123 if (speed)
9124 {
9125 if (new_op0 != op0)
9126 {
9127 /* Shift by immediate. */
9128 if (CONST_INT_P (XEXP (op0, 1)))
9129 *cost += extra_cost->alu.log_shift;
9130 else
9131 *cost += extra_cost->alu.log_shift_reg;
9132 }
9133 else
9134 *cost += extra_cost->alu.logical;
9135 }
9136
9137 /* In both cases we want to cost both operands. */
9138 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9139 0, speed);
9140 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9141 1, speed);
9142
9143 return true;
9144 }
9145 }
9146 return false;
9147
9148 case NOT:
9149 x = XEXP (x, 0);
9150 op0 = aarch64_strip_shift (x);
9151
9152 if (VECTOR_MODE_P (mode))
9153 {
9154 /* Vector NOT. */
9155 *cost += extra_cost->vect.alu;
9156 return false;
9157 }
9158
9159 /* MVN-shifted-reg. */
9160 if (op0 != x)
9161 {
9162 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9163
9164 if (speed)
9165 *cost += extra_cost->alu.log_shift;
9166
9167 return true;
9168 }
9169 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9170 Handle the second form here taking care that 'a' in the above can
9171 be a shift. */
9172 else if (GET_CODE (op0) == XOR)
9173 {
9174 rtx newop0 = XEXP (op0, 0);
9175 rtx newop1 = XEXP (op0, 1);
9176 rtx op0_stripped = aarch64_strip_shift (newop0);
9177
9178 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9179 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9180
9181 if (speed)
9182 {
9183 if (op0_stripped != newop0)
9184 *cost += extra_cost->alu.log_shift;
9185 else
9186 *cost += extra_cost->alu.logical;
9187 }
9188
9189 return true;
9190 }
9191 /* MVN. */
9192 if (speed)
9193 *cost += extra_cost->alu.logical;
9194
9195 return false;
9196
9197 case ZERO_EXTEND:
9198
9199 op0 = XEXP (x, 0);
9200 /* If a value is written in SI mode, then zero extended to DI
9201 mode, the operation will in general be free as a write to
9202 a 'w' register implicitly zeroes the upper bits of an 'x'
9203 register. However, if this is
9204
9205 (set (reg) (zero_extend (reg)))
9206
9207 we must cost the explicit register move. */
9208 if (mode == DImode
9209 && GET_MODE (op0) == SImode
9210 && outer == SET)
9211 {
9212 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9213
9214 /* If OP_COST is non-zero, then the cost of the zero extend
9215 is effectively the cost of the inner operation. Otherwise
9216 we have a MOV instruction and we take the cost from the MOV
9217 itself. This is true independently of whether we are
9218 optimizing for space or time. */
9219 if (op_cost)
9220 *cost = op_cost;
9221
9222 return true;
9223 }
9224 else if (MEM_P (op0))
9225 {
9226 /* All loads can zero extend to any size for free. */
9227 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9228 return true;
9229 }
9230
9231 op0 = aarch64_extend_bitfield_pattern_p (x);
9232 if (op0)
9233 {
9234 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9235 if (speed)
9236 *cost += extra_cost->alu.bfx;
9237 return true;
9238 }
9239
9240 if (speed)
9241 {
9242 if (VECTOR_MODE_P (mode))
9243 {
9244 /* UMOV. */
9245 *cost += extra_cost->vect.alu;
9246 }
9247 else
9248 {
9249 /* We generate an AND instead of UXTB/UXTH. */
9250 *cost += extra_cost->alu.logical;
9251 }
9252 }
9253 return false;
9254
9255 case SIGN_EXTEND:
9256 if (MEM_P (XEXP (x, 0)))
9257 {
9258 /* LDRSH. */
9259 if (speed)
9260 {
9261 rtx address = XEXP (XEXP (x, 0), 0);
9262 *cost += extra_cost->ldst.load_sign_extend;
9263
9264 *cost +=
9265 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9266 0, speed));
9267 }
9268 return true;
9269 }
9270
9271 op0 = aarch64_extend_bitfield_pattern_p (x);
9272 if (op0)
9273 {
9274 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9275 if (speed)
9276 *cost += extra_cost->alu.bfx;
9277 return true;
9278 }
9279
9280 if (speed)
9281 {
9282 if (VECTOR_MODE_P (mode))
9283 *cost += extra_cost->vect.alu;
9284 else
9285 *cost += extra_cost->alu.extend;
9286 }
9287 return false;
9288
9289 case ASHIFT:
9290 op0 = XEXP (x, 0);
9291 op1 = XEXP (x, 1);
9292
9293 if (CONST_INT_P (op1))
9294 {
9295 if (speed)
9296 {
9297 if (VECTOR_MODE_P (mode))
9298 {
9299 /* Vector shift (immediate). */
9300 *cost += extra_cost->vect.alu;
9301 }
9302 else
9303 {
9304 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9305 aliases. */
9306 *cost += extra_cost->alu.shift;
9307 }
9308 }
9309
9310 /* We can incorporate zero/sign extend for free. */
9311 if (GET_CODE (op0) == ZERO_EXTEND
9312 || GET_CODE (op0) == SIGN_EXTEND)
9313 op0 = XEXP (op0, 0);
9314
9315 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9316 return true;
9317 }
9318 else
9319 {
9320 if (VECTOR_MODE_P (mode))
9321 {
9322 if (speed)
9323 /* Vector shift (register). */
9324 *cost += extra_cost->vect.alu;
9325 }
9326 else
9327 {
9328 if (speed)
9329 /* LSLV. */
9330 *cost += extra_cost->alu.shift_reg;
9331
9332 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9333 && CONST_INT_P (XEXP (op1, 1))
9334 && known_eq (INTVAL (XEXP (op1, 1)),
9335 GET_MODE_BITSIZE (mode) - 1))
9336 {
9337 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9338 /* We already demanded XEXP (op1, 0) to be REG_P, so
9339 don't recurse into it. */
9340 return true;
9341 }
9342 }
9343 return false; /* All arguments need to be in registers. */
9344 }
9345
9346 case ROTATE:
9347 case ROTATERT:
9348 case LSHIFTRT:
9349 case ASHIFTRT:
9350 op0 = XEXP (x, 0);
9351 op1 = XEXP (x, 1);
9352
9353 if (CONST_INT_P (op1))
9354 {
9355 /* ASR (immediate) and friends. */
9356 if (speed)
9357 {
9358 if (VECTOR_MODE_P (mode))
9359 *cost += extra_cost->vect.alu;
9360 else
9361 *cost += extra_cost->alu.shift;
9362 }
9363
9364 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9365 return true;
9366 }
9367 else
9368 {
9369 if (VECTOR_MODE_P (mode))
9370 {
9371 if (speed)
9372 /* Vector shift (register). */
9373 *cost += extra_cost->vect.alu;
9374 }
9375 else
9376 {
9377 if (speed)
9378 /* ASR (register) and friends. */
9379 *cost += extra_cost->alu.shift_reg;
9380
9381 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9382 && CONST_INT_P (XEXP (op1, 1))
9383 && known_eq (INTVAL (XEXP (op1, 1)),
9384 GET_MODE_BITSIZE (mode) - 1))
9385 {
9386 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9387 /* We already demanded XEXP (op1, 0) to be REG_P, so
9388 don't recurse into it. */
9389 return true;
9390 }
9391 }
9392 return false; /* All arguments need to be in registers. */
9393 }
9394
9395 case SYMBOL_REF:
9396
9397 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9398 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9399 {
9400 /* LDR. */
9401 if (speed)
9402 *cost += extra_cost->ldst.load;
9403 }
9404 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9405 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9406 {
9407 /* ADRP, followed by ADD. */
9408 *cost += COSTS_N_INSNS (1);
9409 if (speed)
9410 *cost += 2 * extra_cost->alu.arith;
9411 }
9412 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9413 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9414 {
9415 /* ADR. */
9416 if (speed)
9417 *cost += extra_cost->alu.arith;
9418 }
9419
9420 if (flag_pic)
9421 {
9422 /* One extra load instruction, after accessing the GOT. */
9423 *cost += COSTS_N_INSNS (1);
9424 if (speed)
9425 *cost += extra_cost->ldst.load;
9426 }
9427 return true;
9428
9429 case HIGH:
9430 case LO_SUM:
9431 /* ADRP/ADD (immediate). */
9432 if (speed)
9433 *cost += extra_cost->alu.arith;
9434 return true;
9435
9436 case ZERO_EXTRACT:
9437 case SIGN_EXTRACT:
9438 /* UBFX/SBFX. */
9439 if (speed)
9440 {
9441 if (VECTOR_MODE_P (mode))
9442 *cost += extra_cost->vect.alu;
9443 else
9444 *cost += extra_cost->alu.bfx;
9445 }
9446
9447 /* We can trust that the immediates used will be correct (there
9448 are no by-register forms), so we need only cost op0. */
9449 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9450 return true;
9451
9452 case MULT:
9453 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9454 /* aarch64_rtx_mult_cost always handles recursion to its
9455 operands. */
9456 return true;
9457
9458 case MOD:
9459 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9460 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9461 an unconditional negate. This case should only ever be reached through
9462 the set_smod_pow2_cheap check in expmed.c. */
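/* For SImode x % 16 the expansion is roughly (illustrative):
     negs  w1, w0
     and   w0, w0, 15
     and   w1, w1, 15
     csneg w0, w0, w1, mi  */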
9463 if (CONST_INT_P (XEXP (x, 1))
9464 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9465 && (mode == SImode || mode == DImode))
9466 {
9467 /* We expand to 4 instructions. Reset the baseline. */
9468 *cost = COSTS_N_INSNS (4);
9469
9470 if (speed)
9471 *cost += 2 * extra_cost->alu.logical
9472 + 2 * extra_cost->alu.arith;
9473
9474 return true;
9475 }
9476
9477 /* Fall-through. */
9478 case UMOD:
9479 if (speed)
9480 {
9481 /* Slightly prefer UMOD over SMOD. */
9482 if (VECTOR_MODE_P (mode))
9483 *cost += extra_cost->vect.alu;
9484 else if (GET_MODE_CLASS (mode) == MODE_INT)
9485 *cost += (extra_cost->mult[mode == DImode].add
9486 + extra_cost->mult[mode == DImode].idiv
9487 + (code == MOD ? 1 : 0));
9488 }
9489 return false; /* All arguments need to be in registers. */
9490
9491 case DIV:
9492 case UDIV:
9493 case SQRT:
9494 if (speed)
9495 {
9496 if (VECTOR_MODE_P (mode))
9497 *cost += extra_cost->vect.alu;
9498 else if (GET_MODE_CLASS (mode) == MODE_INT)
9499 /* There is no integer SQRT, so only DIV and UDIV can get
9500 here. */
9501 *cost += (extra_cost->mult[mode == DImode].idiv
9502 /* Slightly prefer UDIV over SDIV. */
9503 + (code == DIV ? 1 : 0));
9504 else
9505 *cost += extra_cost->fp[mode == DFmode].div;
9506 }
9507 return false; /* All arguments need to be in registers. */
9508
9509 case IF_THEN_ELSE:
9510 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9511 XEXP (x, 2), cost, speed);
9512
9513 case EQ:
9514 case NE:
9515 case GT:
9516 case GTU:
9517 case LT:
9518 case LTU:
9519 case GE:
9520 case GEU:
9521 case LE:
9522 case LEU:
9523
9524 return false; /* All arguments must be in registers. */
9525
9526 case FMA:
9527 op0 = XEXP (x, 0);
9528 op1 = XEXP (x, 1);
9529 op2 = XEXP (x, 2);
9530
9531 if (speed)
9532 {
9533 if (VECTOR_MODE_P (mode))
9534 *cost += extra_cost->vect.alu;
9535 else
9536 *cost += extra_cost->fp[mode == DFmode].fma;
9537 }
9538
9539 /* FMSUB, FNMADD, and FNMSUB are free. */
9540 if (GET_CODE (op0) == NEG)
9541 op0 = XEXP (op0, 0);
9542
9543 if (GET_CODE (op2) == NEG)
9544 op2 = XEXP (op2, 0);
9545
9546 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9547 and the by-element operand as operand 0. */
9548 if (GET_CODE (op1) == NEG)
9549 op1 = XEXP (op1, 0);
9550
9551 /* Catch vector-by-element operations. The by-element operand can
9552 either be (vec_duplicate (vec_select (x))) or just
9553 (vec_select (x)), depending on whether we are multiplying by
9554 a vector or a scalar.
9555
9556 Canonicalization is not very good in these cases: FMA4 will put the
9557 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9558 if (GET_CODE (op0) == VEC_DUPLICATE)
9559 op0 = XEXP (op0, 0);
9560 else if (GET_CODE (op1) == VEC_DUPLICATE)
9561 op1 = XEXP (op1, 0);
9562
9563 if (GET_CODE (op0) == VEC_SELECT)
9564 op0 = XEXP (op0, 0);
9565 else if (GET_CODE (op1) == VEC_SELECT)
9566 op1 = XEXP (op1, 0);
9567
9568 /* If the remaining parameters are not registers,
9569 get the cost to put them into registers. */
9570 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9571 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9572 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9573 return true;
9574
9575 case FLOAT:
9576 case UNSIGNED_FLOAT:
9577 if (speed)
9578 *cost += extra_cost->fp[mode == DFmode].fromint;
9579 return false;
9580
9581 case FLOAT_EXTEND:
9582 if (speed)
9583 {
9584 if (VECTOR_MODE_P (mode))
9585 {
9586 /* Vector widening conversion. */
9587 *cost += extra_cost->vect.alu;
9588 }
9589 else
9590 *cost += extra_cost->fp[mode == DFmode].widen;
9591 }
9592 return false;
9593
9594 case FLOAT_TRUNCATE:
9595 if (speed)
9596 {
9597 if (VECTOR_MODE_P (mode))
9598 {
9599 /* Vector narrowing conversion. */
9600 *cost += extra_cost->vect.alu;
9601 }
9602 else
9603 *cost += extra_cost->fp[mode == DFmode].narrow;
9604 }
9605 return false;
9606
9607 case FIX:
9608 case UNSIGNED_FIX:
9609 x = XEXP (x, 0);
9610 /* Strip the rounding part. They will all be implemented
9611 by the fcvt* family of instructions anyway. */
9612 if (GET_CODE (x) == UNSPEC)
9613 {
9614 unsigned int uns_code = XINT (x, 1);
9615
9616 if (uns_code == UNSPEC_FRINTA
9617 || uns_code == UNSPEC_FRINTM
9618 || uns_code == UNSPEC_FRINTN
9619 || uns_code == UNSPEC_FRINTP
9620 || uns_code == UNSPEC_FRINTZ)
9621 x = XVECEXP (x, 0, 0);
9622 }
9623
9624 if (speed)
9625 {
9626 if (VECTOR_MODE_P (mode))
9627 *cost += extra_cost->vect.alu;
9628 else
9629 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9630 }
9631
9632 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9633 fixed-point fcvt. */
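/* For example (illustrative), (fix:SI (mult:SF x (const_double 2^16))) can
   become a single FCVTZS with 16 fractional bits rather than an FMUL
   followed by an FCVTZS.  */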
9634 if (GET_CODE (x) == MULT
9635 && ((VECTOR_MODE_P (mode)
9636 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9637 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9638 {
9639 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9640 0, speed);
9641 return true;
9642 }
9643
9644 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9645 return true;
9646
9647 case ABS:
9648 if (VECTOR_MODE_P (mode))
9649 {
9650 /* ABS (vector). */
9651 if (speed)
9652 *cost += extra_cost->vect.alu;
9653 }
9654 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9655 {
9656 op0 = XEXP (x, 0);
9657
9658 /* FABD, which is analogous to FADD. */
9659 if (GET_CODE (op0) == MINUS)
9660 {
9661 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9662 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9663 if (speed)
9664 *cost += extra_cost->fp[mode == DFmode].addsub;
9665
9666 return true;
9667 }
9668 /* Simple FABS is analogous to FNEG. */
9669 if (speed)
9670 *cost += extra_cost->fp[mode == DFmode].neg;
9671 }
9672 else
9673 {
9674 /* Integer ABS will either be split to
9675 two arithmetic instructions, or will be an ABS
9676 (scalar), which we don't model. */
9677 *cost = COSTS_N_INSNS (2);
9678 if (speed)
9679 *cost += 2 * extra_cost->alu.arith;
9680 }
9681 return false;
9682
9683 case SMAX:
9684 case SMIN:
9685 if (speed)
9686 {
9687 if (VECTOR_MODE_P (mode))
9688 *cost += extra_cost->vect.alu;
9689 else
9690 {
9691 /* FMAXNM/FMINNM/FMAX/FMIN.
9692 TODO: This may not be accurate for all implementations, but
9693 we do not model this in the cost tables. */
9694 *cost += extra_cost->fp[mode == DFmode].addsub;
9695 }
9696 }
9697 return false;
9698
9699 case UNSPEC:
9700 /* The floating point round to integer frint* instructions. */
9701 if (aarch64_frint_unspec_p (XINT (x, 1)))
9702 {
9703 if (speed)
9704 *cost += extra_cost->fp[mode == DFmode].roundint;
9705
9706 return false;
9707 }
9708
9709 if (XINT (x, 1) == UNSPEC_RBIT)
9710 {
9711 if (speed)
9712 *cost += extra_cost->alu.rev;
9713
9714 return false;
9715 }
9716 break;
9717
9718 case TRUNCATE:
9719
9720 /* Decompose <su>muldi3_highpart. */
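/* That is, an rtx of the overall (illustrative) shape
     (truncate:DI
       (lshiftrt:TI
	 (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
	 (const_int 64)))
   as spelled out piecewise in the condition below.  */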
9721 if (/* (truncate:DI */
9722 mode == DImode
9723 /* (lshiftrt:TI */
9724 && GET_MODE (XEXP (x, 0)) == TImode
9725 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9726 /* (mult:TI */
9727 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9728 /* (ANY_EXTEND:TI (reg:DI))
9729 (ANY_EXTEND:TI (reg:DI))) */
9730 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9731 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9732 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9733 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9734 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9735 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9736 /* (const_int 64) */
9737 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9738 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9739 {
9740 /* UMULH/SMULH. */
9741 if (speed)
9742 *cost += extra_cost->mult[mode == DImode].extend;
9743 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9744 mode, MULT, 0, speed);
9745 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9746 mode, MULT, 1, speed);
9747 return true;
9748 }
9749
9750 /* Fall through. */
9751 default:
9752 break;
9753 }
9754
9755 if (dump_file
9756 && flag_aarch64_verbose_cost)
9757 fprintf (dump_file,
9758 "\nFailed to cost RTX. Assuming default cost.\n");
9759
9760 return true;
9761 }
9762
9763 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
9764 calculated for X. This cost is stored in *COST. Returns true
9765 if the total cost of X was calculated. */
9766 static bool
9767 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9768 int param, int *cost, bool speed)
9769 {
9770 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9771
9772 if (dump_file
9773 && flag_aarch64_verbose_cost)
9774 {
9775 print_rtl_single (dump_file, x);
9776 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9777 speed ? "Hot" : "Cold",
9778 *cost, result ? "final" : "partial");
9779 }
9780
9781 return result;
9782 }
9783
9784 static int
9785 aarch64_register_move_cost (machine_mode mode,
9786 reg_class_t from_i, reg_class_t to_i)
9787 {
9788 enum reg_class from = (enum reg_class) from_i;
9789 enum reg_class to = (enum reg_class) to_i;
9790 const struct cpu_regmove_cost *regmove_cost
9791 = aarch64_tune_params.regmove_cost;
9792
9793 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9794 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9795 to = GENERAL_REGS;
9796
9797 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9798 from = GENERAL_REGS;
9799
9800 /* Moving between GPR and stack cost is the same as GP2GP. */
9801 if ((from == GENERAL_REGS && to == STACK_REG)
9802 || (to == GENERAL_REGS && from == STACK_REG))
9803 return regmove_cost->GP2GP;
9804
9805 /* To/From the stack register, we move via the gprs. */
9806 if (to == STACK_REG || from == STACK_REG)
9807 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9808 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9809
9810 if (known_eq (GET_MODE_SIZE (mode), 16))
9811 {
9812 /* 128-bit operations on general registers require 2 instructions. */
9813 if (from == GENERAL_REGS && to == GENERAL_REGS)
9814 return regmove_cost->GP2GP * 2;
9815 else if (from == GENERAL_REGS)
9816 return regmove_cost->GP2FP * 2;
9817 else if (to == GENERAL_REGS)
9818 return regmove_cost->FP2GP * 2;
9819
9820 /* When AdvSIMD instructions are disabled it is not possible to move
9821 a 128-bit value directly between Q registers. This is handled in
9822 secondary reload. A general register is used as a scratch to move
9823 the upper DI value and the lower DI value is moved directly,
9824 hence the cost is the sum of three moves. */
9825 if (! TARGET_SIMD)
9826 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9827
9828 return regmove_cost->FP2FP;
9829 }
9830
9831 if (from == GENERAL_REGS && to == GENERAL_REGS)
9832 return regmove_cost->GP2GP;
9833 else if (from == GENERAL_REGS)
9834 return regmove_cost->GP2FP;
9835 else if (to == GENERAL_REGS)
9836 return regmove_cost->FP2GP;
9837
9838 return regmove_cost->FP2FP;
9839 }
9840
9841 static int
9842 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9843 reg_class_t rclass ATTRIBUTE_UNUSED,
9844 bool in ATTRIBUTE_UNUSED)
9845 {
9846 return aarch64_tune_params.memmov_cost;
9847 }
9848
9849 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9850 to optimize 1.0/sqrt. */
9851
9852 static bool
9853 use_rsqrt_p (machine_mode mode)
9854 {
9855 return (!flag_trapping_math
9856 && flag_unsafe_math_optimizations
9857 && ((aarch64_tune_params.approx_modes->recip_sqrt
9858 & AARCH64_APPROX_MODE (mode))
9859 || flag_mrecip_low_precision_sqrt));
9860 }
9861
9862 /* Function to decide when to use the approximate reciprocal square root
9863 builtin. */
9864
9865 static tree
9866 aarch64_builtin_reciprocal (tree fndecl)
9867 {
9868 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9869
9870 if (!use_rsqrt_p (mode))
9871 return NULL_TREE;
9872 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9873 }
9874
9875 typedef rtx (*rsqrte_type) (rtx, rtx);
9876
9877 /* Select reciprocal square root initial estimate insn depending on machine
9878 mode. */
9879
9880 static rsqrte_type
9881 get_rsqrte_type (machine_mode mode)
9882 {
9883 switch (mode)
9884 {
9885 case E_DFmode: return gen_aarch64_rsqrtedf;
9886 case E_SFmode: return gen_aarch64_rsqrtesf;
9887 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9888 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9889 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9890 default: gcc_unreachable ();
9891 }
9892 }
9893
9894 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9895
9896 /* Select reciprocal square root series step insn depending on machine mode. */
9897
9898 static rsqrts_type
9899 get_rsqrts_type (machine_mode mode)
9900 {
9901 switch (mode)
9902 {
9903 case E_DFmode: return gen_aarch64_rsqrtsdf;
9904 case E_SFmode: return gen_aarch64_rsqrtssf;
9905 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9906 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9907 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9908 default: gcc_unreachable ();
9909 }
9910 }
9911
9912 /* Emit instruction sequence to compute either the approximate square root
9913 or its approximate reciprocal, depending on the flag RECP, and return
9914 whether the sequence was emitted or not. */
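/* A sketch of the math used below, assuming the architectural definitions of
   FRSQRTE/FRSQRTS: FRSQRTE provides an initial estimate x0 of 1/sqrt(SRC),
   and each FRSQRTS step refines it with the Newton-Raphson iteration
     x_{n+1} = x_n * (3 - SRC * x_n * x_n) / 2,
   which is what the X2 and X1 temporaries compute.  */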
9915
9916 bool
9917 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9918 {
9919 machine_mode mode = GET_MODE (dst);
9920
9921 if (GET_MODE_INNER (mode) == HFmode)
9922 {
9923 gcc_assert (!recp);
9924 return false;
9925 }
9926
9927 if (!recp)
9928 {
9929 if (!(flag_mlow_precision_sqrt
9930 || (aarch64_tune_params.approx_modes->sqrt
9931 & AARCH64_APPROX_MODE (mode))))
9932 return false;
9933
9934 if (flag_finite_math_only
9935 || flag_trapping_math
9936 || !flag_unsafe_math_optimizations
9937 || optimize_function_for_size_p (cfun))
9938 return false;
9939 }
9940 else
9941 /* Caller assumes we cannot fail. */
9942 gcc_assert (use_rsqrt_p (mode));
9943
9944 machine_mode mmsk = mode_for_int_vector (mode).require ();
9945 rtx xmsk = gen_reg_rtx (mmsk);
9946 if (!recp)
9947 /* When calculating the approximate square root, compare the
9948 argument with 0.0 and create a mask. */
9949 emit_insn (gen_rtx_SET (xmsk,
9950 gen_rtx_NEG (mmsk,
9951 gen_rtx_EQ (mmsk, src,
9952 CONST0_RTX (mode)))));
9953
9954 /* Estimate the approximate reciprocal square root. */
9955 rtx xdst = gen_reg_rtx (mode);
9956 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9957
9958 /* Iterate over the series twice for SF and thrice for DF. */
9959 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9960
9961 /* Optionally iterate over the series once less for faster performance,
9962 at the cost of some accuracy. */
9963 if ((recp && flag_mrecip_low_precision_sqrt)
9964 || (!recp && flag_mlow_precision_sqrt))
9965 iterations--;
9966
9967 /* Iterate over the series to calculate the approximate reciprocal square
9968 root. */
9969 rtx x1 = gen_reg_rtx (mode);
9970 while (iterations--)
9971 {
9972 rtx x2 = gen_reg_rtx (mode);
9973 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9974
9975 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9976
9977 if (iterations > 0)
9978 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9979 }
9980
9981 if (!recp)
9982 {
9983 /* Qualify the approximate reciprocal square root when the argument is
9984 0.0 by squashing the intermediary result to 0.0. */
9985 rtx xtmp = gen_reg_rtx (mmsk);
9986 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9987 gen_rtx_SUBREG (mmsk, xdst, 0)));
9988 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9989
9990 /* Calculate the approximate square root. */
9991 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9992 }
9993
9994 /* Finalize the approximation. */
9995 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9996
9997 return true;
9998 }
9999
10000 typedef rtx (*recpe_type) (rtx, rtx);
10001
10002 /* Select reciprocal initial estimate insn depending on machine mode. */
10003
10004 static recpe_type
10005 get_recpe_type (machine_mode mode)
10006 {
10007 switch (mode)
10008 {
10009 case E_SFmode: return (gen_aarch64_frecpesf);
10010 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
10011 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
10012 case E_DFmode: return (gen_aarch64_frecpedf);
10013 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10014 default: gcc_unreachable ();
10015 }
10016 }
10017
10018 typedef rtx (*recps_type) (rtx, rtx, rtx);
10019
10020 /* Select reciprocal series step insn depending on machine mode. */
10021
10022 static recps_type
10023 get_recps_type (machine_mode mode)
10024 {
10025 switch (mode)
10026 {
10027 case E_SFmode: return (gen_aarch64_frecpssf);
10028 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10029 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10030 case E_DFmode: return (gen_aarch64_frecpsdf);
10031 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10032 default: gcc_unreachable ();
10033 }
10034 }
10035
10036 /* Emit the instruction sequence to compute the approximation for the division
10037 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
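/* A sketch of the math used below, assuming the architectural definitions of
   FRECPE/FRECPS: FRECPE provides an initial estimate x0 of 1/DEN, and each
   FRECPS step refines it with the Newton-Raphson iteration
     x_{n+1} = x_n * (2 - DEN * x_n);
   the quotient is then formed as NUM * (1/DEN).  */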
10038
10039 bool
10040 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10041 {
10042 machine_mode mode = GET_MODE (quo);
10043
10044 if (GET_MODE_INNER (mode) == HFmode)
10045 return false;
10046
10047 bool use_approx_division_p = (flag_mlow_precision_div
10048 || (aarch64_tune_params.approx_modes->division
10049 & AARCH64_APPROX_MODE (mode)));
10050
10051 if (!flag_finite_math_only
10052 || flag_trapping_math
10053 || !flag_unsafe_math_optimizations
10054 || optimize_function_for_size_p (cfun)
10055 || !use_approx_division_p)
10056 return false;
10057
10058 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10059 return false;
10060
10061 /* Estimate the approximate reciprocal. */
10062 rtx xrcp = gen_reg_rtx (mode);
10063 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10064
10065 /* Iterate over the series twice for SF and thrice for DF. */
10066 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10067
10068 /* Optionally iterate over the series once less for faster performance,
10069 at the cost of some accuracy. */
10070 if (flag_mlow_precision_div)
10071 iterations--;
10072
10073 /* Iterate over the series to calculate the approximate reciprocal. */
10074 rtx xtmp = gen_reg_rtx (mode);
10075 while (iterations--)
10076 {
10077 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10078
10079 if (iterations > 0)
10080 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10081 }
10082
10083 if (num != CONST1_RTX (mode))
10084 {
10085 /* As the approximate reciprocal of DEN is already calculated, only
10086 calculate the approximate division when NUM is not 1.0. */
10087 rtx xnum = force_reg (mode, num);
10088 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10089 }
10090
10091 /* Finalize the approximation. */
10092 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10093 return true;
10094 }
10095
10096 /* Return the number of instructions that can be issued per cycle. */
10097 static int
10098 aarch64_sched_issue_rate (void)
10099 {
10100 return aarch64_tune_params.issue_rate;
10101 }
10102
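/* Return the DFA lookahead depth for multipass scheduling: the issue rate
   when it exceeds one and we are not scheduling for insn fusion, otherwise 0
   (no lookahead).  */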
10103 static int
10104 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10105 {
10106 int issue_rate = aarch64_sched_issue_rate ();
10107
10108 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10109 }
10110
10111
10112 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10113 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10114 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10115
10116 static int
10117 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10118 int ready_index)
10119 {
10120 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10121 }
10122
10123
10124 /* Vectorizer cost model target hooks. */
10125
10126 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10127 static int
10128 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10129 tree vectype,
10130 int misalign ATTRIBUTE_UNUSED)
10131 {
10132 unsigned elements;
10133 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10134 bool fp = false;
10135
10136 if (vectype != NULL)
10137 fp = FLOAT_TYPE_P (vectype);
10138
10139 switch (type_of_cost)
10140 {
10141 case scalar_stmt:
10142 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10143
10144 case scalar_load:
10145 return costs->scalar_load_cost;
10146
10147 case scalar_store:
10148 return costs->scalar_store_cost;
10149
10150 case vector_stmt:
10151 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10152
10153 case vector_load:
10154 return costs->vec_align_load_cost;
10155
10156 case vector_store:
10157 return costs->vec_store_cost;
10158
10159 case vec_to_scalar:
10160 return costs->vec_to_scalar_cost;
10161
10162 case scalar_to_vec:
10163 return costs->scalar_to_vec_cost;
10164
10165 case unaligned_load:
10166 case vector_gather_load:
10167 return costs->vec_unalign_load_cost;
10168
10169 case unaligned_store:
10170 case vector_scatter_store:
10171 return costs->vec_unalign_store_cost;
10172
10173 case cond_branch_taken:
10174 return costs->cond_taken_branch_cost;
10175
10176 case cond_branch_not_taken:
10177 return costs->cond_not_taken_branch_cost;
10178
10179 case vec_perm:
10180 return costs->vec_permute_cost;
10181
10182 case vec_promote_demote:
10183 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10184
10185 case vec_construct:
10186 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10187 return elements / 2 + 1;
10188
10189 default:
10190 gcc_unreachable ();
10191 }
10192 }
10193
10194 /* Implement targetm.vectorize.add_stmt_cost. */
10195 static unsigned
10196 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10197 struct _stmt_vec_info *stmt_info, int misalign,
10198 enum vect_cost_model_location where)
10199 {
10200 unsigned *cost = (unsigned *) data;
10201 unsigned retval = 0;
10202
10203 if (flag_vect_cost_model)
10204 {
10205 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10206 int stmt_cost =
10207 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10208
10209 /* Statements in an inner loop relative to the loop being
10210 vectorized are weighted more heavily. The value here is
10211 arbitrary and could potentially be improved with analysis. */
10212 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10213 count *= 50; /* FIXME */
10214
10215 retval = (unsigned) (count * stmt_cost);
10216 cost[where] += retval;
10217 }
10218
10219 return retval;
10220 }
10221
10222 static void initialize_aarch64_code_model (struct gcc_options *);
10223
10224 /* Parse the TO_PARSE string and put the architecture struct that it
10225 selects into RES and the architectural features into ISA_FLAGS.
10226 Return an aarch64_parse_opt_result describing the parse result.
10227 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10228
10229 static enum aarch64_parse_opt_result
10230 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10231 unsigned long *isa_flags)
10232 {
10233 char *ext;
10234 const struct processor *arch;
10235 char *str = (char *) alloca (strlen (to_parse) + 1);
10236 size_t len;
10237
10238 strcpy (str, to_parse);
10239
10240 ext = strchr (str, '+');
10241
10242 if (ext != NULL)
10243 len = ext - str;
10244 else
10245 len = strlen (str);
10246
10247 if (len == 0)
10248 return AARCH64_PARSE_MISSING_ARG;
10249
10250
10251 /* Loop through the list of supported ARCHes to find a match. */
10252 for (arch = all_architectures; arch->name != NULL; arch++)
10253 {
10254 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10255 {
10256 unsigned long isa_temp = arch->flags;
10257
10258 if (ext != NULL)
10259 {
10260 /* TO_PARSE string contains at least one extension. */
10261 enum aarch64_parse_opt_result ext_res
10262 = aarch64_parse_extension (ext, &isa_temp);
10263
10264 if (ext_res != AARCH64_PARSE_OK)
10265 return ext_res;
10266 }
10267 /* Extension parsing was successful. Confirm the result
10268 arch and ISA flags. */
10269 *res = arch;
10270 *isa_flags = isa_temp;
10271 return AARCH64_PARSE_OK;
10272 }
10273 }
10274
10275 /* ARCH name not found in list. */
10276 return AARCH64_PARSE_INVALID_ARG;
10277 }
10278
10279 /* Parse the TO_PARSE string and put the result tuning in RES and the
10280 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10281 describing the parse result. If there is an error parsing, RES and
10282 ISA_FLAGS are left unchanged. */
10283
10284 static enum aarch64_parse_opt_result
10285 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10286 unsigned long *isa_flags)
10287 {
10288 char *ext;
10289 const struct processor *cpu;
10290 char *str = (char *) alloca (strlen (to_parse) + 1);
10291 size_t len;
10292
10293 strcpy (str, to_parse);
10294
10295 ext = strchr (str, '+');
10296
10297 if (ext != NULL)
10298 len = ext - str;
10299 else
10300 len = strlen (str);
10301
10302 if (len == 0)
10303 return AARCH64_PARSE_MISSING_ARG;
10304
10305
10306 /* Loop through the list of supported CPUs to find a match. */
10307 for (cpu = all_cores; cpu->name != NULL; cpu++)
10308 {
10309 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10310 {
10311 unsigned long isa_temp = cpu->flags;
10312
10313
10314 if (ext != NULL)
10315 {
10316 /* TO_PARSE string contains at least one extension. */
10317 enum aarch64_parse_opt_result ext_res
10318 = aarch64_parse_extension (ext, &isa_temp);
10319
10320 if (ext_res != AARCH64_PARSE_OK)
10321 return ext_res;
10322 }
10323 /* Extension parsing was successful. Confirm the result
10324 cpu and ISA flags. */
10325 *res = cpu;
10326 *isa_flags = isa_temp;
10327 return AARCH64_PARSE_OK;
10328 }
10329 }
10330
10331 /* CPU name not found in list. */
10332 return AARCH64_PARSE_INVALID_ARG;
10333 }
10334
10335 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10336 Return an aarch64_parse_opt_result describing the parse result.
10337 If the parsing fails the RES does not change. */
10338
10339 static enum aarch64_parse_opt_result
10340 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10341 {
10342 const struct processor *cpu;
10343 char *str = (char *) alloca (strlen (to_parse) + 1);
10344
10345 strcpy (str, to_parse);
10346
10347 /* Loop through the list of supported CPUs to find a match. */
10348 for (cpu = all_cores; cpu->name != NULL; cpu++)
10349 {
10350 if (strcmp (cpu->name, str) == 0)
10351 {
10352 *res = cpu;
10353 return AARCH64_PARSE_OK;
10354 }
10355 }
10356
10357 /* CPU name not found in list. */
10358 return AARCH64_PARSE_INVALID_ARG;
10359 }
10360
10361 /* Parse TOKEN, which has length LENGTH to see if it is an option
10362 described in FLAG. If it is, return the index bit for that fusion type.
10363 If not, error (printing OPTION_NAME) and return zero. */
10364
10365 static unsigned int
10366 aarch64_parse_one_option_token (const char *token,
10367 size_t length,
10368 const struct aarch64_flag_desc *flag,
10369 const char *option_name)
10370 {
10371 for (; flag->name != NULL; flag++)
10372 {
10373 if (length == strlen (flag->name)
10374 && !strncmp (flag->name, token, length))
10375 return flag->flag;
10376 }
10377
10378 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10379 return 0;
10380 }
10381
10382 /* Parse OPTION which is a comma-separated list of flags to enable.
10383 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10384 default state we inherit from the CPU tuning structures. OPTION_NAME
10385 gives the top-level option we are parsing in the -moverride string,
10386 for use in error messages. */
10387
10388 static unsigned int
10389 aarch64_parse_boolean_options (const char *option,
10390 const struct aarch64_flag_desc *flags,
10391 unsigned int initial_state,
10392 const char *option_name)
10393 {
10394 const char separator = '.';
10395 const char* specs = option;
10396 const char* ntoken = option;
10397 unsigned int found_flags = initial_state;
10398
10399 while ((ntoken = strchr (specs, separator)))
10400 {
10401 size_t token_length = ntoken - specs;
10402 unsigned token_ops = aarch64_parse_one_option_token (specs,
10403 token_length,
10404 flags,
10405 option_name);
10406 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10407 in the token stream, reset the supported operations. So:
10408
10409 adrp+add.cmp+branch.none.adrp+add
10410
10411 would have the result of turning on only adrp+add fusion. */
10412 if (!token_ops)
10413 found_flags = 0;
10414
10415 found_flags |= token_ops;
10416 specs = ++ntoken;
10417 }
10418
10419 /* If the string ended with a separator it is ill-formed; report it. */
10420 if (!(*specs))
10421 {
10422 error ("%s string ill-formed\n", option_name);
10423 return 0;
10424 }
10425
10426 /* We still have one more token to parse. */
10427 size_t token_length = strlen (specs);
10428 unsigned token_ops = aarch64_parse_one_option_token (specs,
10429 token_length,
10430 flags,
10431 option_name);
10432 if (!token_ops)
10433 found_flags = 0;
10434
10435 found_flags |= token_ops;
10436 return found_flags;
10437 }
10438
10439 /* Support for overriding instruction fusion. */
10440
10441 static void
10442 aarch64_parse_fuse_string (const char *fuse_string,
10443 struct tune_params *tune)
10444 {
10445 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10446 aarch64_fusible_pairs,
10447 tune->fusible_ops,
10448 "fuse=");
10449 }
10450
10451 /* Support for overriding other tuning flags. */
10452
10453 static void
10454 aarch64_parse_tune_string (const char *tune_string,
10455 struct tune_params *tune)
10456 {
10457 tune->extra_tuning_flags
10458 = aarch64_parse_boolean_options (tune_string,
10459 aarch64_tuning_flags,
10460 tune->extra_tuning_flags,
10461 "tune=");
10462 }
10463
10464 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10465 we understand. If it is, extract the option string and handoff to
10466 the appropriate function. */
10467
10468 void
10469 aarch64_parse_one_override_token (const char* token,
10470 size_t length,
10471 struct tune_params *tune)
10472 {
10473 const struct aarch64_tuning_override_function *fn
10474 = aarch64_tuning_override_functions;
10475
10476 const char *option_part = strchr (token, '=');
10477 if (!option_part)
10478 {
10479 error ("tuning string missing in option (%s)", token);
10480 return;
10481 }
10482
10483 /* Get the length of the option name. */
10484 length = option_part - token;
10485 /* Skip the '=' to get to the option string. */
10486 option_part++;
10487
10488 for (; fn->name != NULL; fn++)
10489 {
10490 if (!strncmp (fn->name, token, length))
10491 {
10492 fn->parse_override (option_part, tune);
10493 return;
10494 }
10495 }
10496
10497 error ("unknown tuning option (%s)",token);
10498 return;
10499 }
10500
10501 /* Validate the requested TLS size, clamping it to what the selected code model supports. */
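/* For example (illustrative), an explicit -mtls-size=32 is clamped back to 24
   under -mcmodel=tiny below.  */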
10502
10503 static void
10504 initialize_aarch64_tls_size (struct gcc_options *opts)
10505 {
10506 if (aarch64_tls_size == 0)
10507 aarch64_tls_size = 24;
10508
10509 switch (opts->x_aarch64_cmodel_var)
10510 {
10511 case AARCH64_CMODEL_TINY:
10512 /* Both the default and the maximum TLS size allowed under tiny are 1M,
10513 which needs two instructions to address, so we clamp the size to 24 bits. */
10514 if (aarch64_tls_size > 24)
10515 aarch64_tls_size = 24;
10516 break;
10517 case AARCH64_CMODEL_SMALL:
10518 /* The maximum TLS size allowed under small is 4G. */
10519 if (aarch64_tls_size > 32)
10520 aarch64_tls_size = 32;
10521 break;
10522 case AARCH64_CMODEL_LARGE:
10523 /* The maximum TLS size allowed under large is 16E.
10524 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10525 if (aarch64_tls_size > 48)
10526 aarch64_tls_size = 48;
10527 break;
10528 default:
10529 gcc_unreachable ();
10530 }
10531
10532 return;
10533 }
10534
10535 /* Parse STRING looking for options in the format:
10536 string :: option:string
10537 option :: name=substring
10538 name :: {a-z}
10539 substring :: defined by option. */
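/* For example (illustrative), -moverride=fuse=adrp+add.cmp+branch selects the
   fuse= handler with the substring "adrp+add.cmp+branch".  */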
10540
10541 static void
10542 aarch64_parse_override_string (const char* input_string,
10543 struct tune_params* tune)
10544 {
10545 const char separator = ':';
10546 size_t string_length = strlen (input_string) + 1;
10547 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10548 char *string = string_root;
10549 strncpy (string, input_string, string_length);
10550 string[string_length - 1] = '\0';
10551
10552 char* ntoken = string;
10553
10554 while ((ntoken = strchr (string, separator)))
10555 {
10556 size_t token_length = ntoken - string;
10557 /* Null-terminate this substring so it can be handled as a string in its own right. */
10558 *ntoken = '\0';
10559 aarch64_parse_one_override_token (string, token_length, tune);
10560 string = ++ntoken;
10561 }
10562
10563 /* One last option to parse. */
10564 aarch64_parse_one_override_token (string, strlen (string), tune);
10565 free (string_root);
10566 }
10567
10568
10569 static void
10570 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10571 {
10572 /* PR 70044: We have to be careful about being called multiple times for the
10573 same function. This means all changes should be repeatable. */
10574
10575 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10576 Disable the frame pointer flag so the mid-end will not use a frame
10577 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10578 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10579 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10580 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10581 if (opts->x_flag_omit_frame_pointer == 0)
10582 opts->x_flag_omit_frame_pointer = 2;
10583
10584 /* If not optimizing for size, set the default
10585 alignment to what the target wants. */
10586 if (!opts->x_optimize_size)
10587 {
10588 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10589 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10590 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10591 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10592 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10593 opts->x_str_align_functions = aarch64_tune_params.function_align;
10594 }
10595
10596 /* We default to no pc-relative literal loads. */
10597
10598 aarch64_pcrelative_literal_loads = false;
10599
10600 /* If -mpc-relative-literal-loads is set on the command line, this
10601 implies that the user asked for PC relative literal loads. */
10602 if (opts->x_pcrelative_literal_loads == 1)
10603 aarch64_pcrelative_literal_loads = true;
10604
10605 /* In the tiny memory model it makes no sense to disallow PC relative
10606 literal pool loads. */
10607 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10608 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10609 aarch64_pcrelative_literal_loads = true;
10610
10611 /* When enabling the lower precision Newton series for the square root, also
10612 enable it for the reciprocal square root, since the latter is an
10613 intermediary step for the former. */
10614 if (flag_mlow_precision_sqrt)
10615 flag_mrecip_low_precision_sqrt = true;
10616 }
10617
10618 /* 'Unpack' the internal tuning structs and update the options
10619 in OPTS. The caller must have set up selected_tune and selected_arch
10620 as all the other target-specific codegen decisions are
10621 derived from them. */
10622
10623 void
10624 aarch64_override_options_internal (struct gcc_options *opts)
10625 {
10626 aarch64_tune_flags = selected_tune->flags;
10627 aarch64_tune = selected_tune->sched_core;
10628 /* Make a copy of the tuning parameters attached to the core, which
10629 we may later overwrite. */
10630 aarch64_tune_params = *(selected_tune->tune);
10631 aarch64_architecture_version = selected_arch->architecture_version;
10632
10633 if (opts->x_aarch64_override_tune_string)
10634 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10635 &aarch64_tune_params);
10636
10637 /* This target defaults to strict volatile bitfields. */
10638 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10639 opts->x_flag_strict_volatile_bitfields = 1;
10640
10641 initialize_aarch64_code_model (opts);
10642 initialize_aarch64_tls_size (opts);
10643
10644 int queue_depth = 0;
10645 switch (aarch64_tune_params.autoprefetcher_model)
10646 {
10647 case tune_params::AUTOPREFETCHER_OFF:
10648 queue_depth = -1;
10649 break;
10650 case tune_params::AUTOPREFETCHER_WEAK:
10651 queue_depth = 0;
10652 break;
10653 case tune_params::AUTOPREFETCHER_STRONG:
10654 queue_depth = max_insn_queue_index + 1;
10655 break;
10656 default:
10657 gcc_unreachable ();
10658 }
10659
10660 /* We don't mind passing in global_options_set here as we don't use
10661 the *options_set structs anyway. */
10662 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10663 queue_depth,
10664 opts->x_param_values,
10665 global_options_set.x_param_values);
10666
10667 /* Set up parameters to be used in prefetching algorithm. Do not
10668 override the defaults unless we are tuning for a core we have
10669 researched values for. */
10670 if (aarch64_tune_params.prefetch->num_slots > 0)
10671 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10672 aarch64_tune_params.prefetch->num_slots,
10673 opts->x_param_values,
10674 global_options_set.x_param_values);
10675 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10676 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10677 aarch64_tune_params.prefetch->l1_cache_size,
10678 opts->x_param_values,
10679 global_options_set.x_param_values);
10680 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10681 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10682 aarch64_tune_params.prefetch->l1_cache_line_size,
10683 opts->x_param_values,
10684 global_options_set.x_param_values);
10685 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10686 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10687 aarch64_tune_params.prefetch->l2_cache_size,
10688 opts->x_param_values,
10689 global_options_set.x_param_values);
10690 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10691 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10692 0,
10693 opts->x_param_values,
10694 global_options_set.x_param_values);
10695 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10696 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10697 aarch64_tune_params.prefetch->minimum_stride,
10698 opts->x_param_values,
10699 global_options_set.x_param_values);
10700
10701 /* Use the alternative scheduling-pressure algorithm by default. */
10702 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10703 opts->x_param_values,
10704 global_options_set.x_param_values);
10705
10706 /* Enable sw prefetching at specified optimization level for
10707 CPUS that have prefetch. Lower optimization level threshold by 1
10708 when profiling is enabled. */
10709 if (opts->x_flag_prefetch_loop_arrays < 0
10710 && !opts->x_optimize_size
10711 && aarch64_tune_params.prefetch->default_opt_level >= 0
10712 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10713 opts->x_flag_prefetch_loop_arrays = 1;
10714
10715 aarch64_override_options_after_change_1 (opts);
10716 }
10717
10718 /* Print a hint with a suggestion for a core or architecture name that
10719 most closely resembles what the user passed in STR. ARCH is true if
10720 the user is asking for an architecture name. ARCH is false if the user
10721 is asking for a core name. */
10722
10723 static void
10724 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10725 {
10726 auto_vec<const char *> candidates;
10727 const struct processor *entry = arch ? all_architectures : all_cores;
10728 for (; entry->name != NULL; entry++)
10729 candidates.safe_push (entry->name);
10730
10731 #ifdef HAVE_LOCAL_CPU_DETECT
10732 /* Add also "native" as possible value. */
10733 if (arch)
10734 candidates.safe_push ("native");
10735 #endif
10736
10737 char *s;
10738 const char *hint = candidates_list_and_hint (str, s, candidates);
10739 if (hint)
10740 inform (input_location, "valid arguments are: %s;"
10741 " did you mean %qs?", s, hint);
10742 else
10743 inform (input_location, "valid arguments are: %s", s);
10744
10745 XDELETEVEC (s);
10746 }
10747
10748 /* Print a hint with a suggestion for a core name that most closely resembles
10749 what the user passed in STR. */
10750
10751 inline static void
10752 aarch64_print_hint_for_core (const char *str)
10753 {
10754 aarch64_print_hint_for_core_or_arch (str, false);
10755 }
10756
10757 /* Print a hint with a suggestion for an architecture name that most closely
10758 resembles what the user passed in STR. */
10759
10760 inline static void
10761 aarch64_print_hint_for_arch (const char *str)
10762 {
10763 aarch64_print_hint_for_core_or_arch (str, true);
10764 }
10765
10766 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10767 specified in STR and throw errors if appropriate. Put the results if
10768 they are valid in RES and ISA_FLAGS. Return whether the option is
10769 valid. */
10770
10771 static bool
10772 aarch64_validate_mcpu (const char *str, const struct processor **res,
10773 unsigned long *isa_flags)
10774 {
10775 enum aarch64_parse_opt_result parse_res
10776 = aarch64_parse_cpu (str, res, isa_flags);
10777
10778 if (parse_res == AARCH64_PARSE_OK)
10779 return true;
10780
10781 switch (parse_res)
10782 {
10783 case AARCH64_PARSE_MISSING_ARG:
10784 error ("missing cpu name in %<-mcpu=%s%>", str);
10785 break;
10786 case AARCH64_PARSE_INVALID_ARG:
10787 error ("unknown value %qs for -mcpu", str);
10788 aarch64_print_hint_for_core (str);
10789 break;
10790 case AARCH64_PARSE_INVALID_FEATURE:
10791 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10792 break;
10793 default:
10794 gcc_unreachable ();
10795 }
10796
10797 return false;
10798 }
10799
10800 /* Validate a command-line -march option. Parse the arch and extensions
10801 (if any) specified in STR and throw errors if appropriate. Put the
10802 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10803 option is valid. */
10804
10805 static bool
10806 aarch64_validate_march (const char *str, const struct processor **res,
10807 unsigned long *isa_flags)
10808 {
10809 enum aarch64_parse_opt_result parse_res
10810 = aarch64_parse_arch (str, res, isa_flags);
10811
10812 if (parse_res == AARCH64_PARSE_OK)
10813 return true;
10814
10815 switch (parse_res)
10816 {
10817 case AARCH64_PARSE_MISSING_ARG:
10818 error ("missing arch name in %<-march=%s%>", str);
10819 break;
10820 case AARCH64_PARSE_INVALID_ARG:
10821 error ("unknown value %qs for -march", str);
10822 aarch64_print_hint_for_arch (str);
10823 break;
10824 case AARCH64_PARSE_INVALID_FEATURE:
10825 error ("invalid feature modifier in %<-march=%s%>", str);
10826 break;
10827 default:
10828 gcc_unreachable ();
10829 }
10830
10831 return false;
10832 }
10833
10834 /* Validate a command-line -mtune option. Parse the cpu
10835 specified in STR and throw errors if appropriate. Put the
10836 result, if it is valid, in RES. Return whether the option is
10837 valid. */
10838
10839 static bool
10840 aarch64_validate_mtune (const char *str, const struct processor **res)
10841 {
10842 enum aarch64_parse_opt_result parse_res
10843 = aarch64_parse_tune (str, res);
10844
10845 if (parse_res == AARCH64_PARSE_OK)
10846 return true;
10847
10848 switch (parse_res)
10849 {
10850 case AARCH64_PARSE_MISSING_ARG:
10851 error ("missing cpu name in %<-mtune=%s%>", str);
10852 break;
10853 case AARCH64_PARSE_INVALID_ARG:
10854 error ("unknown value %qs for -mtune", str);
10855 aarch64_print_hint_for_core (str);
10856 break;
10857 default:
10858 gcc_unreachable ();
10859 }
10860 return false;
10861 }
10862
10863 /* Return the CPU corresponding to the enum CPU.
10864 If it doesn't specify a cpu, return the default. */
10865
10866 static const struct processor *
10867 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10868 {
10869 if (cpu != aarch64_none)
10870 return &all_cores[cpu];
10871
10872 /* The & 0x3f is to extract the bottom 6 bits that encode the
10873 default cpu as selected by the --with-cpu GCC configure option
10874 in config.gcc.
10875 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10876 flags mechanism should be reworked to make it more sane. */
10877 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10878 }
10879
10880 /* Return the architecture corresponding to the enum ARCH.
10881 If it doesn't specify a valid architecture, return the default. */
10882
10883 static const struct processor *
10884 aarch64_get_arch (enum aarch64_arch arch)
10885 {
10886 if (arch != aarch64_no_arch)
10887 return &all_architectures[arch];
10888
10889 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10890
10891 return &all_architectures[cpu->arch];
10892 }
10893
10894 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10895
10896 static poly_uint16
10897 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10898 {
10899 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10900 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10901 deciding which .md file patterns to use and when deciding whether
10902 something is a legitimate address or constant. */
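/* For example, -msve-vector-bits=256 yields VG = 256 / 64 = 4 (four 64-bit
   granules), while SVE_SCALABLE and SVE_128 both produce the (2, 2)
   poly_uint16 used for vector-length-agnostic code.  */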
10903 if (value == SVE_SCALABLE || value == SVE_128)
10904 return poly_uint16 (2, 2);
10905 else
10906 return (int) value / 64;
10907 }
10908
10909 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10910 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10911 tuning structs. In particular it must set selected_tune and
10912 aarch64_isa_flags that define the available ISA features and tuning
10913 decisions. It must also set selected_arch as this will be used to
10914 output the .arch asm tags for each function. */
10915
10916 static void
10917 aarch64_override_options (void)
10918 {
10919 unsigned long cpu_isa = 0;
10920 unsigned long arch_isa = 0;
10921 aarch64_isa_flags = 0;
10922
10923 bool valid_cpu = true;
10924 bool valid_tune = true;
10925 bool valid_arch = true;
10926
10927 selected_cpu = NULL;
10928 selected_arch = NULL;
10929 selected_tune = NULL;
10930
10931 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10932 If either of -march or -mtune is given, they override their
10933 respective component of -mcpu. */
10934 if (aarch64_cpu_string)
10935 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10936 &cpu_isa);
10937
10938 if (aarch64_arch_string)
10939 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10940 &arch_isa);
10941
10942 if (aarch64_tune_string)
10943 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10944
10945 /* If the user did not specify a processor, choose the default
10946 one for them. This will be the CPU set during configuration using
10947 --with-cpu, otherwise it is "generic". */
10948 if (!selected_cpu)
10949 {
10950 if (selected_arch)
10951 {
10952 selected_cpu = &all_cores[selected_arch->ident];
10953 aarch64_isa_flags = arch_isa;
10954 explicit_arch = selected_arch->arch;
10955 }
10956 else
10957 {
10958 /* Get default configure-time CPU. */
10959 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
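/* The bits of TARGET_CPU_DEFAULT above the bottom 6 (which encode the cpu
   itself, see aarch64_get_tune_cpu) hold its default ISA flags.  */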
10960 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10961 }
10962
10963 if (selected_tune)
10964 explicit_tune_core = selected_tune->ident;
10965 }
10966 /* If both -mcpu and -march are specified check that they are architecturally
10967 compatible, warn if they're not and prefer the -march ISA flags. */
10968 else if (selected_arch)
10969 {
10970 if (selected_arch->arch != selected_cpu->arch)
10971 {
10972 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10973 all_architectures[selected_cpu->arch].name,
10974 selected_arch->name);
10975 }
10976 aarch64_isa_flags = arch_isa;
10977 explicit_arch = selected_arch->arch;
10978 explicit_tune_core = selected_tune ? selected_tune->ident
10979 : selected_cpu->ident;
10980 }
10981 else
10982 {
10983 /* -mcpu but no -march. */
10984 aarch64_isa_flags = cpu_isa;
10985 explicit_tune_core = selected_tune ? selected_tune->ident
10986 : selected_cpu->ident;
10987 gcc_assert (selected_cpu);
10988 selected_arch = &all_architectures[selected_cpu->arch];
10989 explicit_arch = selected_arch->arch;
10990 }
10991
10992 /* Set the arch as well, as we will need it when outputting
10993 the .arch directive in assembly. */
10994 if (!selected_arch)
10995 {
10996 gcc_assert (selected_cpu);
10997 selected_arch = &all_architectures[selected_cpu->arch];
10998 }
10999
11000 if (!selected_tune)
11001 selected_tune = selected_cpu;
11002
11003 #ifndef HAVE_AS_MABI_OPTION
11004 /* The compiler may have been configured with 2.23.* binutils, which does
11005 not have support for ILP32. */
11006 if (TARGET_ILP32)
11007 error ("assembler does not support -mabi=ilp32");
11008 #endif
11009
11010 /* Convert -msve-vector-bits to a VG count. */
11011 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11012
11013 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11014 sorry ("return address signing is only supported for -mabi=lp64");
11015
11016 /* Make sure we properly set up the explicit options. */
11017 if ((aarch64_cpu_string && valid_cpu)
11018 || (aarch64_tune_string && valid_tune))
11019 gcc_assert (explicit_tune_core != aarch64_none);
11020
11021 if ((aarch64_cpu_string && valid_cpu)
11022 || (aarch64_arch_string && valid_arch))
11023 gcc_assert (explicit_arch != aarch64_no_arch);
11024
11025 aarch64_override_options_internal (&global_options);
11026
11027 /* Save these options as the default ones in case we push and pop them later
11028 while processing functions with potential target attributes. */
11029 target_option_default_node = target_option_current_node
11030 = build_target_option_node (&global_options);
11031 }
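/* Worked example of the precedence above (command lines are illustrative
   only):
     -mcpu=cortex-a57                     tune and ISA both come from the
                                          CPU and the architecture it
                                          implies.
     -mcpu=cortex-a57 -march=armv8.2-a    the two conflict, so a warning is
                                          issued, the armv8.2-a ISA flags
                                          win and cortex-a57 is kept for
                                          tuning.
     -march=armv8.2-a -mtune=cortex-a57   no -mcpu: the ISA comes from
                                          -march and the tuning from
                                          -mtune.  */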
11032
11033 /* Implement targetm.override_options_after_change. */
11034
11035 static void
11036 aarch64_override_options_after_change (void)
11037 {
11038 aarch64_override_options_after_change_1 (&global_options);
11039 }
11040
11041 static struct machine_function *
11042 aarch64_init_machine_status (void)
11043 {
11044 struct machine_function *machine;
11045 machine = ggc_cleared_alloc<machine_function> ();
11046 return machine;
11047 }
11048
11049 void
11050 aarch64_init_expanders (void)
11051 {
11052 init_machine_status = aarch64_init_machine_status;
11053 }
11054
11055 /* Set aarch64_cmodel from the code model selected with -mcmodel= and the
       PIC setting, checking that the combination is supported. */
11056 static void
11057 initialize_aarch64_code_model (struct gcc_options *opts)
11058 {
11059 if (opts->x_flag_pic)
11060 {
11061 switch (opts->x_aarch64_cmodel_var)
11062 {
11063 case AARCH64_CMODEL_TINY:
11064 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11065 break;
11066 case AARCH64_CMODEL_SMALL:
11067 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11068 aarch64_cmodel = (flag_pic == 2
11069 ? AARCH64_CMODEL_SMALL_PIC
11070 : AARCH64_CMODEL_SMALL_SPIC);
11071 #else
11072 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11073 #endif
11074 break;
11075 case AARCH64_CMODEL_LARGE:
11076 sorry ("code model %qs with -f%s", "large",
11077 opts->x_flag_pic > 1 ? "PIC" : "pic");
11078 break;
11079 default:
11080 gcc_unreachable ();
11081 }
11082 }
11083 else
11084 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11085 }
11086
11087 /* Implement TARGET_OPTION_SAVE. */
11088
11089 static void
11090 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11091 {
11092 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11093 }
11094
11095 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11096 using the information saved in PTR. */
11097
11098 static void
11099 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11100 {
11101 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11102 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11103 opts->x_explicit_arch = ptr->x_explicit_arch;
11104 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11105 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11106
11107 aarch64_override_options_internal (opts);
11108 }
11109
11110 /* Implement TARGET_OPTION_PRINT. */
11111
11112 static void
11113 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11114 {
11115 const struct processor *cpu
11116 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11117 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11118 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11119 std::string extension
11120 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11121
11122 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11123 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11124 arch->name, extension.c_str ());
11125 }
11126
11127 static GTY(()) tree aarch64_previous_fndecl;
11128
11129 void
11130 aarch64_reset_previous_fndecl (void)
11131 {
11132 aarch64_previous_fndecl = NULL;
11133 }
11134
11135 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11136 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11137 make sure optab availability predicates are recomputed when necessary. */
11138
11139 void
11140 aarch64_save_restore_target_globals (tree new_tree)
11141 {
11142 if (TREE_TARGET_GLOBALS (new_tree))
11143 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11144 else if (new_tree == target_option_default_node)
11145 restore_target_globals (&default_target_globals);
11146 else
11147 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11148 }
11149
11150 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11151 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11152 of the function, if such exists. This function may be called multiple
11153 times on a single function so use aarch64_previous_fndecl to avoid
11154 setting up identical state. */
11155
11156 static void
11157 aarch64_set_current_function (tree fndecl)
11158 {
11159 if (!fndecl || fndecl == aarch64_previous_fndecl)
11160 return;
11161
11162 tree old_tree = (aarch64_previous_fndecl
11163 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11164 : NULL_TREE);
11165
11166 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11167
11168 /* If current function has no attributes but the previous one did,
11169 use the default node. */
11170 if (!new_tree && old_tree)
11171 new_tree = target_option_default_node;
11172
11173 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11174 the default have been handled by aarch64_save_restore_target_globals from
11175 aarch64_pragma_target_parse. */
11176 if (old_tree == new_tree)
11177 return;
11178
11179 aarch64_previous_fndecl = fndecl;
11180
11181 /* First set the target options. */
11182 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11183
11184 aarch64_save_restore_target_globals (new_tree);
11185 }
11186
11187 /* Enum describing the various ways we can handle attributes.
11188 In many cases we can reuse the generic option handling machinery. */
11189
11190 enum aarch64_attr_opt_type
11191 {
11192 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11193 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11194 aarch64_attr_enum, /* Attribute sets an enum variable. */
11195 aarch64_attr_custom /* Attribute requires a custom handling function. */
11196 };
11197
11198 /* All the information needed to handle a target attribute.
11199 NAME is the name of the attribute.
11200 ATTR_TYPE specifies the type of behavior of the attribute as described
11201 in the definition of enum aarch64_attr_opt_type.
11202 ALLOW_NEG is true if the attribute supports a "no-" form.
11203 HANDLER is the function that takes the attribute string as an argument.
11204 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11205 OPT_NUM is the enum specifying the option that the attribute modifies.
11206 This is needed for attributes that mirror the behavior of a command-line
11207 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11208 aarch64_attr_enum. */
11209
11210 struct aarch64_attribute_info
11211 {
11212 const char *name;
11213 enum aarch64_attr_opt_type attr_type;
11214 bool allow_neg;
11215 bool (*handler) (const char *);
11216 enum opt_code opt_num;
11217 };
11218
11219 /* Handle the ARCH_STR argument to the arch= target attribute. */
11220
11221 static bool
11222 aarch64_handle_attr_arch (const char *str)
11223 {
11224 const struct processor *tmp_arch = NULL;
11225 enum aarch64_parse_opt_result parse_res
11226 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11227
11228 if (parse_res == AARCH64_PARSE_OK)
11229 {
11230 gcc_assert (tmp_arch);
11231 selected_arch = tmp_arch;
11232 explicit_arch = selected_arch->arch;
11233 return true;
11234 }
11235
11236 switch (parse_res)
11237 {
11238 case AARCH64_PARSE_MISSING_ARG:
11239 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11240 break;
11241 case AARCH64_PARSE_INVALID_ARG:
11242 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11243 aarch64_print_hint_for_arch (str);
11244 break;
11245 case AARCH64_PARSE_INVALID_FEATURE:
11246 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11247 break;
11248 default:
11249 gcc_unreachable ();
11250 }
11251
11252 return false;
11253 }
11254
11255 /* Handle the argument CPU_STR to the cpu= target attribute. */
11256
11257 static bool
11258 aarch64_handle_attr_cpu (const char *str)
11259 {
11260 const struct processor *tmp_cpu = NULL;
11261 enum aarch64_parse_opt_result parse_res
11262 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11263
11264 if (parse_res == AARCH64_PARSE_OK)
11265 {
11266 gcc_assert (tmp_cpu);
11267 selected_tune = tmp_cpu;
11268 explicit_tune_core = selected_tune->ident;
11269
11270 selected_arch = &all_architectures[tmp_cpu->arch];
11271 explicit_arch = selected_arch->arch;
11272 return true;
11273 }
11274
11275 switch (parse_res)
11276 {
11277 case AARCH64_PARSE_MISSING_ARG:
11278 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11279 break;
11280 case AARCH64_PARSE_INVALID_ARG:
11281 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11282 aarch64_print_hint_for_core (str);
11283 break;
11284 case AARCH64_PARSE_INVALID_FEATURE:
11285 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11286 break;
11287 default:
11288 gcc_unreachable ();
11289 }
11290
11291 return false;
11292 }
11293
11294 /* Handle the argument STR to the tune= target attribute. */
11295
11296 static bool
11297 aarch64_handle_attr_tune (const char *str)
11298 {
11299 const struct processor *tmp_tune = NULL;
11300 enum aarch64_parse_opt_result parse_res
11301 = aarch64_parse_tune (str, &tmp_tune);
11302
11303 if (parse_res == AARCH64_PARSE_OK)
11304 {
11305 gcc_assert (tmp_tune);
11306 selected_tune = tmp_tune;
11307 explicit_tune_core = selected_tune->ident;
11308 return true;
11309 }
11310
11311 switch (parse_res)
11312 {
11313 case AARCH64_PARSE_INVALID_ARG:
11314 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11315 aarch64_print_hint_for_core (str);
11316 break;
11317 default:
11318 gcc_unreachable ();
11319 }
11320
11321 return false;
11322 }
11323
11324 /* Parse an architecture extensions target attribute string specified in STR.
11325 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11326 if successful. Update aarch64_isa_flags to reflect the ISA features
11327 modified. */
11328
11329 static bool
11330 aarch64_handle_attr_isa_flags (char *str)
11331 {
11332 enum aarch64_parse_opt_result parse_res;
11333 unsigned long isa_flags = aarch64_isa_flags;
11334
11335 /* We allow "+nothing" in the beginning to clear out all architectural
11336 features if the user wants to handpick specific features. */
11337 if (strncmp ("+nothing", str, 8) == 0)
11338 {
11339 isa_flags = 0;
11340 str += 8;
11341 }
11342
11343 parse_res = aarch64_parse_extension (str, &isa_flags);
11344
11345 if (parse_res == AARCH64_PARSE_OK)
11346 {
11347 aarch64_isa_flags = isa_flags;
11348 return true;
11349 }
11350
11351 switch (parse_res)
11352 {
11353 case AARCH64_PARSE_MISSING_ARG:
11354 error ("missing value in %<target()%> pragma or attribute");
11355 break;
11356
11357 case AARCH64_PARSE_INVALID_FEATURE:
11358 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11359 break;
11360
11361 default:
11362 gcc_unreachable ();
11363 }
11364
11365 return false;
11366 }
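/* For example (illustrative only), "+nothing+fp" first clears every
   architectural feature bit and then enables just the floating-point
   extension, while "+nosimd" starts from the current aarch64_isa_flags
   and removes Advanced SIMD.  */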
11367
11368 /* The target attributes that we support. On top of these we also support just
11369 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11370 handled explicitly in aarch64_process_one_target_attr. */
11371
11372 static const struct aarch64_attribute_info aarch64_attributes[] =
11373 {
11374 { "general-regs-only", aarch64_attr_mask, false, NULL,
11375 OPT_mgeneral_regs_only },
11376 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11377 OPT_mfix_cortex_a53_835769 },
11378 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11379 OPT_mfix_cortex_a53_843419 },
11380 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11381 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11382 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11383 OPT_momit_leaf_frame_pointer },
11384 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11385 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11386 OPT_march_ },
11387 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11388 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11389 OPT_mtune_ },
11390 { "sign-return-address", aarch64_attr_enum, false, NULL,
11391 OPT_msign_return_address_ },
11392 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11393 };
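/* A few example strings (illustrative only) and the table entries that
   handle them:
     "general-regs-only"            aarch64_attr_mask
     "no-omit-leaf-frame-pointer"   aarch64_attr_bool, negated form
     "cmodel=large"                 aarch64_attr_enum
     "arch=armv8-a+crc"             aarch64_attr_custom, dispatched to
                                    aarch64_handle_attr_arch  */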
11394
11395 /* Parse ARG_STR which contains the definition of one target attribute.
11396 Show appropriate errors if any or return true if the attribute is valid. */
11397
11398 static bool
11399 aarch64_process_one_target_attr (char *arg_str)
11400 {
11401 bool invert = false;
11402
11403 size_t len = strlen (arg_str);
11404
11405 if (len == 0)
11406 {
11407 error ("malformed %<target()%> pragma or attribute");
11408 return false;
11409 }
11410
11411 char *str_to_check = (char *) alloca (len + 1);
11412 strcpy (str_to_check, arg_str);
11413
11414 /* Skip leading whitespace. */
11415 while (*str_to_check == ' ' || *str_to_check == '\t')
11416 str_to_check++;
11417
11418 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11419 It is easier to detect and handle it explicitly here rather than going
11420 through the machinery for the rest of the target attributes in this
11421 function. */
11422 if (*str_to_check == '+')
11423 return aarch64_handle_attr_isa_flags (str_to_check);
11424
11425 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11426 {
11427 invert = true;
11428 str_to_check += 3;
11429 }
11430 char *arg = strchr (str_to_check, '=');
11431
11432 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11433 and point ARG to "foo". */
11434 if (arg)
11435 {
11436 *arg = '\0';
11437 arg++;
11438 }
11439 const struct aarch64_attribute_info *p_attr;
11440 bool found = false;
11441 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11442 {
11443 /* If the names don't match up, or the user has given an argument
11444 to an attribute that doesn't accept one, or didn't give an argument
11445 to an attribute that expects one, fail to match. */
11446 if (strcmp (str_to_check, p_attr->name) != 0)
11447 continue;
11448
11449 found = true;
11450 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11451 || p_attr->attr_type == aarch64_attr_enum;
11452
11453 if (attr_need_arg_p ^ (arg != NULL))
11454 {
11455 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11456 return false;
11457 }
11458
11459 /* If the name matches but the attribute does not allow "no-" versions
11460 then we can't match. */
11461 if (invert && !p_attr->allow_neg)
11462 {
11463 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11464 return false;
11465 }
11466
11467 switch (p_attr->attr_type)
11468 {
11469 /* Has a custom handler registered.
11470 For example, cpu=, arch=, tune=. */
11471 case aarch64_attr_custom:
11472 gcc_assert (p_attr->handler);
11473 if (!p_attr->handler (arg))
11474 return false;
11475 break;
11476
11477 /* Either set or unset a boolean option. */
11478 case aarch64_attr_bool:
11479 {
11480 struct cl_decoded_option decoded;
11481
11482 generate_option (p_attr->opt_num, NULL, !invert,
11483 CL_TARGET, &decoded);
11484 aarch64_handle_option (&global_options, &global_options_set,
11485 &decoded, input_location);
11486 break;
11487 }
11488 /* Set or unset a bit in the target_flags. aarch64_handle_option
11489 should know what mask to apply given the option number. */
11490 case aarch64_attr_mask:
11491 {
11492 struct cl_decoded_option decoded;
11493 /* We only need to specify the option number.
11494 aarch64_handle_option will know which mask to apply. */
11495 decoded.opt_index = p_attr->opt_num;
11496 decoded.value = !invert;
11497 aarch64_handle_option (&global_options, &global_options_set,
11498 &decoded, input_location);
11499 break;
11500 }
11501 /* Use the option setting machinery to set an option to an enum. */
11502 case aarch64_attr_enum:
11503 {
11504 gcc_assert (arg);
11505 bool valid;
11506 int value;
11507 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11508 &value, CL_TARGET);
11509 if (valid)
11510 {
11511 set_option (&global_options, NULL, p_attr->opt_num, value,
11512 NULL, DK_UNSPECIFIED, input_location,
11513 global_dc);
11514 }
11515 else
11516 {
11517 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11518 }
11519 break;
11520 }
11521 default:
11522 gcc_unreachable ();
11523 }
11524 }
11525
11526 /* If we reached here we either have found an attribute and validated
11527 it or didn't match any. If we matched an attribute but its arguments
11528 were malformed we will have returned false already. */
11529 return found;
11530 }
11531
11532 /* Count how many times the character C appears in
11533 NULL-terminated string STR. */
11534
11535 static unsigned int
11536 num_occurences_in_str (char c, char *str)
11537 {
11538 unsigned int res = 0;
11539 while (*str != '\0')
11540 {
11541 if (*str == c)
11542 res++;
11543
11544 str++;
11545 }
11546
11547 return res;
11548 }
11549
11550 /* Parse the tree in ARGS that contains the target attribute information
11551 and update the global target options space. */
11552
11553 bool
11554 aarch64_process_target_attr (tree args)
11555 {
11556 if (TREE_CODE (args) == TREE_LIST)
11557 {
11558 do
11559 {
11560 tree head = TREE_VALUE (args);
11561 if (head)
11562 {
11563 if (!aarch64_process_target_attr (head))
11564 return false;
11565 }
11566 args = TREE_CHAIN (args);
11567 } while (args);
11568
11569 return true;
11570 }
11571
11572 if (TREE_CODE (args) != STRING_CST)
11573 {
11574 error ("attribute %<target%> argument not a string");
11575 return false;
11576 }
11577
11578 size_t len = strlen (TREE_STRING_POINTER (args));
11579 char *str_to_check = (char *) alloca (len + 1);
11580 strcpy (str_to_check, TREE_STRING_POINTER (args));
11581
11582 if (len == 0)
11583 {
11584 error ("malformed %<target()%> pragma or attribute");
11585 return false;
11586 }
11587
11588 /* Used to catch empty strings between commas, i.e.
11589 attribute ((target ("attr1,,attr2"))). */
11590 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11591
11592 /* Handle multiple target attributes separated by ','. */
11593 char *token = strtok (str_to_check, ",");
11594
11595 unsigned int num_attrs = 0;
11596 while (token)
11597 {
11598 num_attrs++;
11599 if (!aarch64_process_one_target_attr (token))
11600 {
11601 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11602 return false;
11603 }
11604
11605 token = strtok (NULL, ",");
11606 }
11607
11608 if (num_attrs != num_commas + 1)
11609 {
11610 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11611 return false;
11612 }
11613
11614 return true;
11615 }
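/* Example inputs (illustrative only):
     __attribute__ ((target ("arch=armv8-a+simd,cmodel=small")))
   is split on ',' into two tokens, each handled by
   aarch64_process_one_target_attr, whereas
     __attribute__ ((target ("strict-align,,tune=cortex-a53")))
   is rejected: strtok skips the empty token, so num_attrs (2) differs
   from num_commas + 1 (3).  */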
11616
11617 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11618 process attribute ((target ("..."))). */
11619
11620 static bool
11621 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11622 {
11623 struct cl_target_option cur_target;
11624 bool ret;
11625 tree old_optimize;
11626 tree new_target, new_optimize;
11627 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11628
11629 /* If what we're processing is the current pragma string then the
11630 target option node is already stored in target_option_current_node
11631 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11632 having to re-parse the string. This is especially useful to keep
11633 arm_neon.h compile times down since that header contains a lot
11634 of intrinsics enclosed in pragmas. */
11635 if (!existing_target && args == current_target_pragma)
11636 {
11637 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11638 return true;
11639 }
11640 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11641
11642 old_optimize = build_optimization_node (&global_options);
11643 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11644
11645 /* If the function changed the optimization levels as well as setting
11646 target options, start with the optimizations specified. */
11647 if (func_optimize && func_optimize != old_optimize)
11648 cl_optimization_restore (&global_options,
11649 TREE_OPTIMIZATION (func_optimize));
11650
11651 /* Save the current target options to restore at the end. */
11652 cl_target_option_save (&cur_target, &global_options);
11653
11654 /* If fndecl already has some target attributes applied to it, unpack
11655 them so that we add this attribute on top of them, rather than
11656 overwriting them. */
11657 if (existing_target)
11658 {
11659 struct cl_target_option *existing_options
11660 = TREE_TARGET_OPTION (existing_target);
11661
11662 if (existing_options)
11663 cl_target_option_restore (&global_options, existing_options);
11664 }
11665 else
11666 cl_target_option_restore (&global_options,
11667 TREE_TARGET_OPTION (target_option_current_node));
11668
11669 ret = aarch64_process_target_attr (args);
11670
11671 /* Set up any additional state. */
11672 if (ret)
11673 {
11674 aarch64_override_options_internal (&global_options);
11675 /* Initialize SIMD builtins if we haven't already.
11676 Set current_target_pragma to NULL for the duration so that
11677 the builtin initialization code doesn't try to tag the functions
11678 being built with the attributes specified by any current pragma, thus
11679 going into an infinite recursion. */
11680 if (TARGET_SIMD)
11681 {
11682 tree saved_current_target_pragma = current_target_pragma;
11683 current_target_pragma = NULL;
11684 aarch64_init_simd_builtins ();
11685 current_target_pragma = saved_current_target_pragma;
11686 }
11687 new_target = build_target_option_node (&global_options);
11688 }
11689 else
11690 new_target = NULL;
11691
11692 new_optimize = build_optimization_node (&global_options);
11693
11694 if (fndecl && ret)
11695 {
11696 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11697
11698 if (old_optimize != new_optimize)
11699 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11700 }
11701
11702 cl_target_option_restore (&global_options, &cur_target);
11703
11704 if (old_optimize != new_optimize)
11705 cl_optimization_restore (&global_options,
11706 TREE_OPTIMIZATION (old_optimize));
11707 return ret;
11708 }
11709
11710 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11711 tri-bool options (yes, no, don't care) and the default value is
11712 DEF, determine whether to reject inlining. */
11713
11714 static bool
11715 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11716 int dont_care, int def)
11717 {
11718 /* If the callee doesn't care, always allow inlining. */
11719 if (callee == dont_care)
11720 return true;
11721
11722 /* If the caller doesn't care, always allow inlining. */
11723 if (caller == dont_care)
11724 return true;
11725
11726 /* Otherwise, allow inlining if either the callee and caller values
11727 agree, or if the callee is using the default value. */
11728 return (callee == caller || callee == def);
11729 }
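/* For instance (illustrative only), with DONT_CARE == 2 and DEF == 0:
     caller 1, callee 2 -> inline OK (callee does not care);
     caller 2, callee 1 -> inline OK (caller does not care);
     caller 0, callee 1 -> rejected (explicit, conflicting choices);
     caller 1, callee 0 -> inline OK (callee uses the default).  */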
11730
11731 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11732 to inline CALLEE into CALLER based on target-specific info.
11733 Make sure that the caller and callee have compatible architectural
11734 features. Then go through the other possible target attributes
11735 and see if they can block inlining. Try not to reject always_inline
11736 callees unless they are incompatible architecturally. */
11737
11738 static bool
11739 aarch64_can_inline_p (tree caller, tree callee)
11740 {
11741 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11742 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11743
11744 struct cl_target_option *caller_opts
11745 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11746 : target_option_default_node);
11747
11748 struct cl_target_option *callee_opts
11749 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11750 : target_option_default_node);
11751
11752 /* Callee's ISA flags should be a subset of the caller's. */
11753 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11754 != callee_opts->x_aarch64_isa_flags)
11755 return false;
11756
11757 /* Allow inlining of non-strict-aligned functions into strict-aligned
11758 ones. */
11759 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11760 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11761 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11762 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11763 return false;
11764
11765 bool always_inline = lookup_attribute ("always_inline",
11766 DECL_ATTRIBUTES (callee));
11767
11768 /* If the architectural features match up and the callee is always_inline
11769 then the other attributes don't matter. */
11770 if (always_inline)
11771 return true;
11772
11773 if (caller_opts->x_aarch64_cmodel_var
11774 != callee_opts->x_aarch64_cmodel_var)
11775 return false;
11776
11777 if (caller_opts->x_aarch64_tls_dialect
11778 != callee_opts->x_aarch64_tls_dialect)
11779 return false;
11780
11781 /* Honour explicit requests to work around errata. */
11782 if (!aarch64_tribools_ok_for_inlining_p (
11783 caller_opts->x_aarch64_fix_a53_err835769,
11784 callee_opts->x_aarch64_fix_a53_err835769,
11785 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11786 return false;
11787
11788 if (!aarch64_tribools_ok_for_inlining_p (
11789 caller_opts->x_aarch64_fix_a53_err843419,
11790 callee_opts->x_aarch64_fix_a53_err843419,
11791 2, TARGET_FIX_ERR_A53_843419))
11792 return false;
11793
11794 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11795 caller and callee and they don't match up, reject inlining. */
11796 if (!aarch64_tribools_ok_for_inlining_p (
11797 caller_opts->x_flag_omit_leaf_frame_pointer,
11798 callee_opts->x_flag_omit_leaf_frame_pointer,
11799 2, 1))
11800 return false;
11801
11802 /* If the callee has specific tuning overrides, respect them. */
11803 if (callee_opts->x_aarch64_override_tune_string != NULL
11804 && caller_opts->x_aarch64_override_tune_string == NULL)
11805 return false;
11806
11807 /* If the user specified tuning override strings for the
11808 caller and callee and they don't match up, reject inlining.
11809 We just do a string compare here, we don't analyze the meaning
11810 of the string, as it would be too costly for little gain. */
11811 if (callee_opts->x_aarch64_override_tune_string
11812 && caller_opts->x_aarch64_override_tune_string
11813 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11814 caller_opts->x_aarch64_override_tune_string) != 0))
11815 return false;
11816
11817 return true;
11818 }
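/* Sketch of the ISA subset rule above (function names are hypothetical):

     __attribute__ ((target ("+crc"))) static int sum_crc (int x);
     int caller (int x) { return sum_crc (x); }

   When the translation unit is built without CRC enabled, the callee's
   ISA flags contain a bit the caller's lack, so the subset check fails
   and the call is not inlined; enabling +crc for the caller (or the whole
   unit) makes the flags compatible again.  */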
11819
11820 /* Return true if SYMBOL_REF X binds locally. */
11821
11822 static bool
11823 aarch64_symbol_binds_local_p (const_rtx x)
11824 {
11825 return (SYMBOL_REF_DECL (x)
11826 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11827 : SYMBOL_REF_LOCAL_P (x));
11828 }
11829
11830 /* Return true if SYMBOL_REF X is thread-local. */
11831 static bool
11832 aarch64_tls_symbol_p (rtx x)
11833 {
11834 if (! TARGET_HAVE_TLS)
11835 return false;
11836
11837 if (GET_CODE (x) != SYMBOL_REF)
11838 return false;
11839
11840 return SYMBOL_REF_TLS_MODEL (x) != 0;
11841 }
11842
11843 /* Classify a TLS symbol into one of the TLS kinds. */
11844 enum aarch64_symbol_type
11845 aarch64_classify_tls_symbol (rtx x)
11846 {
11847 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11848
11849 switch (tls_kind)
11850 {
11851 case TLS_MODEL_GLOBAL_DYNAMIC:
11852 case TLS_MODEL_LOCAL_DYNAMIC:
11853 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11854
11855 case TLS_MODEL_INITIAL_EXEC:
11856 switch (aarch64_cmodel)
11857 {
11858 case AARCH64_CMODEL_TINY:
11859 case AARCH64_CMODEL_TINY_PIC:
11860 return SYMBOL_TINY_TLSIE;
11861 default:
11862 return SYMBOL_SMALL_TLSIE;
11863 }
11864
11865 case TLS_MODEL_LOCAL_EXEC:
11866 if (aarch64_tls_size == 12)
11867 return SYMBOL_TLSLE12;
11868 else if (aarch64_tls_size == 24)
11869 return SYMBOL_TLSLE24;
11870 else if (aarch64_tls_size == 32)
11871 return SYMBOL_TLSLE32;
11872 else if (aarch64_tls_size == 48)
11873 return SYMBOL_TLSLE48;
11874 else
11875 gcc_unreachable ();
11876
11877 case TLS_MODEL_EMULATED:
11878 case TLS_MODEL_NONE:
11879 return SYMBOL_FORCE_TO_MEM;
11880
11881 default:
11882 gcc_unreachable ();
11883 }
11884 }
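/* For example (illustrative only), a variable such as

     static __thread int counter;

   accessed under -ftls-model=local-exec with -mtls-size=24 is classified
   as SYMBOL_TLSLE24, while the same variable under the global-dynamic
   model is SYMBOL_SMALL_TLSDESC when TLS descriptors are in use and
   SYMBOL_SMALL_TLSGD otherwise.  */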
11885
11886 /* Return the correct method for accessing X + OFFSET, where X is either
11887 a SYMBOL_REF or LABEL_REF. */
11888
11889 enum aarch64_symbol_type
11890 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11891 {
11892 if (GET_CODE (x) == LABEL_REF)
11893 {
11894 switch (aarch64_cmodel)
11895 {
11896 case AARCH64_CMODEL_LARGE:
11897 return SYMBOL_FORCE_TO_MEM;
11898
11899 case AARCH64_CMODEL_TINY_PIC:
11900 case AARCH64_CMODEL_TINY:
11901 return SYMBOL_TINY_ABSOLUTE;
11902
11903 case AARCH64_CMODEL_SMALL_SPIC:
11904 case AARCH64_CMODEL_SMALL_PIC:
11905 case AARCH64_CMODEL_SMALL:
11906 return SYMBOL_SMALL_ABSOLUTE;
11907
11908 default:
11909 gcc_unreachable ();
11910 }
11911 }
11912
11913 if (GET_CODE (x) == SYMBOL_REF)
11914 {
11915 if (aarch64_tls_symbol_p (x))
11916 return aarch64_classify_tls_symbol (x);
11917
11918 switch (aarch64_cmodel)
11919 {
11920 case AARCH64_CMODEL_TINY:
11921 /* When we retrieve a symbol + offset address, we have to make sure
11922 the offset does not cause overflow of the final address. But we
11923 have no way of knowing the address of the symbol at compile time,
11924 so we can't accurately say whether the distance between the PC and
11925 symbol + offset is outside the addressable range of +/-1M in the
11926 TINY code model. So we rely on images not being larger than 1M,
11927 cap the offset at 1M, and load anything beyond that through an
11928 alternative mechanism. Furthermore, if the symbol is a weak
11929 reference to something that isn't known to resolve to a symbol in
11930 this module, then force it to memory. */
11931 if ((SYMBOL_REF_WEAK (x)
11932 && !aarch64_symbol_binds_local_p (x))
11933 || !IN_RANGE (offset, -1048575, 1048575))
11934 return SYMBOL_FORCE_TO_MEM;
11935 return SYMBOL_TINY_ABSOLUTE;
11936
11937 case AARCH64_CMODEL_SMALL:
11938 /* Same reasoning as the tiny code model, but the offset cap here is
11939 4G. */
11940 if ((SYMBOL_REF_WEAK (x)
11941 && !aarch64_symbol_binds_local_p (x))
11942 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11943 HOST_WIDE_INT_C (4294967264)))
11944 return SYMBOL_FORCE_TO_MEM;
11945 return SYMBOL_SMALL_ABSOLUTE;
11946
11947 case AARCH64_CMODEL_TINY_PIC:
11948 if (!aarch64_symbol_binds_local_p (x))
11949 return SYMBOL_TINY_GOT;
11950 return SYMBOL_TINY_ABSOLUTE;
11951
11952 case AARCH64_CMODEL_SMALL_SPIC:
11953 case AARCH64_CMODEL_SMALL_PIC:
11954 if (!aarch64_symbol_binds_local_p (x))
11955 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11956 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11957 return SYMBOL_SMALL_ABSOLUTE;
11958
11959 case AARCH64_CMODEL_LARGE:
11960 /* This is alright even in PIC code as the constant
11961 pool reference is always PC relative and within
11962 the same translation unit. */
11963 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11964 return SYMBOL_SMALL_ABSOLUTE;
11965 else
11966 return SYMBOL_FORCE_TO_MEM;
11967
11968 default:
11969 gcc_unreachable ();
11970 }
11971 }
11972
11973 /* By default push everything into the constant pool. */
11974 return SYMBOL_FORCE_TO_MEM;
11975 }
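/* Illustrative example of the offset capping above (the array name is
   hypothetical): in the tiny code model, given

     extern char table[];

   the address &table[2 * 1024 * 1024] uses symbol + 0x200000, which is
   outside the +/-1M cap, so it is classified as SYMBOL_FORCE_TO_MEM,
   whereas &table[1024] stays SYMBOL_TINY_ABSOLUTE.  */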
11976
11977 bool
11978 aarch64_constant_address_p (rtx x)
11979 {
11980 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11981 }
11982
11983 bool
11984 aarch64_legitimate_pic_operand_p (rtx x)
11985 {
11986 if (GET_CODE (x) == SYMBOL_REF
11987 || (GET_CODE (x) == CONST
11988 && GET_CODE (XEXP (x, 0)) == PLUS
11989 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11990 return false;
11991
11992 return true;
11993 }
11994
11995 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11996 that should be rematerialized rather than spilled. */
11997
11998 static bool
11999 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12000 {
12001 /* Support CSE and rematerialization of common constants. */
12002 if (CONST_INT_P (x)
12003 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12004 || GET_CODE (x) == CONST_VECTOR)
12005 return true;
12006
12007 /* Do not allow vector struct mode constants for Advanced SIMD.
12008 We could support 0 and -1 easily, but they need support in
12009 aarch64-simd.md. */
12010 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12011 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12012 return false;
12013
12014 /* Only accept variable-length vector constants if they can be
12015 handled directly.
12016
12017 ??? It would be possible to handle rematerialization of other
12018 constants via secondary reloads. */
12019 if (vec_flags & VEC_ANY_SVE)
12020 return aarch64_simd_valid_immediate (x, NULL);
12021
12022 if (GET_CODE (x) == HIGH)
12023 x = XEXP (x, 0);
12024
12025 /* Accept polynomial constants that can be calculated by using the
12026 destination of a move as the sole temporary. Constants that
12027 require a second temporary cannot be rematerialized (they can't be
12028 forced to memory and also aren't legitimate constants). */
12029 poly_int64 offset;
12030 if (poly_int_rtx_p (x, &offset))
12031 return aarch64_offset_temporaries (false, offset) <= 1;
12032
12033 /* If an offset is being added to something else, we need to allow the
12034 base to be moved into the destination register, meaning that there
12035 are no free temporaries for the offset. */
12036 x = strip_offset (x, &offset);
12037 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12038 return false;
12039
12040 /* Do not allow const (plus (anchor_symbol, const_int)). */
12041 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12042 return false;
12043
12044 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12045 so spilling them is better than rematerialization. */
12046 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12047 return true;
12048
12049 /* Label references are always constant. */
12050 if (GET_CODE (x) == LABEL_REF)
12051 return true;
12052
12053 return false;
12054 }
12055
12056 rtx
12057 aarch64_load_tp (rtx target)
12058 {
12059 if (!target
12060 || GET_MODE (target) != Pmode
12061 || !register_operand (target, Pmode))
12062 target = gen_reg_rtx (Pmode);
12063
12064 /* Can return in any reg. */
12065 emit_insn (gen_aarch64_load_tp_hard (target));
12066 return target;
12067 }
12068
12069 /* On AAPCS systems, this is the "struct __va_list". */
12070 static GTY(()) tree va_list_type;
12071
12072 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12073 Return the type to use as __builtin_va_list.
12074
12075 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12076
12077 struct __va_list
12078 {
12079 void *__stack;
12080 void *__gr_top;
12081 void *__vr_top;
12082 int __gr_offs;
12083 int __vr_offs;
12084 }; */
12085
12086 static tree
12087 aarch64_build_builtin_va_list (void)
12088 {
12089 tree va_list_name;
12090 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12091
12092 /* Create the type. */
12093 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12094 /* Give it the required name. */
12095 va_list_name = build_decl (BUILTINS_LOCATION,
12096 TYPE_DECL,
12097 get_identifier ("__va_list"),
12098 va_list_type);
12099 DECL_ARTIFICIAL (va_list_name) = 1;
12100 TYPE_NAME (va_list_type) = va_list_name;
12101 TYPE_STUB_DECL (va_list_type) = va_list_name;
12102
12103 /* Create the fields. */
12104 f_stack = build_decl (BUILTINS_LOCATION,
12105 FIELD_DECL, get_identifier ("__stack"),
12106 ptr_type_node);
12107 f_grtop = build_decl (BUILTINS_LOCATION,
12108 FIELD_DECL, get_identifier ("__gr_top"),
12109 ptr_type_node);
12110 f_vrtop = build_decl (BUILTINS_LOCATION,
12111 FIELD_DECL, get_identifier ("__vr_top"),
12112 ptr_type_node);
12113 f_groff = build_decl (BUILTINS_LOCATION,
12114 FIELD_DECL, get_identifier ("__gr_offs"),
12115 integer_type_node);
12116 f_vroff = build_decl (BUILTINS_LOCATION,
12117 FIELD_DECL, get_identifier ("__vr_offs"),
12118 integer_type_node);
12119
12120 /* Tell the tree-stdarg pass about our internal offset fields.
12121 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12122 purposes, to identify whether the code is updating va_list internal
12123 offset fields in an irregular way. */
12124 va_list_gpr_counter_field = f_groff;
12125 va_list_fpr_counter_field = f_vroff;
12126
12127 DECL_ARTIFICIAL (f_stack) = 1;
12128 DECL_ARTIFICIAL (f_grtop) = 1;
12129 DECL_ARTIFICIAL (f_vrtop) = 1;
12130 DECL_ARTIFICIAL (f_groff) = 1;
12131 DECL_ARTIFICIAL (f_vroff) = 1;
12132
12133 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12134 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12135 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12136 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12137 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12138
12139 TYPE_FIELDS (va_list_type) = f_stack;
12140 DECL_CHAIN (f_stack) = f_grtop;
12141 DECL_CHAIN (f_grtop) = f_vrtop;
12142 DECL_CHAIN (f_vrtop) = f_groff;
12143 DECL_CHAIN (f_groff) = f_vroff;
12144
12145 /* Compute its layout. */
12146 layout_type (va_list_type);
12147
12148 return va_list_type;
12149 }
12150
12151 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12152 static void
12153 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12154 {
12155 const CUMULATIVE_ARGS *cum;
12156 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12157 tree stack, grtop, vrtop, groff, vroff;
12158 tree t;
12159 int gr_save_area_size = cfun->va_list_gpr_size;
12160 int vr_save_area_size = cfun->va_list_fpr_size;
12161 int vr_offset;
12162
12163 cum = &crtl->args.info;
12164 if (cfun->va_list_gpr_size)
12165 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12166 cfun->va_list_gpr_size);
12167 if (cfun->va_list_fpr_size)
12168 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12169 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12170
12171 if (!TARGET_FLOAT)
12172 {
12173 gcc_assert (cum->aapcs_nvrn == 0);
12174 vr_save_area_size = 0;
12175 }
12176
12177 f_stack = TYPE_FIELDS (va_list_type_node);
12178 f_grtop = DECL_CHAIN (f_stack);
12179 f_vrtop = DECL_CHAIN (f_grtop);
12180 f_groff = DECL_CHAIN (f_vrtop);
12181 f_vroff = DECL_CHAIN (f_groff);
12182
12183 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12184 NULL_TREE);
12185 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12186 NULL_TREE);
12187 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12188 NULL_TREE);
12189 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12190 NULL_TREE);
12191 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12192 NULL_TREE);
12193
12194 /* Emit code to initialize STACK, which points to the next varargs stack
12195 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12196 by named arguments. STACK is 8-byte aligned. */
12197 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12198 if (cum->aapcs_stack_size > 0)
12199 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12200 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12201 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12202
12203 /* Emit code to initialize GRTOP, the top of the GR save area.
12204 virtual_incoming_args_rtx should have been 16 byte aligned. */
12205 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12206 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12207 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12208
12209 /* Emit code to initialize VRTOP, the top of the VR save area.
12210 This address is gr_save_area_bytes below GRTOP, rounded
12211 down to the next 16-byte boundary. */
12212 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12213 vr_offset = ROUND_UP (gr_save_area_size,
12214 STACK_BOUNDARY / BITS_PER_UNIT);
12215
12216 if (vr_offset)
12217 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12218 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12219 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12220
12221 /* Emit code to initialize GROFF, the offset from GRTOP of the
12222 next GPR argument. */
12223 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12224 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12225 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12226
12227 /* Likewise emit code to initialize VROFF, the offset from FTOP
12228 of the next VR argument. */
12229 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12230 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12231 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12232 }
12233
12234 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12235
12236 static tree
12237 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12238 gimple_seq *post_p ATTRIBUTE_UNUSED)
12239 {
12240 tree addr;
12241 bool indirect_p;
12242 bool is_ha; /* is HFA or HVA. */
12243 bool dw_align; /* double-word align. */
12244 machine_mode ag_mode = VOIDmode;
12245 int nregs;
12246 machine_mode mode;
12247
12248 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12249 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12250 HOST_WIDE_INT size, rsize, adjust, align;
12251 tree t, u, cond1, cond2;
12252
12253 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12254 if (indirect_p)
12255 type = build_pointer_type (type);
12256
12257 mode = TYPE_MODE (type);
12258
12259 f_stack = TYPE_FIELDS (va_list_type_node);
12260 f_grtop = DECL_CHAIN (f_stack);
12261 f_vrtop = DECL_CHAIN (f_grtop);
12262 f_groff = DECL_CHAIN (f_vrtop);
12263 f_vroff = DECL_CHAIN (f_groff);
12264
12265 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12266 f_stack, NULL_TREE);
12267 size = int_size_in_bytes (type);
12268 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12269
12270 dw_align = false;
12271 adjust = 0;
12272 if (aarch64_vfp_is_call_or_return_candidate (mode,
12273 type,
12274 &ag_mode,
12275 &nregs,
12276 &is_ha))
12277 {
12278 /* No frontends can create types with variable-sized modes, so we
12279 shouldn't be asked to pass or return them. */
12280 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12281
12282 /* TYPE passed in fp/simd registers. */
12283 if (!TARGET_FLOAT)
12284 aarch64_err_no_fpadvsimd (mode);
12285
12286 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12287 unshare_expr (valist), f_vrtop, NULL_TREE);
12288 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12289 unshare_expr (valist), f_vroff, NULL_TREE);
12290
12291 rsize = nregs * UNITS_PER_VREG;
12292
12293 if (is_ha)
12294 {
12295 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12296 adjust = UNITS_PER_VREG - ag_size;
12297 }
12298 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12299 && size < UNITS_PER_VREG)
12300 {
12301 adjust = UNITS_PER_VREG - size;
12302 }
12303 }
12304 else
12305 {
12306 /* TYPE passed in general registers. */
12307 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12308 unshare_expr (valist), f_grtop, NULL_TREE);
12309 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12310 unshare_expr (valist), f_groff, NULL_TREE);
12311 rsize = ROUND_UP (size, UNITS_PER_WORD);
12312 nregs = rsize / UNITS_PER_WORD;
12313
12314 if (align > 8)
12315 dw_align = true;
12316
12317 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12318 && size < UNITS_PER_WORD)
12319 {
12320 adjust = UNITS_PER_WORD - size;
12321 }
12322 }
12323
12324 /* Get a local temporary for the field value. */
12325 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12326
12327 /* Emit code to branch if off >= 0. */
12328 t = build2 (GE_EXPR, boolean_type_node, off,
12329 build_int_cst (TREE_TYPE (off), 0));
12330 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12331
12332 if (dw_align)
12333 {
12334 /* Emit: offs = (offs + 15) & -16. */
12335 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12336 build_int_cst (TREE_TYPE (off), 15));
12337 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12338 build_int_cst (TREE_TYPE (off), -16));
12339 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12340 }
12341 else
12342 roundup = NULL;
12343
12344 /* Update ap.__[g|v]r_offs */
12345 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12346 build_int_cst (TREE_TYPE (off), rsize));
12347 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12348
12349 /* String up. */
12350 if (roundup)
12351 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12352
12353 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12354 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12355 build_int_cst (TREE_TYPE (f_off), 0));
12356 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12357
12358 /* String up: make sure the assignment happens before the use. */
12359 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12360 COND_EXPR_ELSE (cond1) = t;
12361
12362 /* Prepare the trees handling the argument that is passed on the stack;
12363 the top-level node will be stored in ON_STACK. */
12364 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12365 if (align > 8)
12366 {
12367 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12368 t = fold_build_pointer_plus_hwi (arg, 15);
12369 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12370 build_int_cst (TREE_TYPE (t), -16));
12371 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12372 }
12373 else
12374 roundup = NULL;
12375 /* Advance ap.__stack */
12376 t = fold_build_pointer_plus_hwi (arg, size + 7);
12377 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12378 build_int_cst (TREE_TYPE (t), -8));
12379 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12380 /* String up roundup and advance. */
12381 if (roundup)
12382 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12383 /* String up with arg */
12384 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12385 /* Big-endianness related address adjustment. */
12386 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12387 && size < UNITS_PER_WORD)
12388 {
12389 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12390 size_int (UNITS_PER_WORD - size));
12391 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12392 }
12393
12394 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12395 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12396
12397 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12398 t = off;
12399 if (adjust)
12400 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12401 build_int_cst (TREE_TYPE (off), adjust));
12402
12403 t = fold_convert (sizetype, t);
12404 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12405
12406 if (is_ha)
12407 {
12408 /* type ha; // treat as "struct {ftype field[n];}"
12409 ... [computing offs]
12410 for (i = 0; i <nregs; ++i, offs += 16)
12411 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12412 return ha; */
12413 int i;
12414 tree tmp_ha, field_t, field_ptr_t;
12415
12416 /* Declare a local variable. */
12417 tmp_ha = create_tmp_var_raw (type, "ha");
12418 gimple_add_tmp_var (tmp_ha);
12419
12420 /* Establish the base type. */
12421 switch (ag_mode)
12422 {
12423 case E_SFmode:
12424 field_t = float_type_node;
12425 field_ptr_t = float_ptr_type_node;
12426 break;
12427 case E_DFmode:
12428 field_t = double_type_node;
12429 field_ptr_t = double_ptr_type_node;
12430 break;
12431 case E_TFmode:
12432 field_t = long_double_type_node;
12433 field_ptr_t = long_double_ptr_type_node;
12434 break;
12435 case E_HFmode:
12436 field_t = aarch64_fp16_type_node;
12437 field_ptr_t = aarch64_fp16_ptr_type_node;
12438 break;
12439 case E_V2SImode:
12440 case E_V4SImode:
12441 {
12442 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12443 field_t = build_vector_type_for_mode (innertype, ag_mode);
12444 field_ptr_t = build_pointer_type (field_t);
12445 }
12446 break;
12447 default:
12448 gcc_assert (0);
12449 }
12450
12451 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
12452 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12453 addr = t;
12454 t = fold_convert (field_ptr_t, addr);
12455 t = build2 (MODIFY_EXPR, field_t,
12456 build1 (INDIRECT_REF, field_t, tmp_ha),
12457 build1 (INDIRECT_REF, field_t, t));
12458
12459 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12460 for (i = 1; i < nregs; ++i)
12461 {
12462 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12463 u = fold_convert (field_ptr_t, addr);
12464 u = build2 (MODIFY_EXPR, field_t,
12465 build2 (MEM_REF, field_t, tmp_ha,
12466 build_int_cst (field_ptr_t,
12467 (i *
12468 int_size_in_bytes (field_t)))),
12469 build1 (INDIRECT_REF, field_t, u));
12470 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12471 }
12472
12473 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12474 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12475 }
12476
12477 COND_EXPR_ELSE (cond2) = t;
12478 addr = fold_convert (build_pointer_type (type), cond1);
12479 addr = build_va_arg_indirect_ref (addr);
12480
12481 if (indirect_p)
12482 addr = build_va_arg_indirect_ref (addr);
12483
12484 return addr;
12485 }
12486
12487 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12488
12489 static void
12490 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12491 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12492 int no_rtl)
12493 {
12494 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12495 CUMULATIVE_ARGS local_cum;
12496 int gr_saved = cfun->va_list_gpr_size;
12497 int vr_saved = cfun->va_list_fpr_size;
12498
12499 /* The caller has advanced CUM up to, but not beyond, the last named
12500 argument. Advance a local copy of CUM past the last "real" named
12501 argument, to find out how many registers are left over. */
12502 local_cum = *cum;
12503 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12504
12505 /* Find out how many registers we need to save.
12506 Honor the tree-stdarg analysis results. */
12507 if (cfun->va_list_gpr_size)
12508 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12509 cfun->va_list_gpr_size / UNITS_PER_WORD);
12510 if (cfun->va_list_fpr_size)
12511 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12512 cfun->va_list_fpr_size / UNITS_PER_VREG);
12513
12514 if (!TARGET_FLOAT)
12515 {
12516 gcc_assert (local_cum.aapcs_nvrn == 0);
12517 vr_saved = 0;
12518 }
12519
12520 if (!no_rtl)
12521 {
12522 if (gr_saved > 0)
12523 {
12524 rtx ptr, mem;
12525
12526 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12527 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12528 - gr_saved * UNITS_PER_WORD);
12529 mem = gen_frame_mem (BLKmode, ptr);
12530 set_mem_alias_set (mem, get_varargs_alias_set ());
12531
12532 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12533 mem, gr_saved);
12534 }
12535 if (vr_saved > 0)
12536 {
12537 /* We can't use move_block_from_reg, because it will use
12538 the wrong mode, storing D regs only. */
12539 machine_mode mode = TImode;
12540 int off, i, vr_start;
12541
12542 /* Set OFF to the offset from virtual_incoming_args_rtx of
12543 the first vector register. The VR save area lies below
12544 the GR one, and is aligned to 16 bytes. */
12545 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12546 STACK_BOUNDARY / BITS_PER_UNIT);
12547 off -= vr_saved * UNITS_PER_VREG;
12548
12549 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12550 for (i = 0; i < vr_saved; ++i)
12551 {
12552 rtx ptr, mem;
12553
12554 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12555 mem = gen_frame_mem (mode, ptr);
12556 set_mem_alias_set (mem, get_varargs_alias_set ());
12557 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12558 off += UNITS_PER_VREG;
12559 }
12560 }
12561 }
12562
12563 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12564 any complication of having crtl->args.pretend_args_size changed. */
12565 cfun->machine->frame.saved_varargs_size
12566 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12567 STACK_BOUNDARY / BITS_PER_UNIT)
12568 + vr_saved * UNITS_PER_VREG);
12569 }
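/* Worked example (illustrative only): for

     void f (int a, ...);

   one general register is consumed by the named argument, so with the
   default va_list sizes gr_saved is 8 - 1 = 7 and vr_saved is 8, giving
   saved_varargs_size = ROUND_UP (7 * 8, 16) + 8 * 16 = 64 + 128 = 192
   bytes below virtual_incoming_args_rtx.  */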
12570
12571 static void
12572 aarch64_conditional_register_usage (void)
12573 {
12574 int i;
12575 if (!TARGET_FLOAT)
12576 {
12577 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12578 {
12579 fixed_regs[i] = 1;
12580 call_used_regs[i] = 1;
12581 }
12582 }
12583 if (!TARGET_SVE)
12584 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12585 {
12586 fixed_regs[i] = 1;
12587 call_used_regs[i] = 1;
12588 }
12589 }
12590
12591 /* Walk down the type tree of TYPE counting consecutive base elements.
12592 If *MODEP is VOIDmode, then set it to the first valid floating point
12593 type. If a non-floating point type is found, or if a floating point
12594 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12595 otherwise return the count in the sub-tree. */
12596 static int
12597 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12598 {
12599 machine_mode mode;
12600 HOST_WIDE_INT size;
12601
12602 switch (TREE_CODE (type))
12603 {
12604 case REAL_TYPE:
12605 mode = TYPE_MODE (type);
12606 if (mode != DFmode && mode != SFmode
12607 && mode != TFmode && mode != HFmode)
12608 return -1;
12609
12610 if (*modep == VOIDmode)
12611 *modep = mode;
12612
12613 if (*modep == mode)
12614 return 1;
12615
12616 break;
12617
12618 case COMPLEX_TYPE:
12619 mode = TYPE_MODE (TREE_TYPE (type));
12620 if (mode != DFmode && mode != SFmode
12621 && mode != TFmode && mode != HFmode)
12622 return -1;
12623
12624 if (*modep == VOIDmode)
12625 *modep = mode;
12626
12627 if (*modep == mode)
12628 return 2;
12629
12630 break;
12631
12632 case VECTOR_TYPE:
12633 /* Use V2SImode and V4SImode as representatives of all 64-bit
12634 and 128-bit vector types. */
12635 size = int_size_in_bytes (type);
12636 switch (size)
12637 {
12638 case 8:
12639 mode = V2SImode;
12640 break;
12641 case 16:
12642 mode = V4SImode;
12643 break;
12644 default:
12645 return -1;
12646 }
12647
12648 if (*modep == VOIDmode)
12649 *modep = mode;
12650
12651 /* Vector modes are considered to be opaque: two vectors are
12652 equivalent for the purposes of being homogeneous aggregates
12653 if they are the same size. */
12654 if (*modep == mode)
12655 return 1;
12656
12657 break;
12658
12659 case ARRAY_TYPE:
12660 {
12661 int count;
12662 tree index = TYPE_DOMAIN (type);
12663
12664 /* Can't handle incomplete types nor sizes that are not
12665 fixed. */
12666 if (!COMPLETE_TYPE_P (type)
12667 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12668 return -1;
12669
12670 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12671 if (count == -1
12672 || !index
12673 || !TYPE_MAX_VALUE (index)
12674 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12675 || !TYPE_MIN_VALUE (index)
12676 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12677 || count < 0)
12678 return -1;
12679
12680 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12681 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12682
12683 /* There must be no padding. */
12684 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12685 count * GET_MODE_BITSIZE (*modep)))
12686 return -1;
12687
12688 return count;
12689 }
12690
12691 case RECORD_TYPE:
12692 {
12693 int count = 0;
12694 int sub_count;
12695 tree field;
12696
12697 /* Can't handle incomplete types nor sizes that are not
12698 fixed. */
12699 if (!COMPLETE_TYPE_P (type)
12700 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12701 return -1;
12702
12703 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12704 {
12705 if (TREE_CODE (field) != FIELD_DECL)
12706 continue;
12707
12708 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12709 if (sub_count < 0)
12710 return -1;
12711 count += sub_count;
12712 }
12713
12714 /* There must be no padding. */
12715 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12716 count * GET_MODE_BITSIZE (*modep)))
12717 return -1;
12718
12719 return count;
12720 }
12721
12722 case UNION_TYPE:
12723 case QUAL_UNION_TYPE:
12724 {
12725 /* These aren't very interesting except in a degenerate case. */
12726 int count = 0;
12727 int sub_count;
12728 tree field;
12729
12730 /* Can't handle incomplete types nor sizes that are not
12731 fixed. */
12732 if (!COMPLETE_TYPE_P (type)
12733 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12734 return -1;
12735
12736 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12737 {
12738 if (TREE_CODE (field) != FIELD_DECL)
12739 continue;
12740
12741 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12742 if (sub_count < 0)
12743 return -1;
12744 count = count > sub_count ? count : sub_count;
12745 }
12746
12747 /* There must be no padding. */
12748 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12749 count * GET_MODE_BITSIZE (*modep)))
12750 return -1;
12751
12752 return count;
12753 }
12754
12755 default:
12756 break;
12757 }
12758
12759 return -1;
12760 }
12761
12762 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12763 type as described in AAPCS64 \S 4.1.2.
12764
12765 See the comment above aarch64_composite_type_p for the notes on MODE. */
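/* For example (illustrative, not exhaustive): the 8-byte and 16-byte
   Advanced SIMD types such as int32x2_t and float32x4_t are short vectors,
   whereas wider generic GCC vectors and SVE scalable vectors are not,
   since their size is not known to be exactly 8 or 16 bytes.  */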
12766
12767 static bool
12768 aarch64_short_vector_p (const_tree type,
12769 machine_mode mode)
12770 {
12771 poly_int64 size = -1;
12772
12773 if (type && TREE_CODE (type) == VECTOR_TYPE)
12774 size = int_size_in_bytes (type);
12775 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12776 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12777 size = GET_MODE_SIZE (mode);
12778
12779 return known_eq (size, 8) || known_eq (size, 16);
12780 }
12781
12782 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12783 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12784 array types. The C99 floating-point complex types are also considered
12785 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12786 types, which are GCC extensions and out of the scope of AAPCS64, are
12787 treated as composite types here as well.
12788
12789 Note that MODE itself is not sufficient in determining whether a type
12790 is such a composite type or not. This is because
12791 stor-layout.c:compute_record_mode may have already changed the MODE
12792 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12793 structure with only one field may have its MODE set to the mode of the
12794 field. Also an integer mode whose size matches the size of the
12795 RECORD_TYPE type may be used to substitute the original mode
12796 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12797 solely relied on. */
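/* Illustrative examples only: a struct with a single int member may be
   given SImode by stor-layout, yet it is still composite; likewise a
   _Complex double (MODE_COMPLEX_FLOAT) and a _Complex int are composite,
   while a bare double or an int32x4_t short vector is not.  */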
12798
12799 static bool
12800 aarch64_composite_type_p (const_tree type,
12801 machine_mode mode)
12802 {
12803 if (aarch64_short_vector_p (type, mode))
12804 return false;
12805
12806 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12807 return true;
12808
12809 if (mode == BLKmode
12810 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12811 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12812 return true;
12813
12814 return false;
12815 }
12816
12817 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12818 shall be passed or returned in simd/fp register(s) (providing these
12819 parameter passing registers are available).
12820
12821 Upon successful return, *COUNT returns the number of needed registers,
12822 *BASE_MODE returns the mode of the individual register and when IS_HA
12823 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12824 floating-point aggregate or a homogeneous short-vector aggregate. */
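/* A few illustrative outcomes (assuming the usual type layout and
   HA_MAX_NUM_FLDS == 4):

     double                    -> true, *count 1, *base_mode DFmode
     _Complex float            -> true, *count 2, *base_mode SFmode, HA
     struct { double d[3]; }   -> true, *count 3, *base_mode DFmode, HA
     struct { double d[5]; }   -> false (too many members for an HFA)  */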
12825
12826 static bool
12827 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12828 const_tree type,
12829 machine_mode *base_mode,
12830 int *count,
12831 bool *is_ha)
12832 {
12833 machine_mode new_mode = VOIDmode;
12834 bool composite_p = aarch64_composite_type_p (type, mode);
12835
12836 if (is_ha != NULL) *is_ha = false;
12837
12838 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12839 || aarch64_short_vector_p (type, mode))
12840 {
12841 *count = 1;
12842 new_mode = mode;
12843 }
12844 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12845 {
12846 if (is_ha != NULL) *is_ha = true;
12847 *count = 2;
12848 new_mode = GET_MODE_INNER (mode);
12849 }
12850 else if (type && composite_p)
12851 {
12852 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12853
12854 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12855 {
12856 if (is_ha != NULL) *is_ha = true;
12857 *count = ag_count;
12858 }
12859 else
12860 return false;
12861 }
12862 else
12863 return false;
12864
12865 *base_mode = new_mode;
12866 return true;
12867 }
12868
12869 /* Implement TARGET_STRUCT_VALUE_RTX. */
12870
12871 static rtx
12872 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12873 int incoming ATTRIBUTE_UNUSED)
12874 {
12875 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12876 }
12877
12878 /* Implements target hook vector_mode_supported_p. */
12879 static bool
12880 aarch64_vector_mode_supported_p (machine_mode mode)
12881 {
12882 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12883 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12884 }
12885
12886 /* Return appropriate SIMD container
12887 for MODE within a vector of WIDTH bits. */
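/* For example, following the switches below: (SFmode, 128) -> V4SFmode,
   (SFmode, 64) -> V2SFmode, and when SVE is enabled
   (SFmode, BITS_PER_SVE_VECTOR) -> VNx4SFmode.  Unsupported combinations
   fall back to word_mode.  */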
12888 static machine_mode
12889 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12890 {
12891 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12892 switch (mode)
12893 {
12894 case E_DFmode:
12895 return VNx2DFmode;
12896 case E_SFmode:
12897 return VNx4SFmode;
12898 case E_HFmode:
12899 return VNx8HFmode;
12900 case E_DImode:
12901 return VNx2DImode;
12902 case E_SImode:
12903 return VNx4SImode;
12904 case E_HImode:
12905 return VNx8HImode;
12906 case E_QImode:
12907 return VNx16QImode;
12908 default:
12909 return word_mode;
12910 }
12911
12912 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12913 if (TARGET_SIMD)
12914 {
12915 if (known_eq (width, 128))
12916 switch (mode)
12917 {
12918 case E_DFmode:
12919 return V2DFmode;
12920 case E_SFmode:
12921 return V4SFmode;
12922 case E_HFmode:
12923 return V8HFmode;
12924 case E_SImode:
12925 return V4SImode;
12926 case E_HImode:
12927 return V8HImode;
12928 case E_QImode:
12929 return V16QImode;
12930 case E_DImode:
12931 return V2DImode;
12932 default:
12933 break;
12934 }
12935 else
12936 switch (mode)
12937 {
12938 case E_SFmode:
12939 return V2SFmode;
12940 case E_HFmode:
12941 return V4HFmode;
12942 case E_SImode:
12943 return V2SImode;
12944 case E_HImode:
12945 return V4HImode;
12946 case E_QImode:
12947 return V8QImode;
12948 default:
12949 break;
12950 }
12951 }
12952 return word_mode;
12953 }
12954
12955 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12956 static machine_mode
12957 aarch64_preferred_simd_mode (scalar_mode mode)
12958 {
12959 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12960 return aarch64_simd_container_mode (mode, bits);
12961 }
12962
12963 /* Return a list of possible vector sizes for the vectorizer
12964 to iterate over. */
12965 static void
12966 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12967 {
12968 if (TARGET_SVE)
12969 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12970 sizes->safe_push (16);
12971 sizes->safe_push (8);
12972 }
12973
12974 /* Implement TARGET_MANGLE_TYPE. */
12975
12976 static const char *
12977 aarch64_mangle_type (const_tree type)
12978 {
12979 /* The AArch64 ABI documents say that "__va_list" has to be
12980 mangled as if it is in the "std" namespace. */
12981 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12982 return "St9__va_list";
12983
12984 /* Half-precision float. */
12985 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12986 return "Dh";
12987
12988 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12989 builtin types. */
12990 if (TYPE_NAME (type) != NULL)
12991 return aarch64_mangle_builtin_type (type);
12992
12993 /* Use the default mangling. */
12994 return NULL;
12995 }
12996
12997 /* Find the first rtx_insn before insn that will generate an assembly
12998 instruction. */
12999
13000 static rtx_insn *
13001 aarch64_prev_real_insn (rtx_insn *insn)
13002 {
13003 if (!insn)
13004 return NULL;
13005
13006 do
13007 {
13008 insn = prev_real_insn (insn);
13009 }
13010 while (insn && recog_memoized (insn) < 0);
13011
13012 return insn;
13013 }
13014
13015 static bool
13016 is_madd_op (enum attr_type t1)
13017 {
13018 unsigned int i;
13019 /* A number of these may be AArch32 only. */
13020 enum attr_type mlatypes[] = {
13021 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13022 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13023 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13024 };
13025
13026 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13027 {
13028 if (t1 == mlatypes[i])
13029 return true;
13030 }
13031
13032 return false;
13033 }
13034
13035 /* Check if there is a register dependency between a load and the insn
13036 for which we hold recog_data. */
13037
13038 static bool
13039 dep_between_memop_and_curr (rtx memop)
13040 {
13041 rtx load_reg;
13042 int opno;
13043
13044 gcc_assert (GET_CODE (memop) == SET);
13045
13046 if (!REG_P (SET_DEST (memop)))
13047 return false;
13048
13049 load_reg = SET_DEST (memop);
13050 for (opno = 1; opno < recog_data.n_operands; opno++)
13051 {
13052 rtx operand = recog_data.operand[opno];
13053 if (REG_P (operand)
13054 && reg_overlap_mentioned_p (load_reg, operand))
13055 return true;
13056
13057 }
13058 return false;
13059 }
13060
13061
13062 /* When working around the Cortex-A53 erratum 835769,
13063 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13064 instruction and has a preceding memory instruction such that a NOP
13065 should be inserted between them. */
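/* Roughly speaking, for a sequence that would otherwise be emitted as

       ldr  x1, [x2]
       madd x0, x3, x4, x5

   the workaround makes aarch64_final_prescan_insn (below) emit a NOP
   between the memory operation and the multiply-accumulate.  */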
13066
13067 bool
13068 aarch64_madd_needs_nop (rtx_insn* insn)
13069 {
13070 enum attr_type attr_type;
13071 rtx_insn *prev;
13072 rtx body;
13073
13074 if (!TARGET_FIX_ERR_A53_835769)
13075 return false;
13076
13077 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13078 return false;
13079
13080 attr_type = get_attr_type (insn);
13081 if (!is_madd_op (attr_type))
13082 return false;
13083
13084 prev = aarch64_prev_real_insn (insn);
13085 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13086 Restore recog state to INSN to avoid state corruption. */
13087 extract_constrain_insn_cached (insn);
13088
13089 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13090 return false;
13091
13092 body = single_set (prev);
13093
13094 /* If the previous insn is a memory op and there is no dependency between
13095 it and the DImode madd, emit a NOP between them. If body is NULL then we
13096 have a complex memory operation, probably a load/store pair.
13097 Be conservative for now and emit a NOP. */
13098 if (GET_MODE (recog_data.operand[0]) == DImode
13099 && (!body || !dep_between_memop_and_curr (body)))
13100 return true;
13101
13102 return false;
13103
13104 }
13105
13106
13107 /* Implement FINAL_PRESCAN_INSN. */
13108
13109 void
13110 aarch64_final_prescan_insn (rtx_insn *insn)
13111 {
13112 if (aarch64_madd_needs_nop (insn))
13113 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13114 }
13115
13116
13117 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13118 instruction. */
13119
13120 bool
13121 aarch64_sve_index_immediate_p (rtx base_or_step)
13122 {
13123 return (CONST_INT_P (base_or_step)
13124 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13125 }
13126
13127 /* Return true if X is a valid immediate for the SVE ADD and SUB
13128 instructions. Negate X first if NEGATE_P is true. */
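/* Put differently (for illustration): after the optional negation and
   masking to the element width, the value must be either in the range
   [0, 255] or a multiple of 256 in the range [0, 65280], matching the
   "#imm8" and "#imm8, LSL #8" forms of ADD and SUB.  */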
13129
13130 bool
13131 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13132 {
13133 rtx elt;
13134
13135 if (!const_vec_duplicate_p (x, &elt)
13136 || !CONST_INT_P (elt))
13137 return false;
13138
13139 HOST_WIDE_INT val = INTVAL (elt);
13140 if (negate_p)
13141 val = -val;
13142 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13143
13144 if (val & 0xff)
13145 return IN_RANGE (val, 0, 0xff);
13146 return IN_RANGE (val, 0, 0xff00);
13147 }
13148
13149 /* Return true if X is a valid immediate operand for an SVE logical
13150 instruction such as AND. */
13151
13152 bool
13153 aarch64_sve_bitmask_immediate_p (rtx x)
13154 {
13155 rtx elt;
13156
13157 return (const_vec_duplicate_p (x, &elt)
13158 && CONST_INT_P (elt)
13159 && aarch64_bitmask_imm (INTVAL (elt),
13160 GET_MODE_INNER (GET_MODE (x))));
13161 }
13162
13163 /* Return true if X is a valid immediate for the SVE DUP and CPY
13164 instructions. */
13165
13166 bool
13167 aarch64_sve_dup_immediate_p (rtx x)
13168 {
13169 rtx elt;
13170
13171 if (!const_vec_duplicate_p (x, &elt)
13172 || !CONST_INT_P (elt))
13173 return false;
13174
13175 HOST_WIDE_INT val = INTVAL (elt);
13176 if (val & 0xff)
13177 return IN_RANGE (val, -0x80, 0x7f);
13178 return IN_RANGE (val, -0x8000, 0x7f00);
13179 }
13180
13181 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13182 SIGNED_P says whether the operand is signed rather than unsigned. */
13183
13184 bool
13185 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13186 {
13187 rtx elt;
13188
13189 return (const_vec_duplicate_p (x, &elt)
13190 && CONST_INT_P (elt)
13191 && (signed_p
13192 ? IN_RANGE (INTVAL (elt), -16, 15)
13193 : IN_RANGE (INTVAL (elt), 0, 127)));
13194 }
13195
13196 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13197 instruction. Negate X first if NEGATE_P is true. */
13198
13199 bool
13200 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13201 {
13202 rtx elt;
13203 REAL_VALUE_TYPE r;
13204
13205 if (!const_vec_duplicate_p (x, &elt)
13206 || GET_CODE (elt) != CONST_DOUBLE)
13207 return false;
13208
13209 r = *CONST_DOUBLE_REAL_VALUE (elt);
13210
13211 if (negate_p)
13212 r = real_value_negate (&r);
13213
13214 if (real_equal (&r, &dconst1))
13215 return true;
13216 if (real_equal (&r, &dconsthalf))
13217 return true;
13218 return false;
13219 }
13220
13221 /* Return true if X is a valid immediate operand for an SVE FMUL
13222 instruction. */
13223
13224 bool
13225 aarch64_sve_float_mul_immediate_p (rtx x)
13226 {
13227 rtx elt;
13228
13229 /* GCC will never generate a multiply with an immediate of 2, so there is no
13230 point testing for it (even though it is a valid constant). */
13231 return (const_vec_duplicate_p (x, &elt)
13232 && GET_CODE (elt) == CONST_DOUBLE
13233 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13234 }
13235
13236 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13237 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13238 is nonnull, use it to describe valid immediates. */
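/* Illustrative examples of replicated 32-bit patterns accepted below
   (shown here for the MOV form; the MVN form works on the complemented
   value):

     0x000000ab  -> #0xab             (4-byte immediate, LSL #0)
     0x0000ab00  -> #0xab, LSL #8     (4-byte immediate, LSL #8)
     0x00ab00ab  -> #0xab             (2-byte immediate, LSL #0)
     0x0000abff  -> #0xab, MSL #8     (AARCH64_CHECK_MOV only)  */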
13239 static bool
13240 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13241 simd_immediate_info *info,
13242 enum simd_immediate_check which,
13243 simd_immediate_info::insn_type insn)
13244 {
13245 /* Try a 4-byte immediate with LSL. */
13246 for (unsigned int shift = 0; shift < 32; shift += 8)
13247 if ((val32 & (0xff << shift)) == val32)
13248 {
13249 if (info)
13250 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13251 simd_immediate_info::LSL, shift);
13252 return true;
13253 }
13254
13255 /* Try a 2-byte immediate with LSL. */
13256 unsigned int imm16 = val32 & 0xffff;
13257 if (imm16 == (val32 >> 16))
13258 for (unsigned int shift = 0; shift < 16; shift += 8)
13259 if ((imm16 & (0xff << shift)) == imm16)
13260 {
13261 if (info)
13262 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13263 simd_immediate_info::LSL, shift);
13264 return true;
13265 }
13266
13267 /* Try a 4-byte immediate with MSL, except for cases that MVN
13268 can handle. */
13269 if (which == AARCH64_CHECK_MOV)
13270 for (unsigned int shift = 8; shift < 24; shift += 8)
13271 {
13272 unsigned int low = (1 << shift) - 1;
13273 if (((val32 & (0xff << shift)) | low) == val32)
13274 {
13275 if (info)
13276 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13277 simd_immediate_info::MSL, shift);
13278 return true;
13279 }
13280 }
13281
13282 return false;
13283 }
13284
13285 /* Return true if replicating VAL64 is a valid immediate for the
13286 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13287 use it to describe valid immediates. */
13288 static bool
13289 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13290 simd_immediate_info *info,
13291 enum simd_immediate_check which)
13292 {
13293 unsigned int val32 = val64 & 0xffffffff;
13294 unsigned int val16 = val64 & 0xffff;
13295 unsigned int val8 = val64 & 0xff;
13296
13297 if (val32 == (val64 >> 32))
13298 {
13299 if ((which & AARCH64_CHECK_ORR) != 0
13300 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13301 simd_immediate_info::MOV))
13302 return true;
13303
13304 if ((which & AARCH64_CHECK_BIC) != 0
13305 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13306 simd_immediate_info::MVN))
13307 return true;
13308
13309 /* Try using a replicated byte. */
13310 if (which == AARCH64_CHECK_MOV
13311 && val16 == (val32 >> 16)
13312 && val8 == (val16 >> 8))
13313 {
13314 if (info)
13315 *info = simd_immediate_info (QImode, val8);
13316 return true;
13317 }
13318 }
13319
13320 /* Try using a bit-to-bytemask. */
13321 if (which == AARCH64_CHECK_MOV)
13322 {
13323 unsigned int i;
13324 for (i = 0; i < 64; i += 8)
13325 {
13326 unsigned char byte = (val64 >> i) & 0xff;
13327 if (byte != 0 && byte != 0xff)
13328 break;
13329 }
13330 if (i == 64)
13331 {
13332 if (info)
13333 *info = simd_immediate_info (DImode, val64);
13334 return true;
13335 }
13336 }
13337 return false;
13338 }
13339
13340 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13341 instruction. If INFO is nonnull, use it to describe valid immediates. */
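/* For illustration: a VAL64 of 0x2a2a2a2a2a2a2a2a collapses to the QImode
   value 0x2a and is matched as a DUP with no shift; 0x1200120012001200
   collapses to the HImode value 0x1200 and is matched as a DUP with
   LSL #8; other repeating patterns may still be valid DUPM (bitmask)
   immediates.  */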
13342
13343 static bool
13344 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13345 simd_immediate_info *info)
13346 {
13347 scalar_int_mode mode = DImode;
13348 unsigned int val32 = val64 & 0xffffffff;
13349 if (val32 == (val64 >> 32))
13350 {
13351 mode = SImode;
13352 unsigned int val16 = val32 & 0xffff;
13353 if (val16 == (val32 >> 16))
13354 {
13355 mode = HImode;
13356 unsigned int val8 = val16 & 0xff;
13357 if (val8 == (val16 >> 8))
13358 mode = QImode;
13359 }
13360 }
13361 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13362 if (IN_RANGE (val, -0x80, 0x7f))
13363 {
13364 /* DUP with no shift. */
13365 if (info)
13366 *info = simd_immediate_info (mode, val);
13367 return true;
13368 }
13369 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13370 {
13371 /* DUP with LSL #8. */
13372 if (info)
13373 *info = simd_immediate_info (mode, val);
13374 return true;
13375 }
13376 if (aarch64_bitmask_imm (val64, mode))
13377 {
13378 /* DUPM. */
13379 if (info)
13380 *info = simd_immediate_info (mode, val);
13381 return true;
13382 }
13383 return false;
13384 }
13385
13386 /* Return true if OP is a valid SIMD immediate for the operation
13387 described by WHICH. If INFO is nonnull, use it to describe valid
13388 immediates. */
13389 bool
13390 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13391 enum simd_immediate_check which)
13392 {
13393 machine_mode mode = GET_MODE (op);
13394 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13395 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13396 return false;
13397
13398 scalar_mode elt_mode = GET_MODE_INNER (mode);
13399 rtx base, step;
13400 unsigned int n_elts;
13401 if (GET_CODE (op) == CONST_VECTOR
13402 && CONST_VECTOR_DUPLICATE_P (op))
13403 n_elts = CONST_VECTOR_NPATTERNS (op);
13404 else if ((vec_flags & VEC_SVE_DATA)
13405 && const_vec_series_p (op, &base, &step))
13406 {
13407 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13408 if (!aarch64_sve_index_immediate_p (base)
13409 || !aarch64_sve_index_immediate_p (step))
13410 return false;
13411
13412 if (info)
13413 *info = simd_immediate_info (elt_mode, base, step);
13414 return true;
13415 }
13416 else if (GET_CODE (op) == CONST_VECTOR
13417 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13418 /* N_ELTS set above. */;
13419 else
13420 return false;
13421
13422 /* Handle PFALSE and PTRUE. */
13423 if (vec_flags & VEC_SVE_PRED)
13424 return (op == CONST0_RTX (mode)
13425 || op == CONSTM1_RTX (mode));
13426
13427 scalar_float_mode elt_float_mode;
13428 if (n_elts == 1
13429 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13430 {
13431 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13432 if (aarch64_float_const_zero_rtx_p (elt)
13433 || aarch64_float_const_representable_p (elt))
13434 {
13435 if (info)
13436 *info = simd_immediate_info (elt_float_mode, elt);
13437 return true;
13438 }
13439 }
13440
13441 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13442 if (elt_size > 8)
13443 return false;
13444
13445 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13446
13447 /* Expand the vector constant out into a byte vector, with the least
13448 significant byte of the register first. */
13449 auto_vec<unsigned char, 16> bytes;
13450 bytes.reserve (n_elts * elt_size);
13451 for (unsigned int i = 0; i < n_elts; i++)
13452 {
13453 /* The vector is provided in gcc endian-neutral fashion.
13454 For aarch64_be Advanced SIMD, it must be laid out in the vector
13455 register in reverse order. */
13456 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13457 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13458
13459 if (elt_mode != elt_int_mode)
13460 elt = gen_lowpart (elt_int_mode, elt);
13461
13462 if (!CONST_INT_P (elt))
13463 return false;
13464
13465 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13466 for (unsigned int byte = 0; byte < elt_size; byte++)
13467 {
13468 bytes.quick_push (elt_val & 0xff);
13469 elt_val >>= BITS_PER_UNIT;
13470 }
13471 }
13472
13473 /* The immediate must repeat every eight bytes. */
13474 unsigned int nbytes = bytes.length ();
13475 for (unsigned i = 8; i < nbytes; ++i)
13476 if (bytes[i] != bytes[i - 8])
13477 return false;
13478
13479 /* Get the repeating 8-byte value as an integer. No endian correction
13480 is needed here because bytes is already in lsb-first order. */
13481 unsigned HOST_WIDE_INT val64 = 0;
13482 for (unsigned int i = 0; i < 8; i++)
13483 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13484 << (i * BITS_PER_UNIT));
13485
13486 if (vec_flags & VEC_SVE_DATA)
13487 return aarch64_sve_valid_immediate (val64, info);
13488 else
13489 return aarch64_advsimd_valid_immediate (val64, info, which);
13490 }
13491
13492 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13493 has a step in the range of INDEX. Return the index expression if so,
13494 otherwise return null. */
13495 rtx
13496 aarch64_check_zero_based_sve_index_immediate (rtx x)
13497 {
13498 rtx base, step;
13499 if (const_vec_series_p (x, &base, &step)
13500 && base == const0_rtx
13501 && aarch64_sve_index_immediate_p (step))
13502 return step;
13503 return NULL_RTX;
13504 }
13505
13506 /* Check whether immediate shift constants are within range. */
13507 bool
13508 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13509 {
13510 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13511 if (left)
13512 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13513 else
13514 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13515 }
13516
13517 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13518 operation of width WIDTH at bit position POS. */
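/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */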
13519
13520 rtx
13521 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13522 {
13523 gcc_assert (CONST_INT_P (width));
13524 gcc_assert (CONST_INT_P (pos));
13525
13526 unsigned HOST_WIDE_INT mask
13527 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13528 return GEN_INT (mask << UINTVAL (pos));
13529 }
13530
13531 bool
13532 aarch64_mov_operand_p (rtx x, machine_mode mode)
13533 {
13534 if (GET_CODE (x) == HIGH
13535 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13536 return true;
13537
13538 if (CONST_INT_P (x))
13539 return true;
13540
13541 if (VECTOR_MODE_P (GET_MODE (x)))
13542 return aarch64_simd_valid_immediate (x, NULL);
13543
13544 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13545 return true;
13546
13547 if (aarch64_sve_cnt_immediate_p (x))
13548 return true;
13549
13550 return aarch64_classify_symbolic_expression (x)
13551 == SYMBOL_TINY_ABSOLUTE;
13552 }
13553
13554 /* Return a const_int vector of VAL. */
13555 rtx
13556 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13557 {
13558 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13559 return gen_const_vec_duplicate (mode, c);
13560 }
13561
13562 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13563
13564 bool
13565 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13566 {
13567 machine_mode vmode;
13568
13569 vmode = aarch64_simd_container_mode (mode, 64);
13570 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13571 return aarch64_simd_valid_immediate (op_v, NULL);
13572 }
13573
13574 /* Construct and return a PARALLEL RTX vector with elements numbering the
13575 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13576 the vector - from the perspective of the architecture. This does not
13577 line up with GCC's perspective on lane numbers, so we end up with
13578 different masks depending on our target endian-ness. The diagram
13579 below may help. We must draw the distinction when building masks
13580 which select one half of the vector. An instruction selecting
13581 architectural low-lanes for a big-endian target must be described using
13582 a mask selecting GCC high-lanes.
13583
13584 Big-Endian Little-Endian
13585
13586 GCC 0 1 2 3 3 2 1 0
13587 | x | x | x | x | | x | x | x | x |
13588 Architecture 3 2 1 0 3 2 1 0
13589
13590 Low Mask: { 2, 3 } { 0, 1 }
13591 High Mask: { 0, 1 } { 2, 3 }
13592
13593 MODE is the mode of the vector and NUNITS is the number of units in it. */
13594
13595 rtx
13596 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13597 {
13598 rtvec v = rtvec_alloc (nunits / 2);
13599 int high_base = nunits / 2;
13600 int low_base = 0;
13601 int base;
13602 rtx t1;
13603 int i;
13604
13605 if (BYTES_BIG_ENDIAN)
13606 base = high ? low_base : high_base;
13607 else
13608 base = high ? high_base : low_base;
13609
13610 for (i = 0; i < nunits / 2; i++)
13611 RTVEC_ELT (v, i) = GEN_INT (base + i);
13612
13613 t1 = gen_rtx_PARALLEL (mode, v);
13614 return t1;
13615 }
13616
13617 /* Check OP for validity as a PARALLEL RTX vector with elements
13618 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13619 from the perspective of the architecture. See the diagram above
13620 aarch64_simd_vect_par_cnst_half for more details. */
13621
13622 bool
13623 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13624 bool high)
13625 {
13626 int nelts;
13627 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13628 return false;
13629
13630 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13631 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13632 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13633 int i = 0;
13634
13635 if (count_op != count_ideal)
13636 return false;
13637
13638 for (i = 0; i < count_ideal; i++)
13639 {
13640 rtx elt_op = XVECEXP (op, 0, i);
13641 rtx elt_ideal = XVECEXP (ideal, 0, i);
13642
13643 if (!CONST_INT_P (elt_op)
13644 || INTVAL (elt_ideal) != INTVAL (elt_op))
13645 return false;
13646 }
13647 return true;
13648 }
13649
13650 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13651 HIGH (exclusive). */
13652 void
13653 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13654 const_tree exp)
13655 {
13656 HOST_WIDE_INT lane;
13657 gcc_assert (CONST_INT_P (operand));
13658 lane = INTVAL (operand);
13659
13660 if (lane < low || lane >= high)
13661 {
13662 if (exp)
13663 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13664 else
13665 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13666 }
13667 }
13668
13669 /* Perform endian correction on lane number N, which indexes a vector
13670 of mode MODE, and return the result as an SImode rtx. */
13671
13672 rtx
13673 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13674 {
13675 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13676 }
13677
13678 /* Return TRUE if OP is a valid vector addressing mode. */
13679
13680 bool
13681 aarch64_simd_mem_operand_p (rtx op)
13682 {
13683 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13684 || REG_P (XEXP (op, 0)));
13685 }
13686
13687 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13688
13689 bool
13690 aarch64_sve_ld1r_operand_p (rtx op)
13691 {
13692 struct aarch64_address_info addr;
13693 scalar_mode mode;
13694
13695 return (MEM_P (op)
13696 && is_a <scalar_mode> (GET_MODE (op), &mode)
13697 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13698 && addr.type == ADDRESS_REG_IMM
13699 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13700 }
13701
13702 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13703 The conditions for STR are the same. */
13704 bool
13705 aarch64_sve_ldr_operand_p (rtx op)
13706 {
13707 struct aarch64_address_info addr;
13708
13709 return (MEM_P (op)
13710 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13711 false, ADDR_QUERY_ANY)
13712 && addr.type == ADDRESS_REG_IMM);
13713 }
13714
13715 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13716 We need to be able to access the individual pieces, so the range
13717 is different from LD[234] and ST[234]. */
13718 bool
13719 aarch64_sve_struct_memory_operand_p (rtx op)
13720 {
13721 if (!MEM_P (op))
13722 return false;
13723
13724 machine_mode mode = GET_MODE (op);
13725 struct aarch64_address_info addr;
13726 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13727 ADDR_QUERY_ANY)
13728 || addr.type != ADDRESS_REG_IMM)
13729 return false;
13730
13731 poly_int64 first = addr.const_offset;
13732 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13733 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13734 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13735 }
13736
13737 /* Emit a register copy from operand to operand, taking care not to
13738 early-clobber source registers in the process.
13739
13740 COUNT is the number of components into which the copy needs to be
13741 decomposed. */
13742 void
13743 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13744 unsigned int count)
13745 {
13746 unsigned int i;
13747 int rdest = REGNO (operands[0]);
13748 int rsrc = REGNO (operands[1]);
13749
13750 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13751 || rdest < rsrc)
13752 for (i = 0; i < count; i++)
13753 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13754 gen_rtx_REG (mode, rsrc + i));
13755 else
13756 for (i = 0; i < count; i++)
13757 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13758 gen_rtx_REG (mode, rsrc + count - i - 1));
13759 }
13760
13761 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13762 one of VSTRUCT modes: OI, CI, or XI. */
13763 int
13764 aarch64_simd_attr_length_rglist (machine_mode mode)
13765 {
13766 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13767 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13768 }
13769
13770 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13771 alignment of a vector to 128 bits. SVE predicates have an alignment of
13772 16 bits. */
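/* For example, int32x4_t keeps its natural 128-bit alignment, while a
   generic 256-bit GCC vector (__attribute__ ((vector_size (32)))) is
   capped to 128 bits by the MIN below.  */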
13773 static HOST_WIDE_INT
13774 aarch64_simd_vector_alignment (const_tree type)
13775 {
13776 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13777 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13778 be set for non-predicate vectors of booleans. Modes are the most
13779 direct way we have of identifying real SVE predicate types. */
13780 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13781 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13782 return MIN (align, 128);
13783 }
13784
13785 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13786 static HOST_WIDE_INT
13787 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13788 {
13789 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13790 {
13791 /* If the length of the vector is fixed, try to align to that length,
13792 otherwise don't try to align at all. */
13793 HOST_WIDE_INT result;
13794 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13795 result = TYPE_ALIGN (TREE_TYPE (type));
13796 return result;
13797 }
13798 return TYPE_ALIGN (type);
13799 }
13800
13801 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13802 static bool
13803 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13804 {
13805 if (is_packed)
13806 return false;
13807
13808 /* For fixed-length vectors, check that the vectorizer will aim for
13809 full-vector alignment. This isn't true for generic GCC vectors
13810 that are wider than the ABI maximum of 128 bits. */
13811 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13812 && (wi::to_widest (TYPE_SIZE (type))
13813 != aarch64_vectorize_preferred_vector_alignment (type)))
13814 return false;
13815
13816 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13817 return true;
13818 }
13819
13820 /* Return true if the vector misalignment factor is supported by the
13821 target. */
13822 static bool
13823 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13824 const_tree type, int misalignment,
13825 bool is_packed)
13826 {
13827 if (TARGET_SIMD && STRICT_ALIGNMENT)
13828 {
13829 /* Return if movmisalign pattern is not supported for this mode. */
13830 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13831 return false;
13832
13833 /* Misalignment factor is unknown at compile time. */
13834 if (misalignment == -1)
13835 return false;
13836 }
13837 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13838 is_packed);
13839 }
13840
13841 /* If VALS is a vector constant that can be loaded into a register
13842 using DUP, generate instructions to do so and return an RTX to
13843 assign to the register. Otherwise return NULL_RTX. */
13844 static rtx
13845 aarch64_simd_dup_constant (rtx vals)
13846 {
13847 machine_mode mode = GET_MODE (vals);
13848 machine_mode inner_mode = GET_MODE_INNER (mode);
13849 rtx x;
13850
13851 if (!const_vec_duplicate_p (vals, &x))
13852 return NULL_RTX;
13853
13854 /* We can load this constant by using DUP and a constant in a
13855 single ARM register. This will be cheaper than a vector
13856 load. */
13857 x = copy_to_mode_reg (inner_mode, x);
13858 return gen_vec_duplicate (mode, x);
13859 }
13860
13861
13862 /* Generate code to load VALS, which is a PARALLEL containing only
13863 constants (for vec_init) or CONST_VECTOR, efficiently into a
13864 register. Returns an RTX to copy into the register, or NULL_RTX
13865 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13866 static rtx
13867 aarch64_simd_make_constant (rtx vals)
13868 {
13869 machine_mode mode = GET_MODE (vals);
13870 rtx const_dup;
13871 rtx const_vec = NULL_RTX;
13872 int n_const = 0;
13873 int i;
13874
13875 if (GET_CODE (vals) == CONST_VECTOR)
13876 const_vec = vals;
13877 else if (GET_CODE (vals) == PARALLEL)
13878 {
13879 /* A CONST_VECTOR must contain only CONST_INTs and
13880 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13881 Only store valid constants in a CONST_VECTOR. */
13882 int n_elts = XVECLEN (vals, 0);
13883 for (i = 0; i < n_elts; ++i)
13884 {
13885 rtx x = XVECEXP (vals, 0, i);
13886 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13887 n_const++;
13888 }
13889 if (n_const == n_elts)
13890 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13891 }
13892 else
13893 gcc_unreachable ();
13894
13895 if (const_vec != NULL_RTX
13896 && aarch64_simd_valid_immediate (const_vec, NULL))
13897 /* Load using MOVI/MVNI. */
13898 return const_vec;
13899 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13900 /* Loaded using DUP. */
13901 return const_dup;
13902 else if (const_vec != NULL_RTX)
13903 /* Load from constant pool. We can not take advantage of single-cycle
13904 LD1 because we need a PC-relative addressing mode. */
13905 return const_vec;
13906 else
13907 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13908 We can not construct an initializer. */
13909 return NULL_RTX;
13910 }
13911
13912 /* Expand a vector initialisation sequence, such that TARGET is
13913 initialised to contain VALS. */
13914
13915 void
13916 aarch64_expand_vector_init (rtx target, rtx vals)
13917 {
13918 machine_mode mode = GET_MODE (target);
13919 scalar_mode inner_mode = GET_MODE_INNER (mode);
13920 /* The number of vector elements. */
13921 int n_elts = XVECLEN (vals, 0);
13922 /* The number of vector elements which are not constant. */
13923 int n_var = 0;
13924 rtx any_const = NULL_RTX;
13925 /* The first element of vals. */
13926 rtx v0 = XVECEXP (vals, 0, 0);
13927 bool all_same = true;
13928
13929 /* Count the number of variable elements to initialise. */
13930 for (int i = 0; i < n_elts; ++i)
13931 {
13932 rtx x = XVECEXP (vals, 0, i);
13933 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13934 ++n_var;
13935 else
13936 any_const = x;
13937
13938 all_same &= rtx_equal_p (x, v0);
13939 }
13940
13941 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13942 how best to handle this. */
13943 if (n_var == 0)
13944 {
13945 rtx constant = aarch64_simd_make_constant (vals);
13946 if (constant != NULL_RTX)
13947 {
13948 emit_move_insn (target, constant);
13949 return;
13950 }
13951 }
13952
13953 /* Splat a single non-constant element if we can. */
13954 if (all_same)
13955 {
13956 rtx x = copy_to_mode_reg (inner_mode, v0);
13957 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13958 return;
13959 }
13960
13961 enum insn_code icode = optab_handler (vec_set_optab, mode);
13962 gcc_assert (icode != CODE_FOR_nothing);
13963
13964 /* If there are only variable elements, try to optimize
13965 the insertion using dup for the most common element
13966 followed by insertions. */
13967
13968 /* The algorithm will fill matches[*][0] with the earliest matching element,
13969 and matches[X][1] with the count of duplicate elements (if X is the
13970 earliest element which has duplicates). */
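/* As a purely illustrative worked example, for the lanes {a, b, a, a}
   the loop below produces matches[0] = {0, 3}, matches[1] = {1, 1},
   matches[2] = {0, 0} and matches[3] = {0, 0}, so element 0 is the one
   chosen for the initial duplicate.  */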
13971
13972 if (n_var == n_elts && n_elts <= 16)
13973 {
13974 int matches[16][2] = {0};
13975 for (int i = 0; i < n_elts; i++)
13976 {
13977 for (int j = 0; j <= i; j++)
13978 {
13979 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13980 {
13981 matches[i][0] = j;
13982 matches[j][1]++;
13983 break;
13984 }
13985 }
13986 }
13987 int maxelement = 0;
13988 int maxv = 0;
13989 for (int i = 0; i < n_elts; i++)
13990 if (matches[i][1] > maxv)
13991 {
13992 maxelement = i;
13993 maxv = matches[i][1];
13994 }
13995
13996 /* Create a duplicate of the most common element, unless all elements
13997 are equally useless to us, in which case just immediately set the
13998 vector register using the first element. */
13999
14000 if (maxv == 1)
14001 {
14002 /* For vectors of two 64-bit elements, we can do even better. */
14003 if (n_elts == 2
14004 && (inner_mode == E_DImode
14005 || inner_mode == E_DFmode))
14006
14007 {
14008 rtx x0 = XVECEXP (vals, 0, 0);
14009 rtx x1 = XVECEXP (vals, 0, 1);
14010 /* Combine can pick up this case, but handling it directly
14011 here leaves clearer RTL.
14012
14013 This is load_pair_lanes<mode>, and also gives us a clean-up
14014 for store_pair_lanes<mode>. */
14015 if (memory_operand (x0, inner_mode)
14016 && memory_operand (x1, inner_mode)
14017 && !STRICT_ALIGNMENT
14018 && rtx_equal_p (XEXP (x1, 0),
14019 plus_constant (Pmode,
14020 XEXP (x0, 0),
14021 GET_MODE_SIZE (inner_mode))))
14022 {
14023 rtx t;
14024 if (inner_mode == DFmode)
14025 t = gen_load_pair_lanesdf (target, x0, x1);
14026 else
14027 t = gen_load_pair_lanesdi (target, x0, x1);
14028 emit_insn (t);
14029 return;
14030 }
14031 }
14032 /* The subreg-move sequence below will move into lane zero of the
14033 vector register. For big-endian we want that position to hold
14034 the last element of VALS. */
14035 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14036 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14037 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14038 }
14039 else
14040 {
14041 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14042 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14043 }
14044
14045 /* Insert the rest. */
14046 for (int i = 0; i < n_elts; i++)
14047 {
14048 rtx x = XVECEXP (vals, 0, i);
14049 if (matches[i][0] == maxelement)
14050 continue;
14051 x = copy_to_mode_reg (inner_mode, x);
14052 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14053 }
14054 return;
14055 }
14056
14057 /* Initialise a vector which is part-variable. We want to first try
14058 to build those lanes which are constant in the most efficient way we
14059 can. */
14060 if (n_var != n_elts)
14061 {
14062 rtx copy = copy_rtx (vals);
14063
14064 /* Load constant part of vector. We really don't care what goes into the
14065 parts we will overwrite, but we're more likely to be able to load the
14066 constant efficiently if it has fewer, larger, repeating parts
14067 (see aarch64_simd_valid_immediate). */
14068 for (int i = 0; i < n_elts; i++)
14069 {
14070 rtx x = XVECEXP (vals, 0, i);
14071 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14072 continue;
14073 rtx subst = any_const;
14074 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14075 {
14076 /* Look in the copied vector, as more elements are const. */
14077 rtx test = XVECEXP (copy, 0, i ^ bit);
14078 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14079 {
14080 subst = test;
14081 break;
14082 }
14083 }
14084 XVECEXP (copy, 0, i) = subst;
14085 }
14086 aarch64_expand_vector_init (target, copy);
14087 }
14088
14089 /* Insert the variable lanes directly. */
14090 for (int i = 0; i < n_elts; i++)
14091 {
14092 rtx x = XVECEXP (vals, 0, i);
14093 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14094 continue;
14095 x = copy_to_mode_reg (inner_mode, x);
14096 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14097 }
14098 }
14099
14100 static unsigned HOST_WIDE_INT
14101 aarch64_shift_truncation_mask (machine_mode mode)
14102 {
14103 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14104 return 0;
14105 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14106 }
14107
14108 /* Select a format to encode pointers in exception handling data. */
14109 int
14110 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14111 {
14112 int type;
14113 switch (aarch64_cmodel)
14114 {
14115 case AARCH64_CMODEL_TINY:
14116 case AARCH64_CMODEL_TINY_PIC:
14117 case AARCH64_CMODEL_SMALL:
14118 case AARCH64_CMODEL_SMALL_PIC:
14119 case AARCH64_CMODEL_SMALL_SPIC:
14120 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14121 for everything. */
14122 type = DW_EH_PE_sdata4;
14123 break;
14124 default:
14125 /* No assumptions here. 8-byte relocs required. */
14126 type = DW_EH_PE_sdata8;
14127 break;
14128 }
14129 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14130 }
14131
14132 /* The last .arch and .tune assembly strings that we printed. */
14133 static std::string aarch64_last_printed_arch_string;
14134 static std::string aarch64_last_printed_tune_string;
14135
14136 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14137 by the function fndecl. */
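/* As an illustrative example only: for a function compiled with
   __attribute__ ((target ("arch=armv8.1-a"))), the output might contain

       .arch armv8.1-a
       // .tune cortex-a72

   before the function label, with the .tune line emitted as a comment
   and only when flag_debug_asm is set.  */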
14138
14139 void
14140 aarch64_declare_function_name (FILE *stream, const char* name,
14141 tree fndecl)
14142 {
14143 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14144
14145 struct cl_target_option *targ_options;
14146 if (target_parts)
14147 targ_options = TREE_TARGET_OPTION (target_parts);
14148 else
14149 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14150 gcc_assert (targ_options);
14151
14152 const struct processor *this_arch
14153 = aarch64_get_arch (targ_options->x_explicit_arch);
14154
14155 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14156 std::string extension
14157 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14158 this_arch->flags);
14159 /* Only update the assembler .arch string if it is distinct from the last
14160 such string we printed. */
14161 std::string to_print = this_arch->name + extension;
14162 if (to_print != aarch64_last_printed_arch_string)
14163 {
14164 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14165 aarch64_last_printed_arch_string = to_print;
14166 }
14167
14168 /* Print the cpu name we're tuning for in the comments; it might be
14169 useful to readers of the generated asm. Do it only when it changes
14170 from function to function and verbose assembly is requested. */
14171 const struct processor *this_tune
14172 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14173
14174 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14175 {
14176 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14177 this_tune->name);
14178 aarch64_last_printed_tune_string = this_tune->name;
14179 }
14180
14181 /* Don't forget the type directive for ELF. */
14182 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14183 ASM_OUTPUT_LABEL (stream, name);
14184 }
14185
14186 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14187
14188 static void
14189 aarch64_start_file (void)
14190 {
14191 struct cl_target_option *default_options
14192 = TREE_TARGET_OPTION (target_option_default_node);
14193
14194 const struct processor *default_arch
14195 = aarch64_get_arch (default_options->x_explicit_arch);
14196 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14197 std::string extension
14198 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14199 default_arch->flags);
14200
14201 aarch64_last_printed_arch_string = default_arch->name + extension;
14202 aarch64_last_printed_tune_string = "";
14203 asm_fprintf (asm_out_file, "\t.arch %s\n",
14204 aarch64_last_printed_arch_string.c_str ());
14205
14206 default_file_start ();
14207 }
14208
14209 /* Emit load exclusive. */
14210
14211 static void
14212 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14213 rtx mem, rtx model_rtx)
14214 {
14215 rtx (*gen) (rtx, rtx, rtx);
14216
14217 switch (mode)
14218 {
14219 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14220 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14221 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14222 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14223 default:
14224 gcc_unreachable ();
14225 }
14226
14227 emit_insn (gen (rval, mem, model_rtx));
14228 }
14229
14230 /* Emit store exclusive. */
14231
14232 static void
14233 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14234 rtx rval, rtx mem, rtx model_rtx)
14235 {
14236 rtx (*gen) (rtx, rtx, rtx, rtx);
14237
14238 switch (mode)
14239 {
14240 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14241 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14242 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14243 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14244 default:
14245 gcc_unreachable ();
14246 }
14247
14248 emit_insn (gen (bval, rval, mem, model_rtx));
14249 }
14250
14251 /* Emit INSN as a jump instruction and mark it as very unlikely to be taken. */
14252
14253 static void
14254 aarch64_emit_unlikely_jump (rtx insn)
14255 {
14256 rtx_insn *jump = emit_jump_insn (insn);
14257 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14258 }
14259
14260 /* Expand a compare and swap pattern. */
14261
14262 void
14263 aarch64_expand_compare_and_swap (rtx operands[])
14264 {
14265 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14266 machine_mode mode, cmp_mode;
14267 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14268 int idx;
14269 gen_cas_fn gen;
14270 const gen_cas_fn split_cas[] =
14271 {
14272 gen_aarch64_compare_and_swapqi,
14273 gen_aarch64_compare_and_swaphi,
14274 gen_aarch64_compare_and_swapsi,
14275 gen_aarch64_compare_and_swapdi
14276 };
14277 const gen_cas_fn atomic_cas[] =
14278 {
14279 gen_aarch64_compare_and_swapqi_lse,
14280 gen_aarch64_compare_and_swaphi_lse,
14281 gen_aarch64_compare_and_swapsi_lse,
14282 gen_aarch64_compare_and_swapdi_lse
14283 };
14284
14285 bval = operands[0];
14286 rval = operands[1];
14287 mem = operands[2];
14288 oldval = operands[3];
14289 newval = operands[4];
14290 is_weak = operands[5];
14291 mod_s = operands[6];
14292 mod_f = operands[7];
14293 mode = GET_MODE (mem);
14294 cmp_mode = mode;
14295
14296 /* Normally the succ memory model must be stronger than fail, but in the
14297 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14298 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14299
14300 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14301 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14302 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14303
14304 switch (mode)
14305 {
14306 case E_QImode:
14307 case E_HImode:
14308 /* For short modes, we're going to perform the comparison in SImode,
14309 so do the zero-extension now. */
14310 cmp_mode = SImode;
14311 rval = gen_reg_rtx (SImode);
14312 oldval = convert_modes (SImode, mode, oldval, true);
14313 /* Fall through. */
14314
14315 case E_SImode:
14316 case E_DImode:
14317 /* Force the value into a register if needed. */
14318 if (!aarch64_plus_operand (oldval, mode))
14319 oldval = force_reg (cmp_mode, oldval);
14320 break;
14321
14322 default:
14323 gcc_unreachable ();
14324 }
14325
14326 switch (mode)
14327 {
14328 case E_QImode: idx = 0; break;
14329 case E_HImode: idx = 1; break;
14330 case E_SImode: idx = 2; break;
14331 case E_DImode: idx = 3; break;
14332 default:
14333 gcc_unreachable ();
14334 }
14335 if (TARGET_LSE)
14336 gen = atomic_cas[idx];
14337 else
14338 gen = split_cas[idx];
14339
14340 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14341
14342 if (mode == QImode || mode == HImode)
14343 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14344
14345 x = gen_rtx_REG (CCmode, CC_REGNUM);
14346 x = gen_rtx_EQ (SImode, x, const0_rtx);
14347 emit_insn (gen_rtx_SET (bval, x));
14348 }
14349
14350 /* Test whether the target supports using an atomic load-operate instruction
14351 for operation CODE. These instructions are only available when the Large
14352 System Extensions (LSE) are enabled and they return the data that was in
14353 memory before the operation. Returns FALSE if the operation isn't
14354 supported by the architecture. */
14355
14356 bool
14357 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14358 {
14359 if (!TARGET_LSE)
14360 return false;
14361
14362 switch (code)
14363 {
14364 case SET:
14365 case AND:
14366 case IOR:
14367 case XOR:
14368 case MINUS:
14369 case PLUS:
14370 return true;
14371 default:
14372 return false;
14373 }
14374 }
14375
14376 /* Emit a barrier appropriate for memory model MODEL at the end of a
14377 sequence implementing an atomic operation. */
14378
14379 static void
14380 aarch64_emit_post_barrier (enum memmodel model)
14381 {
14382 const enum memmodel base_model = memmodel_base (model);
14383
14384 if (is_mm_sync (model)
14385 && (base_model == MEMMODEL_ACQUIRE
14386 || base_model == MEMMODEL_ACQ_REL
14387 || base_model == MEMMODEL_SEQ_CST))
14388 {
14389 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14390 }
14391 }
14392
14393 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14394 for the data in memory. EXPECTED is the value expected to be in memory.
14395 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14396 is the memory ordering to use. */
14397
14398 void
14399 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14400 rtx expected, rtx desired,
14401 rtx model)
14402 {
14403 rtx (*gen) (rtx, rtx, rtx, rtx);
14404 machine_mode mode;
14405
14406 mode = GET_MODE (mem);
14407
14408 switch (mode)
14409 {
14410 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14411 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14412 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14413 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14414 default:
14415 gcc_unreachable ();
14416 }
14417
14418 /* Move the expected value into the CAS destination register. */
14419 emit_insn (gen_rtx_SET (rval, expected));
14420
14421 /* Emit the CAS. */
14422 emit_insn (gen (rval, mem, desired, model));
14423
14424 /* Compare the expected value with the value loaded by the CAS, to establish
14425 whether the swap was made. */
14426 aarch64_gen_compare_reg (EQ, rval, expected);
14427 }
14428
14429 /* Split a compare and swap pattern. */
14430
14431 void
14432 aarch64_split_compare_and_swap (rtx operands[])
14433 {
14434 rtx rval, mem, oldval, newval, scratch;
14435 machine_mode mode;
14436 bool is_weak;
14437 rtx_code_label *label1, *label2;
14438 rtx x, cond;
14439 enum memmodel model;
14440 rtx model_rtx;
14441
14442 rval = operands[0];
14443 mem = operands[1];
14444 oldval = operands[2];
14445 newval = operands[3];
14446 is_weak = (operands[4] != const0_rtx);
14447 model_rtx = operands[5];
14448 scratch = operands[7];
14449 mode = GET_MODE (mem);
14450 model = memmodel_from_int (INTVAL (model_rtx));
14451
14452 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14453 loop:
14454 .label1:
14455 LD[A]XR rval, [mem]
14456 CBNZ rval, .label2
14457 ST[L]XR scratch, newval, [mem]
14458 CBNZ scratch, .label1
14459 .label2:
14460 CMP rval, 0. */
14461 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14462
14463 label1 = NULL;
14464 if (!is_weak)
14465 {
14466 label1 = gen_label_rtx ();
14467 emit_label (label1);
14468 }
14469 label2 = gen_label_rtx ();
14470
14471 /* The initial load can be relaxed for a __sync operation since a final
14472 barrier will be emitted to stop code hoisting. */
14473 if (is_mm_sync (model))
14474 aarch64_emit_load_exclusive (mode, rval, mem,
14475 GEN_INT (MEMMODEL_RELAXED));
14476 else
14477 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14478
14479 if (strong_zero_p)
14480 {
14481 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14482 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14483 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14484 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14485 }
14486 else
14487 {
14488 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14489 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14490 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14491 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14492 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14493 }
14494
14495 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14496
14497 if (!is_weak)
14498 {
14499 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14500 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14501 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14502 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14503 }
14504 else
14505 {
14506 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14507 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14508 emit_insn (gen_rtx_SET (cond, x));
14509 }
14510
14511 emit_label (label2);
14512 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14513 to set the condition flags. If this is not used it will be removed by
14514 later passes. */
14515 if (strong_zero_p)
14516 {
14517 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14518 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14519 emit_insn (gen_rtx_SET (cond, x));
14520 }
14521 /* Emit any final barrier needed for a __sync operation. */
14522 if (is_mm_sync (model))
14523 aarch64_emit_post_barrier (model);
14524 }
14525
14526 /* Emit a BIC instruction. */
14527
14528 static void
14529 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14530 {
14531 rtx shift_rtx = GEN_INT (shift);
14532 rtx (*gen) (rtx, rtx, rtx, rtx);
14533
14534 switch (mode)
14535 {
14536 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14537 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14538 default:
14539 gcc_unreachable ();
14540 }
14541
14542 emit_insn (gen (dst, s2, shift_rtx, s1));
14543 }
14544
14545 /* Emit an atomic swap. */
14546
14547 static void
14548 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14549 rtx mem, rtx model)
14550 {
14551 rtx (*gen) (rtx, rtx, rtx, rtx);
14552
14553 switch (mode)
14554 {
14555 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14556 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14557 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14558 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14559 default:
14560 gcc_unreachable ();
14561 }
14562
14563 emit_insn (gen (dst, mem, value, model));
14564 }
14565
14566 /* Operations supported by aarch64_emit_atomic_load_op. */
14567
14568 enum aarch64_atomic_load_op_code
14569 {
14570 AARCH64_LDOP_PLUS, /* A + B */
14571 AARCH64_LDOP_XOR, /* A ^ B */
14572 AARCH64_LDOP_OR, /* A | B */
14573 AARCH64_LDOP_BIC /* A & ~B */
14574 };
14575
14576 /* Emit an atomic load-operate. */
14577
14578 static void
14579 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14580 machine_mode mode, rtx dst, rtx src,
14581 rtx mem, rtx model)
14582 {
14583 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14584 const aarch64_atomic_load_op_fn plus[] =
14585 {
14586 gen_aarch64_atomic_loadaddqi,
14587 gen_aarch64_atomic_loadaddhi,
14588 gen_aarch64_atomic_loadaddsi,
14589 gen_aarch64_atomic_loadadddi
14590 };
14591 const aarch64_atomic_load_op_fn eor[] =
14592 {
14593 gen_aarch64_atomic_loadeorqi,
14594 gen_aarch64_atomic_loadeorhi,
14595 gen_aarch64_atomic_loadeorsi,
14596 gen_aarch64_atomic_loadeordi
14597 };
14598 const aarch64_atomic_load_op_fn ior[] =
14599 {
14600 gen_aarch64_atomic_loadsetqi,
14601 gen_aarch64_atomic_loadsethi,
14602 gen_aarch64_atomic_loadsetsi,
14603 gen_aarch64_atomic_loadsetdi
14604 };
14605 const aarch64_atomic_load_op_fn bic[] =
14606 {
14607 gen_aarch64_atomic_loadclrqi,
14608 gen_aarch64_atomic_loadclrhi,
14609 gen_aarch64_atomic_loadclrsi,
14610 gen_aarch64_atomic_loadclrdi
14611 };
14612 aarch64_atomic_load_op_fn gen;
14613 int idx = 0;
14614
14615 switch (mode)
14616 {
14617 case E_QImode: idx = 0; break;
14618 case E_HImode: idx = 1; break;
14619 case E_SImode: idx = 2; break;
14620 case E_DImode: idx = 3; break;
14621 default:
14622 gcc_unreachable ();
14623 }
14624
14625 switch (code)
14626 {
14627 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14628 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14629 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14630 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14631 default:
14632 gcc_unreachable ();
14633 }
14634
14635 emit_insn (gen (dst, mem, src, model));
14636 }
14637
14638 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14639 location to store the data read from memory. OUT_RESULT is the location to
14640 store the result of the operation. MEM is the memory location to read and
14641 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14642 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14643 be NULL. */
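 /* For example (an illustrative mapping, following the switch below): SET
    becomes an atomic swap (SWP), PLUS maps directly to LDADD, MINUS is
    handled by negating VALUE and using LDADD, IOR maps to LDSET, XOR to
    LDEOR, and AND is handled by inverting VALUE and using LDCLR (BIC).  */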
14644
14645 void
14646 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14647 rtx mem, rtx value, rtx model_rtx)
14648 {
14649 machine_mode mode = GET_MODE (mem);
14650 machine_mode wmode = (mode == DImode ? DImode : SImode);
14651 const bool short_mode = (mode < SImode);
14652 aarch64_atomic_load_op_code ldop_code;
14653 rtx src;
14654 rtx x;
14655
14656 if (out_data)
14657 out_data = gen_lowpart (mode, out_data);
14658
14659 if (out_result)
14660 out_result = gen_lowpart (mode, out_result);
14661
14662 /* Make sure the value is in a register, putting it into a destination
14663 register if it needs to be manipulated. */
14664 if (!register_operand (value, mode)
14665 || code == AND || code == MINUS)
14666 {
14667 src = out_result ? out_result : out_data;
14668 emit_move_insn (src, gen_lowpart (mode, value));
14669 }
14670 else
14671 src = value;
14672 gcc_assert (register_operand (src, mode));
14673
14674 /* Preprocess the data for the operation as necessary. If the operation is
14675 a SET then emit a swap instruction and finish. */
14676 switch (code)
14677 {
14678 case SET:
14679 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14680 return;
14681
14682 case MINUS:
14683 /* Negate the value and treat it as a PLUS. */
14684 {
14685 rtx neg_src;
14686
14687 /* Resize the value if necessary. */
14688 if (short_mode)
14689 src = gen_lowpart (wmode, src);
14690
14691 neg_src = gen_rtx_NEG (wmode, src);
14692 emit_insn (gen_rtx_SET (src, neg_src));
14693
14694 if (short_mode)
14695 src = gen_lowpart (mode, src);
14696 }
14697 /* Fall-through. */
14698 case PLUS:
14699 ldop_code = AARCH64_LDOP_PLUS;
14700 break;
14701
14702 case IOR:
14703 ldop_code = AARCH64_LDOP_OR;
14704 break;
14705
14706 case XOR:
14707 ldop_code = AARCH64_LDOP_XOR;
14708 break;
14709
14710 case AND:
14711 {
14712 rtx not_src;
14713
14714 /* Resize the value if necessary. */
14715 if (short_mode)
14716 src = gen_lowpart (wmode, src);
14717
14718 not_src = gen_rtx_NOT (wmode, src);
14719 emit_insn (gen_rtx_SET (src, not_src));
14720
14721 if (short_mode)
14722 src = gen_lowpart (mode, src);
14723 }
14724 ldop_code = AARCH64_LDOP_BIC;
14725 break;
14726
14727 default:
14728 /* The operation can't be done with atomic instructions. */
14729 gcc_unreachable ();
14730 }
14731
14732 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14733
14734 /* If necessary, calculate the data in memory after the update by redoing the
14735 operation from values in registers. */
14736 if (!out_result)
14737 return;
14738
14739 if (short_mode)
14740 {
14741 src = gen_lowpart (wmode, src);
14742 out_data = gen_lowpart (wmode, out_data);
14743 out_result = gen_lowpart (wmode, out_result);
14744 }
14745
14746 x = NULL_RTX;
14747
14748 switch (code)
14749 {
14750 case MINUS:
14751 case PLUS:
14752 x = gen_rtx_PLUS (wmode, out_data, src);
14753 break;
14754 case IOR:
14755 x = gen_rtx_IOR (wmode, out_data, src);
14756 break;
14757 case XOR:
14758 x = gen_rtx_XOR (wmode, out_data, src);
14759 break;
14760 case AND:
14761 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14762 return;
14763 default:
14764 gcc_unreachable ();
14765 }
14766
14767 emit_set_insn (out_result, x);
14768
14769 return;
14770 }
14771
14772 /* Split an atomic operation. */
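 /* Illustrative sketch of the split (assuming no LSE atomics): an atomic
    add, for instance, becomes a load/store-exclusive loop along the lines of
	.label:
	  LD[A]XR old_out, [mem]
	  ADD     new_out, old_out, value
	  ST[L]XR cond, new_out, [mem]
	  CBNZ    cond, .label
    with NOT implemented as AND followed by a bitwise NOT (i.e. NAND).  */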
14773
14774 void
14775 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14776 rtx value, rtx model_rtx, rtx cond)
14777 {
14778 machine_mode mode = GET_MODE (mem);
14779 machine_mode wmode = (mode == DImode ? DImode : SImode);
14780 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14781 const bool is_sync = is_mm_sync (model);
14782 rtx_code_label *label;
14783 rtx x;
14784
14785 /* Split the atomic operation into a sequence. */
14786 label = gen_label_rtx ();
14787 emit_label (label);
14788
14789 if (new_out)
14790 new_out = gen_lowpart (wmode, new_out);
14791 if (old_out)
14792 old_out = gen_lowpart (wmode, old_out);
14793 else
14794 old_out = new_out;
14795 value = simplify_gen_subreg (wmode, value, mode, 0);
14796
14797 /* The initial load can be relaxed for a __sync operation since a final
14798 barrier will be emitted to stop code hoisting. */
14799 if (is_sync)
14800 aarch64_emit_load_exclusive (mode, old_out, mem,
14801 GEN_INT (MEMMODEL_RELAXED));
14802 else
14803 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14804
14805 switch (code)
14806 {
14807 case SET:
14808 new_out = value;
14809 break;
14810
14811 case NOT:
14812 x = gen_rtx_AND (wmode, old_out, value);
14813 emit_insn (gen_rtx_SET (new_out, x));
14814 x = gen_rtx_NOT (wmode, new_out);
14815 emit_insn (gen_rtx_SET (new_out, x));
14816 break;
14817
14818 case MINUS:
14819 if (CONST_INT_P (value))
14820 {
14821 value = GEN_INT (-INTVAL (value));
14822 code = PLUS;
14823 }
14824 /* Fall through. */
14825
14826 default:
14827 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14828 emit_insn (gen_rtx_SET (new_out, x));
14829 break;
14830 }
14831
14832 aarch64_emit_store_exclusive (mode, cond, mem,
14833 gen_lowpart (mode, new_out), model_rtx);
14834
14835 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14836 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14837 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14838 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14839
14840 /* Emit any final barrier needed for a __sync operation. */
14841 if (is_sync)
14842 aarch64_emit_post_barrier (model);
14843 }
14844
14845 static void
14846 aarch64_init_libfuncs (void)
14847 {
14848 /* Half-precision float operations. The compiler handles all operations
14849 with NULL libfuncs by converting to SFmode. */
14850
14851 /* Conversions. */
14852 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14853 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14854
14855 /* Arithmetic. */
14856 set_optab_libfunc (add_optab, HFmode, NULL);
14857 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14858 set_optab_libfunc (smul_optab, HFmode, NULL);
14859 set_optab_libfunc (neg_optab, HFmode, NULL);
14860 set_optab_libfunc (sub_optab, HFmode, NULL);
14861
14862 /* Comparisons. */
14863 set_optab_libfunc (eq_optab, HFmode, NULL);
14864 set_optab_libfunc (ne_optab, HFmode, NULL);
14865 set_optab_libfunc (lt_optab, HFmode, NULL);
14866 set_optab_libfunc (le_optab, HFmode, NULL);
14867 set_optab_libfunc (ge_optab, HFmode, NULL);
14868 set_optab_libfunc (gt_optab, HFmode, NULL);
14869 set_optab_libfunc (unord_optab, HFmode, NULL);
14870 }
14871
14872 /* Target hook for c_mode_for_suffix. */
14873 static machine_mode
14874 aarch64_c_mode_for_suffix (char suffix)
14875 {
14876 if (suffix == 'q')
14877 return TFmode;
14878
14879 return VOIDmode;
14880 }
14881
14882 /* We can only represent floating point constants which will fit in
14883 "quarter-precision" values. These values are characterised by
 14884 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14885 by:
14886
14887 (-1)^s * (n/16) * 2^r
14888
14889 Where:
14890 's' is the sign bit.
14891 'n' is an integer in the range 16 <= n <= 31.
14892 'r' is an integer in the range -3 <= r <= 4. */
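 /* For example (worked instances of the formula above): 0.125 is
    (+1) * (16/16) * 2^-3, 2.5 is (+1) * (20/16) * 2^1, and the largest
    representable magnitude is (31/16) * 2^4 = 31.0.  */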
14893
14894 /* Return true iff X can be represented by a quarter-precision
 14895 floating point immediate operand. Note, we cannot represent 0.0. */
14896 bool
14897 aarch64_float_const_representable_p (rtx x)
14898 {
14899 /* This represents our current view of how many bits
14900 make up the mantissa. */
14901 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14902 int exponent;
14903 unsigned HOST_WIDE_INT mantissa, mask;
14904 REAL_VALUE_TYPE r, m;
14905 bool fail;
14906
14907 if (!CONST_DOUBLE_P (x))
14908 return false;
14909
14910 /* We don't support HFmode constants yet. */
14911 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14912 return false;
14913
14914 r = *CONST_DOUBLE_REAL_VALUE (x);
14915
14916 /* We cannot represent infinities, NaNs or +/-zero. We won't
14917 know if we have +zero until we analyse the mantissa, but we
14918 can reject the other invalid values. */
14919 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14920 || REAL_VALUE_MINUS_ZERO (r))
14921 return false;
14922
14923 /* Extract exponent. */
14924 r = real_value_abs (&r);
14925 exponent = REAL_EXP (&r);
14926
14927 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14928 highest (sign) bit, with a fixed binary point at bit point_pos.
 14929 The low half of W holds the low part of the mantissa, the high half the high part.
14930 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14931 bits for the mantissa, this can fail (low bits will be lost). */
14932 real_ldexp (&m, &r, point_pos - exponent);
14933 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14934
14935 /* If the low part of the mantissa has bits set we cannot represent
14936 the value. */
14937 if (w.ulow () != 0)
14938 return false;
14939 /* We have rejected the lower HOST_WIDE_INT, so update our
14940 understanding of how many bits lie in the mantissa and
14941 look only at the high HOST_WIDE_INT. */
14942 mantissa = w.elt (1);
14943 point_pos -= HOST_BITS_PER_WIDE_INT;
14944
14945 /* We can only represent values with a mantissa of the form 1.xxxx. */
14946 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14947 if ((mantissa & mask) != 0)
14948 return false;
14949
14950 /* Having filtered unrepresentable values, we may now remove all
14951 but the highest 5 bits. */
14952 mantissa >>= point_pos - 5;
14953
14954 /* We cannot represent the value 0.0, so reject it. This is handled
14955 elsewhere. */
14956 if (mantissa == 0)
14957 return false;
14958
14959 /* Then, as bit 4 is always set, we can mask it off, leaving
14960 the mantissa in the range [0, 15]. */
14961 mantissa &= ~(1 << 4);
14962 gcc_assert (mantissa <= 15);
14963
14964 /* GCC internally does not use IEEE754-like encoding (where normalized
14965 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14966 Our mantissa values are shifted 4 places to the left relative to
14967 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14968 by 5 places to correct for GCC's representation. */
14969 exponent = 5 - exponent;
14970
14971 return (exponent >= 0 && exponent <= 7);
14972 }
14973
14974 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14975 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14976 output MOVI/MVNI, ORR or BIC immediate. */
14977 char*
14978 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14979 enum simd_immediate_check which)
14980 {
14981 bool is_valid;
14982 static char templ[40];
14983 const char *mnemonic;
14984 const char *shift_op;
14985 unsigned int lane_count = 0;
14986 char element_char;
14987
14988 struct simd_immediate_info info;
14989
14990 /* This will return true to show const_vector is legal for use as either
 14991 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14992 It will also update INFO to show how the immediate should be generated.
14993 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14994 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14995 gcc_assert (is_valid);
14996
14997 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14998 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14999
15000 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15001 {
15002 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15003 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15004 move immediate path. */
15005 if (aarch64_float_const_zero_rtx_p (info.value))
15006 info.value = GEN_INT (0);
15007 else
15008 {
15009 const unsigned int buf_size = 20;
15010 char float_buf[buf_size] = {'\0'};
15011 real_to_decimal_for_mode (float_buf,
15012 CONST_DOUBLE_REAL_VALUE (info.value),
15013 buf_size, buf_size, 1, info.elt_mode);
15014
15015 if (lane_count == 1)
15016 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15017 else
15018 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15019 lane_count, element_char, float_buf);
15020 return templ;
15021 }
15022 }
15023
15024 gcc_assert (CONST_INT_P (info.value));
15025
15026 if (which == AARCH64_CHECK_MOV)
15027 {
15028 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15029 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15030 if (lane_count == 1)
15031 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15032 mnemonic, UINTVAL (info.value));
15033 else if (info.shift)
15034 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15035 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15036 element_char, UINTVAL (info.value), shift_op, info.shift);
15037 else
15038 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15039 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15040 element_char, UINTVAL (info.value));
15041 }
15042 else
15043 {
15044 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15045 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15046 if (info.shift)
15047 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15048 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15049 element_char, UINTVAL (info.value), "lsl", info.shift);
15050 else
15051 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15052 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15053 element_char, UINTVAL (info.value));
15054 }
15055 return templ;
15056 }
15057
15058 char*
15059 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15060 {
15061
15062 /* If a floating point number was passed and we desire to use it in an
 15063 integer mode, do the conversion to integer. */
15064 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15065 {
15066 unsigned HOST_WIDE_INT ival;
15067 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15068 gcc_unreachable ();
15069 immediate = gen_int_mode (ival, mode);
15070 }
15071
15072 machine_mode vmode;
 15073 /* Use a 64 bit mode for everything except for DI/DF mode, where we use
15074 a 128 bit vector mode. */
15075 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15076
15077 vmode = aarch64_simd_container_mode (mode, width);
15078 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15079 return aarch64_output_simd_mov_immediate (v_op, width);
15080 }
15081
15082 /* Return the output string to use for moving immediate CONST_VECTOR
15083 into an SVE register. */
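 /* Illustrative examples of the templates built below: a stepped constant
    such as { 0, 2, 4, ... } becomes "index\t%0.s, #0, #2", a duplicated
    integer becomes "mov\t%0.<size>, #<imm>", and a non-zero duplicated
    float becomes "fmov\t%0.<size>, #<value>"; FP zero is converted to
    integer 0 and takes the MOV path.  */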
15084
15085 char *
15086 aarch64_output_sve_mov_immediate (rtx const_vector)
15087 {
15088 static char templ[40];
15089 struct simd_immediate_info info;
15090 char element_char;
15091
15092 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15093 gcc_assert (is_valid);
15094
15095 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15096
15097 if (info.step)
15098 {
15099 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15100 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15101 element_char, INTVAL (info.value), INTVAL (info.step));
15102 return templ;
15103 }
15104
15105 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15106 {
15107 if (aarch64_float_const_zero_rtx_p (info.value))
15108 info.value = GEN_INT (0);
15109 else
15110 {
15111 const int buf_size = 20;
15112 char float_buf[buf_size] = {};
15113 real_to_decimal_for_mode (float_buf,
15114 CONST_DOUBLE_REAL_VALUE (info.value),
15115 buf_size, buf_size, 1, info.elt_mode);
15116
15117 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15118 element_char, float_buf);
15119 return templ;
15120 }
15121 }
15122
15123 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15124 element_char, INTVAL (info.value));
15125 return templ;
15126 }
15127
15128 /* Return the asm format for a PTRUE instruction whose destination has
15129 mode MODE. SUFFIX is the element size suffix. */
15130
15131 char *
15132 aarch64_output_ptrue (machine_mode mode, char suffix)
15133 {
15134 unsigned int nunits;
15135 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15136 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15137 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15138 else
15139 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15140 return buf;
15141 }
15142
15143 /* Split operands into moves from op[1] + op[2] into op[0]. */
15144
15145 void
15146 aarch64_split_combinev16qi (rtx operands[3])
15147 {
15148 unsigned int dest = REGNO (operands[0]);
15149 unsigned int src1 = REGNO (operands[1]);
15150 unsigned int src2 = REGNO (operands[2]);
15151 machine_mode halfmode = GET_MODE (operands[1]);
15152 unsigned int halfregs = REG_NREGS (operands[1]);
15153 rtx destlo, desthi;
15154
15155 gcc_assert (halfmode == V16QImode);
15156
15157 if (src1 == dest && src2 == dest + halfregs)
15158 {
15159 /* No-op move. Can't split to nothing; emit something. */
15160 emit_note (NOTE_INSN_DELETED);
15161 return;
15162 }
15163
15164 /* Preserve register attributes for variable tracking. */
15165 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15166 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15167 GET_MODE_SIZE (halfmode));
15168
15169 /* Special case of reversed high/low parts. */
15170 if (reg_overlap_mentioned_p (operands[2], destlo)
15171 && reg_overlap_mentioned_p (operands[1], desthi))
15172 {
15173 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15174 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15175 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15176 }
15177 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15178 {
15179 /* Try to avoid unnecessary moves if part of the result
15180 is in the right place already. */
15181 if (src1 != dest)
15182 emit_move_insn (destlo, operands[1]);
15183 if (src2 != dest + halfregs)
15184 emit_move_insn (desthi, operands[2]);
15185 }
15186 else
15187 {
15188 if (src2 != dest + halfregs)
15189 emit_move_insn (desthi, operands[2]);
15190 if (src1 != dest)
15191 emit_move_insn (destlo, operands[1]);
15192 }
15193 }
15194
15195 /* vec_perm support. */
15196
15197 struct expand_vec_perm_d
15198 {
15199 rtx target, op0, op1;
15200 vec_perm_indices perm;
15201 machine_mode vmode;
15202 unsigned int vec_flags;
15203 bool one_vector_p;
15204 bool testing_p;
15205 };
15206
15207 /* Generate a variable permutation. */
15208
15209 static void
15210 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15211 {
15212 machine_mode vmode = GET_MODE (target);
15213 bool one_vector_p = rtx_equal_p (op0, op1);
15214
15215 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15216 gcc_checking_assert (GET_MODE (op0) == vmode);
15217 gcc_checking_assert (GET_MODE (op1) == vmode);
15218 gcc_checking_assert (GET_MODE (sel) == vmode);
15219 gcc_checking_assert (TARGET_SIMD);
15220
15221 if (one_vector_p)
15222 {
15223 if (vmode == V8QImode)
15224 {
15225 /* Expand the argument to a V16QI mode by duplicating it. */
15226 rtx pair = gen_reg_rtx (V16QImode);
15227 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15228 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15229 }
15230 else
15231 {
15232 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15233 }
15234 }
15235 else
15236 {
15237 rtx pair;
15238
15239 if (vmode == V8QImode)
15240 {
15241 pair = gen_reg_rtx (V16QImode);
15242 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15243 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15244 }
15245 else
15246 {
15247 pair = gen_reg_rtx (OImode);
15248 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15249 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15250 }
15251 }
15252 }
15253
15254 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15255 NELT is the number of elements in the vector. */
15256
15257 void
15258 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15259 unsigned int nelt)
15260 {
15261 machine_mode vmode = GET_MODE (target);
15262 bool one_vector_p = rtx_equal_p (op0, op1);
15263 rtx mask;
15264
15265 /* The TBL instruction does not use a modulo index, so we must take care
15266 of that ourselves. */
15267 mask = aarch64_simd_gen_const_vector_dup (vmode,
15268 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15269 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15270
15271 /* For big-endian, we also need to reverse the index within the vector
15272 (but not which vector). */
15273 if (BYTES_BIG_ENDIAN)
15274 {
15275 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15276 if (!one_vector_p)
15277 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15278 sel = expand_simple_binop (vmode, XOR, sel, mask,
15279 NULL, 0, OPTAB_LIB_WIDEN);
15280 }
15281 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15282 }
15283
15284 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15285
15286 static void
15287 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15288 {
15289 emit_insn (gen_rtx_SET (target,
15290 gen_rtx_UNSPEC (GET_MODE (target),
15291 gen_rtvec (2, op0, op1), code)));
15292 }
15293
15294 /* Expand an SVE vec_perm with the given operands. */
15295
15296 void
15297 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15298 {
15299 machine_mode data_mode = GET_MODE (target);
15300 machine_mode sel_mode = GET_MODE (sel);
15301 /* Enforced by the pattern condition. */
15302 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15303
15304 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15305 size of the two value vectors, i.e. the upper bits of the indices
15306 are effectively ignored. SVE TBL instead produces 0 for any
15307 out-of-range indices, so we need to modulo all the vec_perm indices
15308 to ensure they are all in range. */
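   /* Illustrative example of the general two-vector case handled at the end
      of this function: with 4-element vectors and SEL = { 0, 5, 2, 7 },
      TBL (OP0, SEL) yields OP0 elements for indices 0 and 2 and zero for the
      out-of-range 5 and 7, TBL (OP1, SEL - 4) yields OP1 elements 1 and 3
      and zero elsewhere, and the two results are then ORed together.  */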
15309 rtx sel_reg = force_reg (sel_mode, sel);
15310
15311 /* Check if the sel only references the first values vector. */
15312 if (GET_CODE (sel) == CONST_VECTOR
15313 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15314 {
15315 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15316 return;
15317 }
15318
15319 /* Check if the two values vectors are the same. */
15320 if (rtx_equal_p (op0, op1))
15321 {
15322 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15323 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15324 NULL, 0, OPTAB_DIRECT);
15325 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15326 return;
15327 }
15328
 15329 /* Run TBL on each value vector and combine the results. */
15330
15331 rtx res0 = gen_reg_rtx (data_mode);
15332 rtx res1 = gen_reg_rtx (data_mode);
15333 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15334 if (GET_CODE (sel) != CONST_VECTOR
15335 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15336 {
15337 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15338 2 * nunits - 1);
15339 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15340 NULL, 0, OPTAB_DIRECT);
15341 }
15342 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15343 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15344 NULL, 0, OPTAB_DIRECT);
15345 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15346 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15347 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15348 else
15349 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15350 }
15351
15352 /* Recognize patterns suitable for the TRN instructions. */
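 /* Illustrative example (little-endian): on two 4-element vectors, TRN1
    corresponds to the permutation { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 },
    i.e. interleaving the even (respectively odd) numbered lanes of the
    two inputs.  */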
15353 static bool
15354 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15355 {
15356 HOST_WIDE_INT odd;
15357 poly_uint64 nelt = d->perm.length ();
15358 rtx out, in0, in1, x;
15359 machine_mode vmode = d->vmode;
15360
15361 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15362 return false;
15363
15364 /* Note that these are little-endian tests.
15365 We correct for big-endian later. */
15366 if (!d->perm[0].is_constant (&odd)
15367 || (odd != 0 && odd != 1)
15368 || !d->perm.series_p (0, 2, odd, 2)
15369 || !d->perm.series_p (1, 2, nelt + odd, 2))
15370 return false;
15371
15372 /* Success! */
15373 if (d->testing_p)
15374 return true;
15375
15376 in0 = d->op0;
15377 in1 = d->op1;
15378 /* We don't need a big-endian lane correction for SVE; see the comment
15379 at the head of aarch64-sve.md for details. */
15380 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15381 {
15382 x = in0, in0 = in1, in1 = x;
15383 odd = !odd;
15384 }
15385 out = d->target;
15386
15387 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15388 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15389 return true;
15390 }
15391
15392 /* Recognize patterns suitable for the UZP instructions. */
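 /* Illustrative example (little-endian): on two 4-element vectors, UZP1
    corresponds to the permutation { 0, 2, 4, 6 } (the even-numbered lanes
    of the concatenated inputs) and UZP2 to { 1, 3, 5, 7 }.  */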
15393 static bool
15394 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15395 {
15396 HOST_WIDE_INT odd;
15397 rtx out, in0, in1, x;
15398 machine_mode vmode = d->vmode;
15399
15400 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15401 return false;
15402
15403 /* Note that these are little-endian tests.
15404 We correct for big-endian later. */
15405 if (!d->perm[0].is_constant (&odd)
15406 || (odd != 0 && odd != 1)
15407 || !d->perm.series_p (0, 1, odd, 2))
15408 return false;
15409
15410 /* Success! */
15411 if (d->testing_p)
15412 return true;
15413
15414 in0 = d->op0;
15415 in1 = d->op1;
15416 /* We don't need a big-endian lane correction for SVE; see the comment
15417 at the head of aarch64-sve.md for details. */
15418 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15419 {
15420 x = in0, in0 = in1, in1 = x;
15421 odd = !odd;
15422 }
15423 out = d->target;
15424
15425 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15426 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15427 return true;
15428 }
15429
15430 /* Recognize patterns suitable for the ZIP instructions. */
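 /* Illustrative example (little-endian): on two 4-element vectors, ZIP1
    corresponds to the permutation { 0, 4, 1, 5 } (interleaving the low
    halves of the inputs) and ZIP2 to { 2, 6, 3, 7 } (the high halves).  */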
15431 static bool
15432 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15433 {
15434 unsigned int high;
15435 poly_uint64 nelt = d->perm.length ();
15436 rtx out, in0, in1, x;
15437 machine_mode vmode = d->vmode;
15438
15439 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15440 return false;
15441
15442 /* Note that these are little-endian tests.
15443 We correct for big-endian later. */
15444 poly_uint64 first = d->perm[0];
15445 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15446 || !d->perm.series_p (0, 2, first, 1)
15447 || !d->perm.series_p (1, 2, first + nelt, 1))
15448 return false;
15449 high = maybe_ne (first, 0U);
15450
15451 /* Success! */
15452 if (d->testing_p)
15453 return true;
15454
15455 in0 = d->op0;
15456 in1 = d->op1;
15457 /* We don't need a big-endian lane correction for SVE; see the comment
15458 at the head of aarch64-sve.md for details. */
15459 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15460 {
15461 x = in0, in0 = in1, in1 = x;
15462 high = !high;
15463 }
15464 out = d->target;
15465
15466 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15467 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15468 return true;
15469 }
15470
15471 /* Recognize patterns for the EXT insn. */
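 /* Illustrative example: on two 4-element vectors, the permutation
    { 1, 2, 3, 4 } takes the last three elements of the first vector
    followed by the first element of the second, which EXT implements
    with an extraction offset of one element.  */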
15472
15473 static bool
15474 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15475 {
15476 HOST_WIDE_INT location;
15477 rtx offset;
15478
15479 /* The first element always refers to the first vector.
15480 Check if the extracted indices are increasing by one. */
15481 if (d->vec_flags == VEC_SVE_PRED
15482 || !d->perm[0].is_constant (&location)
15483 || !d->perm.series_p (0, 1, location, 1))
15484 return false;
15485
15486 /* Success! */
15487 if (d->testing_p)
15488 return true;
15489
15490 /* The case where (location == 0) is a no-op for both big- and little-endian,
15491 and is removed by the mid-end at optimization levels -O1 and higher.
15492
15493 We don't need a big-endian lane correction for SVE; see the comment
15494 at the head of aarch64-sve.md for details. */
15495 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15496 {
15497 /* After setup, we want the high elements of the first vector (stored
15498 at the LSB end of the register), and the low elements of the second
15499 vector (stored at the MSB end of the register). So swap. */
15500 std::swap (d->op0, d->op1);
15501 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15502 to_constant () is safe since this is restricted to Advanced SIMD
15503 vectors. */
15504 location = d->perm.length ().to_constant () - location;
15505 }
15506
15507 offset = GEN_INT (location);
15508 emit_set_insn (d->target,
15509 gen_rtx_UNSPEC (d->vmode,
15510 gen_rtvec (3, d->op0, d->op1, offset),
15511 UNSPEC_EXT));
15512 return true;
15513 }
15514
15515 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15516 within each 64-bit, 32-bit or 16-bit granule. */
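 /* Illustrative example: for a vector of bytes, REV32 corresponds to the
    permutation { 3, 2, 1, 0, 7, 6, 5, 4, ... }, i.e. reversing the four
    bytes within each 32-bit granule.  */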
15517
15518 static bool
15519 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15520 {
15521 HOST_WIDE_INT diff;
15522 unsigned int i, size, unspec;
15523 machine_mode pred_mode;
15524
15525 if (d->vec_flags == VEC_SVE_PRED
15526 || !d->one_vector_p
15527 || !d->perm[0].is_constant (&diff))
15528 return false;
15529
15530 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15531 if (size == 8)
15532 {
15533 unspec = UNSPEC_REV64;
15534 pred_mode = VNx2BImode;
15535 }
15536 else if (size == 4)
15537 {
15538 unspec = UNSPEC_REV32;
15539 pred_mode = VNx4BImode;
15540 }
15541 else if (size == 2)
15542 {
15543 unspec = UNSPEC_REV16;
15544 pred_mode = VNx8BImode;
15545 }
15546 else
15547 return false;
15548
15549 unsigned int step = diff + 1;
15550 for (i = 0; i < step; ++i)
15551 if (!d->perm.series_p (i, step, diff - i, step))
15552 return false;
15553
15554 /* Success! */
15555 if (d->testing_p)
15556 return true;
15557
15558 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15559 if (d->vec_flags == VEC_SVE_DATA)
15560 {
15561 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15562 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15563 UNSPEC_MERGE_PTRUE);
15564 }
15565 emit_set_insn (d->target, src);
15566 return true;
15567 }
15568
15569 /* Recognize patterns for the REV insn, which reverses elements within
15570 a full vector. */
15571
15572 static bool
15573 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15574 {
15575 poly_uint64 nelt = d->perm.length ();
15576
15577 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15578 return false;
15579
15580 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15581 return false;
15582
15583 /* Success! */
15584 if (d->testing_p)
15585 return true;
15586
15587 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15588 emit_set_insn (d->target, src);
15589 return true;
15590 }
15591
15592 static bool
15593 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15594 {
15595 rtx out = d->target;
15596 rtx in0;
15597 HOST_WIDE_INT elt;
15598 machine_mode vmode = d->vmode;
15599 rtx lane;
15600
15601 if (d->vec_flags == VEC_SVE_PRED
15602 || d->perm.encoding ().encoded_nelts () != 1
15603 || !d->perm[0].is_constant (&elt))
15604 return false;
15605
15606 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15607 return false;
15608
15609 /* Success! */
15610 if (d->testing_p)
15611 return true;
15612
15613 /* The generic preparation in aarch64_expand_vec_perm_const_1
15614 swaps the operand order and the permute indices if it finds
15615 d->perm[0] to be in the second operand. Thus, we can always
15616 use d->op0 and need not do any extra arithmetic to get the
15617 correct lane number. */
15618 in0 = d->op0;
15619 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15620
15621 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15622 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15623 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15624 return true;
15625 }
15626
15627 static bool
15628 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15629 {
15630 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15631 machine_mode vmode = d->vmode;
15632
15633 /* Make sure that the indices are constant. */
15634 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15635 for (unsigned int i = 0; i < encoded_nelts; ++i)
15636 if (!d->perm[i].is_constant ())
15637 return false;
15638
15639 if (d->testing_p)
15640 return true;
15641
15642 /* Generic code will try constant permutation twice. Once with the
15643 original mode and again with the elements lowered to QImode.
15644 So wait and don't do the selector expansion ourselves. */
15645 if (vmode != V8QImode && vmode != V16QImode)
15646 return false;
15647
15648 /* to_constant is safe since this routine is specific to Advanced SIMD
15649 vectors. */
15650 unsigned int nelt = d->perm.length ().to_constant ();
15651 for (unsigned int i = 0; i < nelt; ++i)
15652 /* If big-endian and two vectors we end up with a weird mixed-endian
15653 mode on NEON. Reverse the index within each word but not the word
15654 itself. to_constant is safe because we checked is_constant above. */
15655 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15656 ? d->perm[i].to_constant () ^ (nelt - 1)
15657 : d->perm[i].to_constant ());
15658
15659 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15660 sel = force_reg (vmode, sel);
15661
15662 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15663 return true;
15664 }
15665
15666 /* Try to implement D using an SVE TBL instruction. */
15667
15668 static bool
15669 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15670 {
15671 unsigned HOST_WIDE_INT nelt;
15672
15673 /* Permuting two variable-length vectors could overflow the
15674 index range. */
15675 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15676 return false;
15677
15678 if (d->testing_p)
15679 return true;
15680
15681 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15682 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15683 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15684 return true;
15685 }
15686
15687 static bool
15688 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15689 {
15690 /* The pattern matching functions above are written to look for a small
15691 number to begin the sequence (0, 1, N/2). If we begin with an index
15692 from the second operand, we can swap the operands. */
15693 poly_int64 nelt = d->perm.length ();
15694 if (known_ge (d->perm[0], nelt))
15695 {
15696 d->perm.rotate_inputs (1);
15697 std::swap (d->op0, d->op1);
15698 }
15699
15700 if ((d->vec_flags == VEC_ADVSIMD
15701 || d->vec_flags == VEC_SVE_DATA
15702 || d->vec_flags == VEC_SVE_PRED)
15703 && known_gt (nelt, 1))
15704 {
15705 if (aarch64_evpc_rev_local (d))
15706 return true;
15707 else if (aarch64_evpc_rev_global (d))
15708 return true;
15709 else if (aarch64_evpc_ext (d))
15710 return true;
15711 else if (aarch64_evpc_dup (d))
15712 return true;
15713 else if (aarch64_evpc_zip (d))
15714 return true;
15715 else if (aarch64_evpc_uzp (d))
15716 return true;
15717 else if (aarch64_evpc_trn (d))
15718 return true;
15719 if (d->vec_flags == VEC_SVE_DATA)
15720 return aarch64_evpc_sve_tbl (d);
 15721 else if (d->vec_flags == VEC_ADVSIMD)
15722 return aarch64_evpc_tbl (d);
15723 }
15724 return false;
15725 }
15726
15727 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15728
15729 static bool
15730 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15731 rtx op1, const vec_perm_indices &sel)
15732 {
15733 struct expand_vec_perm_d d;
15734
15735 /* Check whether the mask can be applied to a single vector. */
15736 if (op0 && rtx_equal_p (op0, op1))
15737 d.one_vector_p = true;
15738 else if (sel.all_from_input_p (0))
15739 {
15740 d.one_vector_p = true;
15741 op1 = op0;
15742 }
15743 else if (sel.all_from_input_p (1))
15744 {
15745 d.one_vector_p = true;
15746 op0 = op1;
15747 }
15748 else
15749 d.one_vector_p = false;
15750
15751 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15752 sel.nelts_per_input ());
15753 d.vmode = vmode;
15754 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15755 d.target = target;
15756 d.op0 = op0;
15757 d.op1 = op1;
15758 d.testing_p = !target;
15759
15760 if (!d.testing_p)
15761 return aarch64_expand_vec_perm_const_1 (&d);
15762
15763 rtx_insn *last = get_last_insn ();
15764 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15765 gcc_assert (last == get_last_insn ());
15766
15767 return ret;
15768 }
15769
15770 /* Generate a byte permute mask for a register of mode MODE,
15771 which has NUNITS units. */
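 /* For example (illustrative): for V8HImode the generated mask is
    { 1, 0, 3, 2, 5, 4, ... }, i.e. the two bytes of each 16-bit unit are
    selected in reverse order.  */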
15772
15773 rtx
15774 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15775 {
 15776 /* We have to reverse each vector because we don't have
15777 a permuted load that can reverse-load according to ABI rules. */
15778 rtx mask;
15779 rtvec v = rtvec_alloc (16);
15780 unsigned int i, j;
15781 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15782
15783 gcc_assert (BYTES_BIG_ENDIAN);
15784 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15785
15786 for (i = 0; i < nunits; i++)
15787 for (j = 0; j < usize; j++)
15788 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15789 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15790 return force_reg (V16QImode, mask);
15791 }
15792
15793 /* Return true if X is a valid second operand for the SVE instruction
15794 that implements integer comparison OP_CODE. */
15795
15796 static bool
15797 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15798 {
15799 if (register_operand (x, VOIDmode))
15800 return true;
15801
15802 switch (op_code)
15803 {
15804 case LTU:
15805 case LEU:
15806 case GEU:
15807 case GTU:
15808 return aarch64_sve_cmp_immediate_p (x, false);
15809 case LT:
15810 case LE:
15811 case GE:
15812 case GT:
15813 case NE:
15814 case EQ:
15815 return aarch64_sve_cmp_immediate_p (x, true);
15816 default:
15817 gcc_unreachable ();
15818 }
15819 }
15820
15821 /* Use predicated SVE instructions to implement the equivalent of:
15822
15823 (set TARGET OP)
15824
15825 given that PTRUE is an all-true predicate of the appropriate mode. */
15826
15827 static void
15828 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15829 {
15830 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15831 gen_rtvec (2, ptrue, op),
15832 UNSPEC_MERGE_PTRUE);
15833 rtx_insn *insn = emit_set_insn (target, unspec);
15834 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15835 }
15836
15837 /* Likewise, but also clobber the condition codes. */
15838
15839 static void
15840 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15841 {
15842 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15843 gen_rtvec (2, ptrue, op),
15844 UNSPEC_MERGE_PTRUE);
15845 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15846 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15847 }
15848
15849 /* Return the UNSPEC_COND_* code for comparison CODE. */
15850
15851 static unsigned int
15852 aarch64_unspec_cond_code (rtx_code code)
15853 {
15854 switch (code)
15855 {
15856 case NE:
15857 return UNSPEC_COND_NE;
15858 case EQ:
15859 return UNSPEC_COND_EQ;
15860 case LT:
15861 return UNSPEC_COND_LT;
15862 case GT:
15863 return UNSPEC_COND_GT;
15864 case LE:
15865 return UNSPEC_COND_LE;
15866 case GE:
15867 return UNSPEC_COND_GE;
15868 default:
15869 gcc_unreachable ();
15870 }
15871 }
15872
15873 /* Emit:
15874
15875 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15876
15877 where <X> is the operation associated with comparison CODE. This form
15878 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15879 semantics, such as when PRED might not be all-true and when comparing
15880 inactive lanes could have side effects. */
15881
15882 static void
15883 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15884 rtx pred, rtx op0, rtx op1)
15885 {
15886 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15887 gen_rtvec (3, pred, op0, op1),
15888 aarch64_unspec_cond_code (code));
15889 emit_set_insn (target, unspec);
15890 }
15891
15892 /* Expand an SVE integer comparison using the SVE equivalent of:
15893
15894 (set TARGET (CODE OP0 OP1)). */
15895
15896 void
15897 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15898 {
15899 machine_mode pred_mode = GET_MODE (target);
15900 machine_mode data_mode = GET_MODE (op0);
15901
15902 if (!aarch64_sve_cmp_operand_p (code, op1))
15903 op1 = force_reg (data_mode, op1);
15904
15905 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15906 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15907 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15908 }
15909
15910 /* Emit the SVE equivalent of:
15911
15912 (set TMP1 (CODE1 OP0 OP1))
15913 (set TMP2 (CODE2 OP0 OP1))
15914 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15915
15916 PTRUE is an all-true predicate with the same mode as TARGET. */
15917
15918 static void
15919 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15920 rtx ptrue, rtx op0, rtx op1)
15921 {
15922 machine_mode pred_mode = GET_MODE (ptrue);
15923 rtx tmp1 = gen_reg_rtx (pred_mode);
15924 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15925 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15926 rtx tmp2 = gen_reg_rtx (pred_mode);
15927 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15928 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15929 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15930 }
15931
15932 /* Emit the SVE equivalent of:
15933
15934 (set TMP (CODE OP0 OP1))
15935 (set TARGET (not TMP))
15936
15937 PTRUE is an all-true predicate with the same mode as TARGET. */
15938
15939 static void
15940 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15941 rtx op0, rtx op1)
15942 {
15943 machine_mode pred_mode = GET_MODE (ptrue);
15944 rtx tmp = gen_reg_rtx (pred_mode);
15945 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15946 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15947 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15948 }
15949
15950 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15951
15952 (set TARGET (CODE OP0 OP1))
15953
15954 If CAN_INVERT_P is true, the caller can also handle inverted results;
15955 return true if the result is in fact inverted. */
15956
15957 bool
15958 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15959 rtx op0, rtx op1, bool can_invert_p)
15960 {
15961 machine_mode pred_mode = GET_MODE (target);
15962 machine_mode data_mode = GET_MODE (op0);
15963
15964 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15965 switch (code)
15966 {
15967 case UNORDERED:
15968 /* UNORDERED has no immediate form. */
15969 op1 = force_reg (data_mode, op1);
15970 /* fall through */
15971 case LT:
15972 case LE:
15973 case GT:
15974 case GE:
15975 case EQ:
15976 case NE:
15977 {
15978 /* There is native support for the comparison. */
15979 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15980 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15981 return false;
15982 }
15983
15984 case LTGT:
15985 /* This is a trapping operation (LT or GT). */
15986 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15987 return false;
15988
15989 case UNEQ:
15990 if (!flag_trapping_math)
15991 {
15992 /* This would trap for signaling NaNs. */
15993 op1 = force_reg (data_mode, op1);
15994 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15995 return false;
15996 }
15997 /* fall through */
15998 case UNLT:
15999 case UNLE:
16000 case UNGT:
16001 case UNGE:
16002 if (flag_trapping_math)
16003 {
16004 /* Work out which elements are ordered. */
16005 rtx ordered = gen_reg_rtx (pred_mode);
16006 op1 = force_reg (data_mode, op1);
16007 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16008
16009 /* Test the opposite condition for the ordered elements,
16010 then invert the result. */
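	  /* For example (illustrative): for UNGE the ordered lanes are
	     tested with LT and the result is inverted, yielding
	     "unordered or >=" as required.  */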
16011 if (code == UNEQ)
16012 code = NE;
16013 else
16014 code = reverse_condition_maybe_unordered (code);
16015 if (can_invert_p)
16016 {
16017 aarch64_emit_sve_predicated_cond (target, code,
16018 ordered, op0, op1);
16019 return true;
16020 }
16021 rtx tmp = gen_reg_rtx (pred_mode);
16022 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16023 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16024 return false;
16025 }
16026 break;
16027
16028 case ORDERED:
16029 /* ORDERED has no immediate form. */
16030 op1 = force_reg (data_mode, op1);
16031 break;
16032
16033 default:
16034 gcc_unreachable ();
16035 }
16036
16037 /* There is native support for the inverse comparison. */
16038 code = reverse_condition_maybe_unordered (code);
16039 if (can_invert_p)
16040 {
16041 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16042 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16043 return true;
16044 }
16045 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16046 return false;
16047 }
16048
16049 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16050 of the data being selected and CMP_MODE is the mode of the values being
16051 compared. */
16052
16053 void
16054 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16055 rtx *ops)
16056 {
16057 machine_mode pred_mode
16058 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16059 GET_MODE_SIZE (cmp_mode)).require ();
16060 rtx pred = gen_reg_rtx (pred_mode);
16061 if (FLOAT_MODE_P (cmp_mode))
16062 {
16063 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16064 ops[4], ops[5], true))
16065 std::swap (ops[1], ops[2]);
16066 }
16067 else
16068 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16069
16070 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16071 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16072 }
16073
16074 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
 16075 true. However, due to issues with register allocation it is preferable
 16076 to avoid tying integer scalar and FP scalar modes. Executing integer
16077 operations in general registers is better than treating them as scalar
16078 vector operations. This reduces latency and avoids redundant int<->FP
16079 moves. So tie modes if they are either the same class, or vector modes
16080 with other vector modes, vector structs or any scalar mode. */
16081
16082 static bool
16083 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16084 {
16085 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16086 return true;
16087
16088 /* We specifically want to allow elements of "structure" modes to
16089 be tieable to the structure. This more general condition allows
16090 other rarer situations too. The reason we don't extend this to
16091 predicate modes is that there are no predicate structure modes
16092 nor any specific instructions for extracting part of a predicate
16093 register. */
16094 if (aarch64_vector_data_mode_p (mode1)
16095 && aarch64_vector_data_mode_p (mode2))
16096 return true;
16097
16098 /* Also allow any scalar modes with vectors. */
16099 if (aarch64_vector_mode_supported_p (mode1)
16100 || aarch64_vector_mode_supported_p (mode2))
16101 return true;
16102
16103 return false;
16104 }
16105
16106 /* Return a new RTX holding the result of moving POINTER forward by
16107 AMOUNT bytes. */
16108
16109 static rtx
16110 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16111 {
16112 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16113
16114 return adjust_automodify_address (pointer, GET_MODE (pointer),
16115 next, amount);
16116 }
16117
16118 /* Return a new RTX holding the result of moving POINTER forward by the
16119 size of the mode it points to. */
16120
16121 static rtx
16122 aarch64_progress_pointer (rtx pointer)
16123 {
16124 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16125 }
16126
16127 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16128 MODE bytes. */
16129
16130 static void
16131 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16132 machine_mode mode)
16133 {
16134 rtx reg = gen_reg_rtx (mode);
16135
16136 /* "Cast" the pointers to the correct mode. */
16137 *src = adjust_address (*src, mode, 0);
16138 *dst = adjust_address (*dst, mode, 0);
16139 /* Emit the memcpy. */
16140 emit_move_insn (reg, *src);
16141 emit_move_insn (*dst, reg);
16142 /* Move the pointers forward. */
16143 *src = aarch64_progress_pointer (*src);
16144 *dst = aarch64_progress_pointer (*dst);
16145 }
16146
16147 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16148 we succeed, otherwise return false. */
16149
16150 bool
16151 aarch64_expand_movmem (rtx *operands)
16152 {
16153 int n, mode_bits;
16154 rtx dst = operands[0];
16155 rtx src = operands[1];
16156 rtx base;
16157 machine_mode cur_mode = BLKmode, next_mode;
16158 bool speed_p = !optimize_function_for_size_p (cfun);
16159
16160 /* When optimizing for size, give a better estimate of the length of a
16161 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16162 will always require an even number of instructions to do now. And each
 16163 operation requires both a load and a store, so divide the max number by 2. */
16164 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16165
16166 /* We can't do anything smart if the amount to copy is not constant. */
16167 if (!CONST_INT_P (operands[2]))
16168 return false;
16169
16170 n = INTVAL (operands[2]);
16171
16172 /* Try to keep the number of instructions low. For all cases we will do at
16173 most two moves for the residual amount, since we'll always overlap the
16174 remainder. */
16175 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16176 return false;
16177
16178 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16179 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16180
16181 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16182 src = adjust_automodify_address (src, VOIDmode, base, 0);
16183
16184 /* Convert n to bits to make the rest of the code simpler. */
16185 n = n * BITS_PER_UNIT;
16186
16187 while (n > 0)
16188 {
 16189 /* Find the largest mode in which to do the copy without over-reading
 16190 or over-writing. */
16191 opt_scalar_int_mode mode_iter;
16192 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16193 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
16194 cur_mode = mode_iter.require ();
16195
16196 gcc_assert (cur_mode != BLKmode);
16197
16198 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16199 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16200
16201 n -= mode_bits;
16202
16203 /* Do certain trailing copies as overlapping if it's going to be
 16204 cheaper, i.e. fewer instructions to do so. For instance, for a 15
 16205 byte copy it is more efficient to do two overlapping 8 byte copies than
 16206 8 + 4 + 2 + 1. */
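       /* Illustrative trace (following the logic above and below): for a
	  15 byte copy, the first iteration copies 8 bytes; the remaining
	  7 bytes round up to the next 8 byte mode, so the pointers are
	  moved back by 1 byte and a second, overlapping 8 byte copy
	  finishes the job.  */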
16207 next_mode = smallest_mode_for_size (n, MODE_INT);
16208 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16209 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
16210 {
16211 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16212 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16213 n = n_bits;
16214 }
16215 }
16216
16217 return true;
16218 }
16219
16220 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16221 SImode stores. Handle the case when the constant has identical
16222 bottom and top halves. This is beneficial when the two stores can be
16223 merged into an STP and we avoid synthesising potentially expensive
16224 immediates twice. Return true if such a split is possible. */
16225
16226 bool
16227 aarch64_split_dimode_const_store (rtx dst, rtx src)
16228 {
16229 rtx lo = gen_lowpart (SImode, src);
16230 rtx hi = gen_highpart_mode (SImode, DImode, src);
16231
16232 bool size_p = optimize_function_for_size_p (cfun);
16233
16234 if (!rtx_equal_p (lo, hi))
16235 return false;
16236
16237 unsigned int orig_cost
16238 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16239 unsigned int lo_cost
16240 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16241
16242 /* We want to transform:
16243 MOV x1, 49370
16244 MOVK x1, 0x140, lsl 16
16245 MOVK x1, 0xc0da, lsl 32
16246 MOVK x1, 0x140, lsl 48
16247 STR x1, [x0]
16248 into:
16249 MOV w1, 49370
16250 MOVK w1, 0x140, lsl 16
16251 STP w1, w1, [x0]
16252 So we want to perform this only when we save two instructions
16253 or more. When optimizing for size, however, accept any code size
16254 savings we can. */
16255 if (size_p && orig_cost <= lo_cost)
16256 return false;
16257
16258 if (!size_p
16259 && (orig_cost <= lo_cost + 1))
16260 return false;
16261
16262 rtx mem_lo = adjust_address (dst, SImode, 0);
16263 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16264 return false;
16265
16266 rtx tmp_reg = gen_reg_rtx (SImode);
16267 aarch64_expand_mov_immediate (tmp_reg, lo);
16268 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16269 /* Don't emit an explicit store pair as this may not always be profitable.
16270 Let the sched-fusion logic decide whether to merge them. */
16271 emit_move_insn (mem_lo, tmp_reg);
16272 emit_move_insn (mem_hi, tmp_reg);
16273
16274 return true;
16275 }
16276
16277 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
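/* As a rough illustration (assuming the usual 1-to-8 shadow mapping used
   by AddressSanitizer): the shadow byte for an address A lives at
   (A >> 3) + (1 << 36), so address 0x1000 maps to shadow byte
   0x200 + (1 << 36).  */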
16278
16279 static unsigned HOST_WIDE_INT
16280 aarch64_asan_shadow_offset (void)
16281 {
16282 return (HOST_WIDE_INT_1 << 36);
16283 }
16284
16285 static rtx
16286 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16287 int code, tree treeop0, tree treeop1)
16288 {
16289 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16290 rtx op0, op1;
16291 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16292 insn_code icode;
16293 struct expand_operand ops[4];
16294
16295 start_sequence ();
16296 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16297
16298 op_mode = GET_MODE (op0);
16299 if (op_mode == VOIDmode)
16300 op_mode = GET_MODE (op1);
16301
16302 switch (op_mode)
16303 {
16304 case E_QImode:
16305 case E_HImode:
16306 case E_SImode:
16307 cmp_mode = SImode;
16308 icode = CODE_FOR_cmpsi;
16309 break;
16310
16311 case E_DImode:
16312 cmp_mode = DImode;
16313 icode = CODE_FOR_cmpdi;
16314 break;
16315
16316 case E_SFmode:
16317 cmp_mode = SFmode;
16318 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16319 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16320 break;
16321
16322 case E_DFmode:
16323 cmp_mode = DFmode;
16324 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16325 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16326 break;
16327
16328 default:
16329 end_sequence ();
16330 return NULL_RTX;
16331 }
16332
16333 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16334 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16335 if (!op0 || !op1)
16336 {
16337 end_sequence ();
16338 return NULL_RTX;
16339 }
16340 *prep_seq = get_insns ();
16341 end_sequence ();
16342
16343 create_fixed_operand (&ops[0], op0);
16344 create_fixed_operand (&ops[1], op1);
16345
16346 start_sequence ();
16347 if (!maybe_expand_insn (icode, 2, ops))
16348 {
16349 end_sequence ();
16350 return NULL_RTX;
16351 }
16352 *gen_seq = get_insns ();
16353 end_sequence ();
16354
16355 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16356 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16357 }
16358
16359 static rtx
16360 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16361 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16362 {
16363 rtx op0, op1, target;
16364 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16365 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16366 insn_code icode;
16367 struct expand_operand ops[6];
16368 int aarch64_cond;
16369
16370 push_to_sequence (*prep_seq);
16371 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16372
16373 op_mode = GET_MODE (op0);
16374 if (op_mode == VOIDmode)
16375 op_mode = GET_MODE (op1);
16376
16377 switch (op_mode)
16378 {
16379 case E_QImode:
16380 case E_HImode:
16381 case E_SImode:
16382 cmp_mode = SImode;
16383 icode = CODE_FOR_ccmpsi;
16384 break;
16385
16386 case E_DImode:
16387 cmp_mode = DImode;
16388 icode = CODE_FOR_ccmpdi;
16389 break;
16390
16391 case E_SFmode:
16392 cmp_mode = SFmode;
16393 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16394 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16395 break;
16396
16397 case E_DFmode:
16398 cmp_mode = DFmode;
16399 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16400 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16401 break;
16402
16403 default:
16404 end_sequence ();
16405 return NULL_RTX;
16406 }
16407
16408 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16409 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16410 if (!op0 || !op1)
16411 {
16412 end_sequence ();
16413 return NULL_RTX;
16414 }
16415 *prep_seq = get_insns ();
16416 end_sequence ();
16417
16418 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16419 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16420
16421 if (bit_code != AND)
16422 {
16423 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16424 GET_MODE (XEXP (prev, 0))),
16425 VOIDmode, XEXP (prev, 0), const0_rtx);
16426 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16427 }
16428
16429 create_fixed_operand (&ops[0], XEXP (prev, 0));
16430 create_fixed_operand (&ops[1], target);
16431 create_fixed_operand (&ops[2], op0);
16432 create_fixed_operand (&ops[3], op1);
16433 create_fixed_operand (&ops[4], prev);
16434 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16435
16436 push_to_sequence (*gen_seq);
16437 if (!maybe_expand_insn (icode, 6, ops))
16438 {
16439 end_sequence ();
16440 return NULL_RTX;
16441 }
16442
16443 *gen_seq = get_insns ();
16444 end_sequence ();
16445
16446 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16447 }
16448
16449 #undef TARGET_GEN_CCMP_FIRST
16450 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16451
16452 #undef TARGET_GEN_CCMP_NEXT
16453 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16454
16455 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16456 instruction fusion of some sort. */
16457
16458 static bool
16459 aarch64_macro_fusion_p (void)
16460 {
16461 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16462 }
16463
16464
16465 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16466 should be kept together during scheduling. */
16467
16468 static bool
16469 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16470 {
16471 rtx set_dest;
16472 rtx prev_set = single_set (prev);
16473 rtx curr_set = single_set (curr);
16474 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16475 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16476
16477 if (!aarch64_macro_fusion_p ())
16478 return false;
16479
16480 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16481 {
16482 /* We are trying to match:
16483 prev (mov) == (set (reg r0) (const_int imm16))
16484 curr (movk) == (set (zero_extract (reg r0)
16485 (const_int 16)
16486 (const_int 16))
16487 (const_int imm16_1)) */
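/* In assembly this corresponds to a pair such as (illustrative only):
     mov  w1, #0xc0da
     movk w1, #0x140, lsl 16  */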
16488
16489 set_dest = SET_DEST (curr_set);
16490
16491 if (GET_CODE (set_dest) == ZERO_EXTRACT
16492 && CONST_INT_P (SET_SRC (curr_set))
16493 && CONST_INT_P (SET_SRC (prev_set))
16494 && CONST_INT_P (XEXP (set_dest, 2))
16495 && INTVAL (XEXP (set_dest, 2)) == 16
16496 && REG_P (XEXP (set_dest, 0))
16497 && REG_P (SET_DEST (prev_set))
16498 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16499 {
16500 return true;
16501 }
16502 }
16503
16504 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16505 {
16506
16507 /* We're trying to match:
16508 prev (adrp) == (set (reg r1)
16509 (high (symbol_ref ("SYM"))))
16510 curr (add) == (set (reg r0)
16511 (lo_sum (reg r1)
16512 (symbol_ref ("SYM"))))
16513 Note that r0 need not necessarily be the same as r1, especially
16514 during pre-regalloc scheduling. */
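/* For example (illustrative only):
     adrp x1, sym
     add  x0, x1, :lo12:sym  */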
16515
16516 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16517 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16518 {
16519 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16520 && REG_P (XEXP (SET_SRC (curr_set), 0))
16521 && REGNO (XEXP (SET_SRC (curr_set), 0))
16522 == REGNO (SET_DEST (prev_set))
16523 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16524 XEXP (SET_SRC (curr_set), 1)))
16525 return true;
16526 }
16527 }
16528
16529 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16530 {
16531
16532 /* We're trying to match:
16533 prev (movk) == (set (zero_extract (reg r0)
16534 (const_int 16)
16535 (const_int 32))
16536 (const_int imm16_1))
16537 curr (movk) == (set (zero_extract (reg r0)
16538 (const_int 16)
16539 (const_int 48))
16540 (const_int imm16_2)) */
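/* I.e. a back-to-back pair such as (illustrative only):
     movk x0, #0x1234, lsl 32
     movk x0, #0x5678, lsl 48  */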
16541
16542 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16543 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16544 && REG_P (XEXP (SET_DEST (prev_set), 0))
16545 && REG_P (XEXP (SET_DEST (curr_set), 0))
16546 && REGNO (XEXP (SET_DEST (prev_set), 0))
16547 == REGNO (XEXP (SET_DEST (curr_set), 0))
16548 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16549 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16550 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16551 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16552 && CONST_INT_P (SET_SRC (prev_set))
16553 && CONST_INT_P (SET_SRC (curr_set)))
16554 return true;
16555
16556 }
16557 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16558 {
16559 /* We're trying to match:
16560 prev (adrp) == (set (reg r0)
16561 (high (symbol_ref ("SYM"))))
16562 curr (ldr) == (set (reg r1)
16563 (mem (lo_sum (reg r0)
16564 (symbol_ref ("SYM")))))
16565 or
16566 curr (ldr) == (set (reg r1)
16567 (zero_extend (mem
16568 (lo_sum (reg r0)
16569 (symbol_ref ("SYM")))))) */
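/* For example (illustrative only):
     adrp x0, sym
     ldr  w1, [x0, #:lo12:sym]  */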
16570 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16571 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16572 {
16573 rtx curr_src = SET_SRC (curr_set);
16574
16575 if (GET_CODE (curr_src) == ZERO_EXTEND)
16576 curr_src = XEXP (curr_src, 0);
16577
16578 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16579 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16580 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16581 == REGNO (SET_DEST (prev_set))
16582 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16583 XEXP (SET_SRC (prev_set), 0)))
16584 return true;
16585 }
16586 }
16587
16588 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16589 && aarch_crypto_can_dual_issue (prev, curr))
16590 return true;
16591
16592 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16593 && any_condjump_p (curr))
16594 {
16595 enum attr_type prev_type = get_attr_type (prev);
16596
16597 unsigned int condreg1, condreg2;
16598 rtx cc_reg_1;
16599 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16600 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16601
16602 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16603 && prev
16604 && modified_in_p (cc_reg_1, prev))
16605 {
16606 /* FIXME: this misses some instructions that ThunderX considers simple
16607 arithmetic; for example, simple shifts are missed here. */
16608 if (prev_type == TYPE_ALUS_SREG
16609 || prev_type == TYPE_ALUS_IMM
16610 || prev_type == TYPE_LOGICS_REG
16611 || prev_type == TYPE_LOGICS_IMM)
16612 return true;
16613 }
16614 }
16615
16616 if (prev_set
16617 && curr_set
16618 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16619 && any_condjump_p (curr))
16620 {
16621 /* We're trying to match:
16622 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16623 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16624 (const_int 0))
16625 (label_ref ("SYM"))
16626 (pc)) */
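/* For example (illustrative only):
     add  x0, x1, #8
     cbnz x0, .Ltarget  */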
16627 if (SET_DEST (curr_set) == (pc_rtx)
16628 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16629 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16630 && REG_P (SET_DEST (prev_set))
16631 && REGNO (SET_DEST (prev_set))
16632 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16633 {
16634 /* Fuse ALU operations followed by a conditional branch instruction. */
16635 switch (get_attr_type (prev))
16636 {
16637 case TYPE_ALU_IMM:
16638 case TYPE_ALU_SREG:
16639 case TYPE_ADC_REG:
16640 case TYPE_ADC_IMM:
16641 case TYPE_ADCS_REG:
16642 case TYPE_ADCS_IMM:
16643 case TYPE_LOGIC_REG:
16644 case TYPE_LOGIC_IMM:
16645 case TYPE_CSEL:
16646 case TYPE_ADR:
16647 case TYPE_MOV_IMM:
16648 case TYPE_SHIFT_REG:
16649 case TYPE_SHIFT_IMM:
16650 case TYPE_BFM:
16651 case TYPE_RBIT:
16652 case TYPE_REV:
16653 case TYPE_EXTEND:
16654 return true;
16655
16656 default:;
16657 }
16658 }
16659 }
16660
16661 return false;
16662 }
16663
16664 /* Return true iff the instruction fusion described by OP is enabled. */
16665
16666 bool
16667 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16668 {
16669 return (aarch64_tune_params.fusible_ops & op) != 0;
16670 }
16671
16672 /* If MEM's address is in the form of [base+offset], extract the two
16673 parts into BASE and OFFSET and return true; otherwise return false
16674 after clearing BASE and OFFSET. */
16675
16676 bool
16677 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16678 {
16679 rtx addr;
16680
16681 gcc_assert (MEM_P (mem));
16682
16683 addr = XEXP (mem, 0);
16684
16685 if (REG_P (addr))
16686 {
16687 *base = addr;
16688 *offset = const0_rtx;
16689 return true;
16690 }
16691
16692 if (GET_CODE (addr) == PLUS
16693 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16694 {
16695 *base = XEXP (addr, 0);
16696 *offset = XEXP (addr, 1);
16697 return true;
16698 }
16699
16700 *base = NULL_RTX;
16701 *offset = NULL_RTX;
16702
16703 return false;
16704 }
16705
16706 /* Types for scheduling fusion. */
16707 enum sched_fusion_type
16708 {
16709 SCHED_FUSION_NONE = 0,
16710 SCHED_FUSION_LD_SIGN_EXTEND,
16711 SCHED_FUSION_LD_ZERO_EXTEND,
16712 SCHED_FUSION_LD,
16713 SCHED_FUSION_ST,
16714 SCHED_FUSION_NUM
16715 };
16716
16717 /* If INSN is a load or store with an address in the form of [base+offset],
16718 extract the two parts into BASE and OFFSET. Return the scheduling
16719 fusion type of this INSN. */
16720
16721 static enum sched_fusion_type
16722 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16723 {
16724 rtx x, dest, src;
16725 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16726
16727 gcc_assert (INSN_P (insn));
16728 x = PATTERN (insn);
16729 if (GET_CODE (x) != SET)
16730 return SCHED_FUSION_NONE;
16731
16732 src = SET_SRC (x);
16733 dest = SET_DEST (x);
16734
16735 machine_mode dest_mode = GET_MODE (dest);
16736
16737 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16738 return SCHED_FUSION_NONE;
16739
16740 if (GET_CODE (src) == SIGN_EXTEND)
16741 {
16742 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16743 src = XEXP (src, 0);
16744 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16745 return SCHED_FUSION_NONE;
16746 }
16747 else if (GET_CODE (src) == ZERO_EXTEND)
16748 {
16749 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16750 src = XEXP (src, 0);
16751 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16752 return SCHED_FUSION_NONE;
16753 }
16754
16755 if (GET_CODE (src) == MEM && REG_P (dest))
16756 extract_base_offset_in_addr (src, base, offset);
16757 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16758 {
16759 fusion = SCHED_FUSION_ST;
16760 extract_base_offset_in_addr (dest, base, offset);
16761 }
16762 else
16763 return SCHED_FUSION_NONE;
16764
16765 if (*base == NULL_RTX || *offset == NULL_RTX)
16766 fusion = SCHED_FUSION_NONE;
16767
16768 return fusion;
16769 }
16770
16771 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16772
16773 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16774 and PRI are only calculated for these instructions. For other instructions,
16775 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16776 types of instruction fusion can be added by returning different priorities.
16777
16778 It's important that irrelevant instructions get the largest FUSION_PRI. */
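/* As a rough example: two loads from [x1, 8] and [x1, 12] get the same
   FUSION_PRI (it depends only on the fusion type and the base register),
   while PRI decreases as the offset grows, so the load with the smaller
   offset is scheduled first and the pair stays adjacent.  */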
16779
16780 static void
16781 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16782 int *fusion_pri, int *pri)
16783 {
16784 int tmp, off_val;
16785 rtx base, offset;
16786 enum sched_fusion_type fusion;
16787
16788 gcc_assert (INSN_P (insn));
16789
16790 tmp = max_pri - 1;
16791 fusion = fusion_load_store (insn, &base, &offset);
16792 if (fusion == SCHED_FUSION_NONE)
16793 {
16794 *pri = tmp;
16795 *fusion_pri = tmp;
16796 return;
16797 }
16798
16799 /* Set FUSION_PRI according to fusion type and base register. */
16800 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16801
16802 /* Calculate PRI. */
16803 tmp /= 2;
16804
16805 /* INSN with smaller offset goes first. */
16806 off_val = (int)(INTVAL (offset));
16807 if (off_val >= 0)
16808 tmp -= (off_val & 0xfffff);
16809 else
16810 tmp += ((- off_val) & 0xfffff);
16811
16812 *pri = tmp;
16813 return;
16814 }
16815
16816 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16817 Adjust priority of sha1h instructions so they are scheduled before
16818 other SHA1 instructions. */
16819
16820 static int
16821 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16822 {
16823 rtx x = PATTERN (insn);
16824
16825 if (GET_CODE (x) == SET)
16826 {
16827 x = SET_SRC (x);
16828
16829 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16830 return priority + 10;
16831 }
16832
16833 return priority;
16834 }
16835
16836 /* Given OPERANDS of consecutive load/store, check if we can merge
16837 them into ldp/stp. LOAD is true if they are load instructions.
16838 MODE is the mode of memory operands. */
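/* For instance (illustrative only), this check is what allows
     ldr w0, [x2]
     ldr w1, [x2, 4]
   to be merged into  ldp w0, w1, [x2]  by the ldp/stp peepholes.  */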
16839
16840 bool
16841 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16842 machine_mode mode)
16843 {
16844 HOST_WIDE_INT offval_1, offval_2, msize;
16845 enum reg_class rclass_1, rclass_2;
16846 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16847
16848 if (load)
16849 {
16850 mem_1 = operands[1];
16851 mem_2 = operands[3];
16852 reg_1 = operands[0];
16853 reg_2 = operands[2];
16854 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16855 if (REGNO (reg_1) == REGNO (reg_2))
16856 return false;
16857 }
16858 else
16859 {
16860 mem_1 = operands[0];
16861 mem_2 = operands[2];
16862 reg_1 = operands[1];
16863 reg_2 = operands[3];
16864 }
16865
16866 /* The mems cannot be volatile. */
16867 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16868 return false;
16869
16870 /* If we have SImode and slow unaligned ldp,
16871 check that the alignment is at least 8 bytes. */
16872 if (mode == SImode
16873 && (aarch64_tune_params.extra_tuning_flags
16874 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16875 && !optimize_size
16876 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16877 return false;
16878
16879 /* Check if the addresses are in the form of [base+offset]. */
16880 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16881 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16882 return false;
16883 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16884 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16885 return false;
16886
16887 /* Check if the bases are the same. */
16888 if (!rtx_equal_p (base_1, base_2))
16889 return false;
16890
16891 /* The operands must be of the same size. */
16892 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16893 GET_MODE_SIZE (GET_MODE (mem_2))));
16894
16895 offval_1 = INTVAL (offset_1);
16896 offval_2 = INTVAL (offset_2);
16897 /* We should only be trying this for fixed-sized modes. There is no
16898 SVE LDP/STP instruction. */
16899 msize = GET_MODE_SIZE (mode).to_constant ();
16900 /* Check if the offsets are consecutive. */
16901 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16902 return false;
16903
16904 /* Check if the addresses are clobbered by load. */
16905 if (load)
16906 {
16907 if (reg_mentioned_p (reg_1, mem_1))
16908 return false;
16909
16910 /* In increasing order, the last load can clobber the address. */
16911 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16912 return false;
16913 }
16914
16915 /* One of the memory accesses must be a mempair operand.
16916 If it is not the first one, they need to be swapped by the
16917 peephole. */
16918 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16919 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16920 return false;
16921
16922 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16923 rclass_1 = FP_REGS;
16924 else
16925 rclass_1 = GENERAL_REGS;
16926
16927 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16928 rclass_2 = FP_REGS;
16929 else
16930 rclass_2 = GENERAL_REGS;
16931
16932 /* Check if the registers are of the same class. */
16933 if (rclass_1 != rclass_2)
16934 return false;
16935
16936 return true;
16937 }
16938
16939 /* Given OPERANDS of consecutive load/store that can be merged,
16940 swap them if they are not in ascending order. */
16941 void
16942 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16943 {
16944 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16945 HOST_WIDE_INT offval_1, offval_2;
16946
16947 if (load)
16948 {
16949 mem_1 = operands[1];
16950 mem_2 = operands[3];
16951 }
16952 else
16953 {
16954 mem_1 = operands[0];
16955 mem_2 = operands[2];
16956 }
16957
16958 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16959 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16960
16961 offval_1 = INTVAL (offset_1);
16962 offval_2 = INTVAL (offset_2);
16963
16964 if (offval_1 > offval_2)
16965 {
16966 /* Irrespective of whether this is a load or a store,
16967 we do the same swap. */
16968 std::swap (operands[0], operands[2]);
16969 std::swap (operands[1], operands[3]);
16970 }
16971 }
16972
16973 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16974 comparison between the two. */
16975 int
16976 aarch64_host_wide_int_compare (const void *x, const void *y)
16977 {
16978 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16979 * ((const HOST_WIDE_INT *) y));
16980 }
16981
16982 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16983 other pointing to a REG rtx containing an offset, compare the offsets
16984 of the two pairs.
16985
16986 Return:
16987
16988 1 iff offset (X) > offset (Y)
16989 0 iff offset (X) == offset (Y)
16990 -1 iff offset (X) < offset (Y) */
16991 int
16992 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16993 {
16994 const rtx * operands_1 = (const rtx *) x;
16995 const rtx * operands_2 = (const rtx *) y;
16996 rtx mem_1, mem_2, base, offset_1, offset_2;
16997
16998 if (MEM_P (operands_1[0]))
16999 mem_1 = operands_1[0];
17000 else
17001 mem_1 = operands_1[1];
17002
17003 if (MEM_P (operands_2[0]))
17004 mem_2 = operands_2[0];
17005 else
17006 mem_2 = operands_2[1];
17007
17008 /* Extract the offsets. */
17009 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17010 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17011
17012 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17013
17014 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17015 }
17016
17017 /* Given OPERANDS of consecutive load/store, check if we can merge
17018 them into ldp/stp by adjusting the offset. LOAD is true if they
17019 are load instructions. MODE is the mode of memory operands.
17020
17021 Given the following consecutive stores:
17022
17023 str w1, [xb, 0x100]
17024 str w1, [xb, 0x104]
17025 str w1, [xb, 0x108]
17026 str w1, [xb, 0x10c]
17027
17028 Though the offsets are out of the range supported by stp, we can
17029 still pair them after adjusting the offset, like:
17030
17031 add scratch, xb, 0x100
17032 stp w1, w1, [scratch]
17033 stp w1, w1, [scratch, 0x8]
17034
17035 The peephole patterns detecting this opportunity should guarantee
17036 the scratch register is available. */
17037
17038 bool
17039 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17040 scalar_mode mode)
17041 {
17042 const int num_insns = 4;
17043 enum reg_class rclass;
17044 HOST_WIDE_INT offvals[num_insns], msize;
17045 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17046
17047 if (load)
17048 {
17049 for (int i = 0; i < num_insns; i++)
17050 {
17051 reg[i] = operands[2 * i];
17052 mem[i] = operands[2 * i + 1];
17053
17054 gcc_assert (REG_P (reg[i]));
17055 }
17056
17057 /* Do not attempt to merge the loads if the loads clobber each other. */
17058 for (int i = 0; i < 8; i += 2)
17059 for (int j = i + 2; j < 8; j += 2)
17060 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17061 return false;
17062 }
17063 else
17064 for (int i = 0; i < num_insns; i++)
17065 {
17066 mem[i] = operands[2 * i];
17067 reg[i] = operands[2 * i + 1];
17068 }
17069
17070 /* Skip if memory operand is by itself valid for ldp/stp. */
17071 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17072 return false;
17073
17074 for (int i = 0; i < num_insns; i++)
17075 {
17076 /* The mems cannot be volatile. */
17077 if (MEM_VOLATILE_P (mem[i]))
17078 return false;
17079
17080 /* Check if the addresses are in the form of [base+offset]. */
17081 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17082 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17083 return false;
17084 }
17085
17086 /* Check if addresses are clobbered by load. */
17087 if (load)
17088 for (int i = 0; i < num_insns; i++)
17089 if (reg_mentioned_p (reg[i], mem[i]))
17090 return false;
17091
17092 /* Check if the bases are the same. */
17093 for (int i = 0; i < num_insns - 1; i++)
17094 if (!rtx_equal_p (base[i], base[i + 1]))
17095 return false;
17096
17097 for (int i = 0; i < num_insns; i++)
17098 offvals[i] = INTVAL (offset[i]);
17099
17100 msize = GET_MODE_SIZE (mode);
17101
17102 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17103 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17104 aarch64_host_wide_int_compare);
17105
17106 if (!(offvals[1] == offvals[0] + msize
17107 && offvals[3] == offvals[2] + msize))
17108 return false;
17109
17110 /* Check that the offsets are within range of each other. The ldp/stp
17111 instructions have 7-bit signed scaled immediate offsets, so use 0x80. */
17112 if (offvals[2] - offvals[0] >= msize * 0x80)
17113 return false;
17114
17115 /* The offsets must be aligned with respect to each other. */
17116 if (offvals[0] % msize != offvals[2] % msize)
17117 return false;
17118
17119 /* If we have SImode and slow unaligned ldp,
17120 check that the alignment is at least 8 bytes. */
17121 if (mode == SImode
17122 && (aarch64_tune_params.extra_tuning_flags
17123 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17124 && !optimize_size
17125 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17126 return false;
17127
17128 /* Check if the registers are of the same class. */
17129 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17130 ? FP_REGS : GENERAL_REGS;
17131
17132 for (int i = 1; i < num_insns; i++)
17133 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17134 {
17135 if (rclass != FP_REGS)
17136 return false;
17137 }
17138 else
17139 {
17140 if (rclass != GENERAL_REGS)
17141 return false;
17142 }
17143
17144 return true;
17145 }
17146
17147 /* Given OPERANDS of consecutive load/store, this function pairs them
17148 into LDP/STP after adjusting the offset. It depends on the fact
17149 that the operands can be sorted so the offsets are correct for STP.
17150 MODE is the mode of the memory operands. CODE is the rtl operator
17151 which should be applied to all memory operands; it is SIGN_EXTEND,
17152 ZERO_EXTEND or UNKNOWN. */
17153
17154 bool
17155 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17156 scalar_mode mode, RTX_CODE code)
17157 {
17158 rtx base, offset_1, offset_3, t1, t2;
17159 rtx mem_1, mem_2, mem_3, mem_4;
17160 rtx temp_operands[8];
17161 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17162 stp_off_upper_limit, stp_off_lower_limit, msize;
17163
17164 /* We make changes on a copy as we may still bail out. */
17165 for (int i = 0; i < 8; i ++)
17166 temp_operands[i] = operands[i];
17167
17168 /* Sort the operands. */
17169 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17170
17171 if (load)
17172 {
17173 mem_1 = temp_operands[1];
17174 mem_2 = temp_operands[3];
17175 mem_3 = temp_operands[5];
17176 mem_4 = temp_operands[7];
17177 }
17178 else
17179 {
17180 mem_1 = temp_operands[0];
17181 mem_2 = temp_operands[2];
17182 mem_3 = temp_operands[4];
17183 mem_4 = temp_operands[6];
17184 gcc_assert (code == UNKNOWN);
17185 }
17186
17187 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17188 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17189 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17190 && offset_3 != NULL_RTX);
17191
17192 /* Adjust offset so it can fit in LDP/STP instruction. */
17193 msize = GET_MODE_SIZE (mode);
17194 stp_off_upper_limit = msize * (0x40 - 1);
17195 stp_off_lower_limit = - msize * 0x40;
17196
17197 off_val_1 = INTVAL (offset_1);
17198 off_val_3 = INTVAL (offset_3);
17199
17200 /* The base offset is optimally half way between the two STP/LDP offsets. */
17201 if (msize <= 4)
17202 base_off = (off_val_1 + off_val_3) / 2;
17203 else
17204 /* However, due to issues with negative LDP/STP offset generation for
17205 larger modes (DF, DI and vector modes), we must not use negative
17206 addresses smaller than 9 signed unadjusted bits can store. This
17207 provides the most range in this case. */
17208 base_off = off_val_1;
17209
17210 /* Adjust the base so that it is aligned with the addresses but still
17211 optimal. */
17212 if (base_off % msize != off_val_1 % msize)
17213 /* Fix the offset, bearing in mind we want to make it bigger not
17214 smaller. */
17215 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17216 else if (msize <= 4)
17217 /* The negative range of LDP/STP is one larger than the positive range. */
17218 base_off += msize;
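/* For illustration: with msize == 4 and LDP/STP offsets of 0x100 and
   0x108, base_off starts at 0x104 and the adjustment above moves it to
   0x108, giving new offsets of -8 and 0, both comfortably in range.  */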
17219
17220 /* Check if base offset is too big or too small. We can attempt to resolve
17221 this issue by setting it to the maximum value and seeing if the offsets
17222 still fit. */
17223 if (base_off >= 0x1000)
17224 {
17225 base_off = 0x1000 - 1;
17226 /* We must still make sure that the base offset is aligned with respect
17227 to the address. But it may not be made any bigger. */
17228 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17229 }
17230
17231 /* Likewise for the case where the base is too small. */
17232 if (base_off <= -0x1000)
17233 {
17234 base_off = -0x1000 + 1;
17235 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17236 }
17237
17238 /* Offset of the first STP/LDP. */
17239 new_off_1 = off_val_1 - base_off;
17240
17241 /* Offset of the second STP/LDP. */
17242 new_off_3 = off_val_3 - base_off;
17243
17244 /* The offsets must be within the range of the LDP/STP instructions. */
17245 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17246 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17247 return false;
17248
17249 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17250 new_off_1), true);
17251 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17252 new_off_1 + msize), true);
17253 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17254 new_off_3), true);
17255 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17256 new_off_3 + msize), true);
17257
17258 if (!aarch64_mem_pair_operand (mem_1, mode)
17259 || !aarch64_mem_pair_operand (mem_3, mode))
17260 return false;
17261
17262 if (code == ZERO_EXTEND)
17263 {
17264 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17265 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17266 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17267 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17268 }
17269 else if (code == SIGN_EXTEND)
17270 {
17271 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17272 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17273 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17274 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17275 }
17276
17277 if (load)
17278 {
17279 operands[0] = temp_operands[0];
17280 operands[1] = mem_1;
17281 operands[2] = temp_operands[2];
17282 operands[3] = mem_2;
17283 operands[4] = temp_operands[4];
17284 operands[5] = mem_3;
17285 operands[6] = temp_operands[6];
17286 operands[7] = mem_4;
17287 }
17288 else
17289 {
17290 operands[0] = mem_1;
17291 operands[1] = temp_operands[1];
17292 operands[2] = mem_2;
17293 operands[3] = temp_operands[3];
17294 operands[4] = mem_3;
17295 operands[5] = temp_operands[5];
17296 operands[6] = mem_4;
17297 operands[7] = temp_operands[7];
17298 }
17299
17300 /* Emit adjusting instruction. */
17301 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17302 /* Emit ldp/stp instructions. */
17303 t1 = gen_rtx_SET (operands[0], operands[1]);
17304 t2 = gen_rtx_SET (operands[2], operands[3]);
17305 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17306 t1 = gen_rtx_SET (operands[4], operands[5]);
17307 t2 = gen_rtx_SET (operands[6], operands[7]);
17308 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17309 return true;
17310 }
17311
17312 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17313 it isn't worth branching around empty masked ops (including masked
17314 stores). */
17315
17316 static bool
17317 aarch64_empty_mask_is_expensive (unsigned)
17318 {
17319 return false;
17320 }
17321
17322 /* Return true if a pseudo register should be created and used to hold
17323 the GOT address for PIC code. */
17324
17325 bool
17326 aarch64_use_pseudo_pic_reg (void)
17327 {
17328 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17329 }
17330
17331 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17332
17333 static int
17334 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17335 {
17336 switch (XINT (x, 1))
17337 {
17338 case UNSPEC_GOTSMALLPIC:
17339 case UNSPEC_GOTSMALLPIC28K:
17340 case UNSPEC_GOTTINYPIC:
17341 return 0;
17342 default:
17343 break;
17344 }
17345
17346 return default_unspec_may_trap_p (x, flags);
17347 }
17348
17349
17350 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17351 return the log2 of that value. Otherwise return -1. */
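/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.5
   (not an integer) and 3.0 (not a power of 2) both yield -1.  */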
17352
17353 int
17354 aarch64_fpconst_pow_of_2 (rtx x)
17355 {
17356 const REAL_VALUE_TYPE *r;
17357
17358 if (!CONST_DOUBLE_P (x))
17359 return -1;
17360
17361 r = CONST_DOUBLE_REAL_VALUE (x);
17362
17363 if (REAL_VALUE_NEGATIVE (*r)
17364 || REAL_VALUE_ISNAN (*r)
17365 || REAL_VALUE_ISINF (*r)
17366 || !real_isinteger (r, DFmode))
17367 return -1;
17368
17369 return exact_log2 (real_to_integer (r));
17370 }
17371
17372 /* If X is a vector of equal CONST_DOUBLE values and that value is
17373 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17374
17375 int
17376 aarch64_vec_fpconst_pow_of_2 (rtx x)
17377 {
17378 int nelts;
17379 if (GET_CODE (x) != CONST_VECTOR
17380 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17381 return -1;
17382
17383 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17384 return -1;
17385
17386 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17387 if (firstval <= 0)
17388 return -1;
17389
17390 for (int i = 1; i < nelts; i++)
17391 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17392 return -1;
17393
17394 return firstval;
17395 }
17396
17397 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17398 to float.
17399
17400 __fp16 always promotes through this hook.
17401 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17402 through the generic excess precision logic rather than here. */
17403
17404 static tree
17405 aarch64_promoted_type (const_tree t)
17406 {
17407 if (SCALAR_FLOAT_TYPE_P (t)
17408 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17409 return float_type_node;
17410
17411 return NULL_TREE;
17412 }
17413
17414 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17415
17416 static bool
17417 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17418 optimization_type opt_type)
17419 {
17420 switch (op)
17421 {
17422 case rsqrt_optab:
17423 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17424
17425 default:
17426 return true;
17427 }
17428 }
17429
17430 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17431
17432 static unsigned int
17433 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17434 int *offset)
17435 {
17436 /* Polynomial invariant 1 == (VG / 2) - 1. */
17437 gcc_assert (i == 1);
17438 *factor = 2;
17439 *offset = 1;
17440 return AARCH64_DWARF_VG;
17441 }
17442
17443 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17444 if MODE is HFmode, and punt to the generic implementation otherwise. */
17445
17446 static bool
17447 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17448 {
17449 return (mode == HFmode
17450 ? true
17451 : default_libgcc_floating_mode_supported_p (mode));
17452 }
17453
17454 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17455 if MODE is HFmode, and punt to the generic implementation otherwise. */
17456
17457 static bool
17458 aarch64_scalar_mode_supported_p (scalar_mode mode)
17459 {
17460 return (mode == HFmode
17461 ? true
17462 : default_scalar_mode_supported_p (mode));
17463 }
17464
17465 /* Set the value of FLT_EVAL_METHOD.
17466 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17467
17468 0: evaluate all operations and constants, whose semantic type has at
17469 most the range and precision of type float, to the range and
17470 precision of float; evaluate all other operations and constants to
17471 the range and precision of the semantic type;
17472
17473 N, where _FloatN is a supported interchange floating type:
17474 evaluate all operations and constants, whose semantic type has at
17475 most the range and precision of _FloatN type, to the range and
17476 precision of the _FloatN type; evaluate all other operations and
17477 constants to the range and precision of the semantic type;
17478
17479 If we have the ARMv8.2-A extensions then we support _Float16 in native
17480 precision, so we should set this to 16. Otherwise, we support the type,
17481 but want to evaluate expressions in float precision, so set this to
17482 0. */
17483
17484 static enum flt_eval_method
17485 aarch64_excess_precision (enum excess_precision_type type)
17486 {
17487 switch (type)
17488 {
17489 case EXCESS_PRECISION_TYPE_FAST:
17490 case EXCESS_PRECISION_TYPE_STANDARD:
17491 /* We can calculate either in 16-bit range and precision or
17492 32-bit range and precision. Make that decision based on whether
17493 we have native support for the ARMv8.2-A 16-bit floating-point
17494 instructions or not. */
17495 return (TARGET_FP_F16INST
17496 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17497 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17498 case EXCESS_PRECISION_TYPE_IMPLICIT:
17499 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17500 default:
17501 gcc_unreachable ();
17502 }
17503 return FLT_EVAL_METHOD_UNPREDICTABLE;
17504 }
17505
17506 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17507 scheduled for speculative execution. Reject the long-running division
17508 and square-root instructions. */
17509
17510 static bool
17511 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17512 {
17513 switch (get_attr_type (insn))
17514 {
17515 case TYPE_SDIV:
17516 case TYPE_UDIV:
17517 case TYPE_FDIVS:
17518 case TYPE_FDIVD:
17519 case TYPE_FSQRTS:
17520 case TYPE_FSQRTD:
17521 case TYPE_NEON_FP_SQRT_S:
17522 case TYPE_NEON_FP_SQRT_D:
17523 case TYPE_NEON_FP_SQRT_S_Q:
17524 case TYPE_NEON_FP_SQRT_D_Q:
17525 case TYPE_NEON_FP_DIV_S:
17526 case TYPE_NEON_FP_DIV_D:
17527 case TYPE_NEON_FP_DIV_S_Q:
17528 case TYPE_NEON_FP_DIV_D_Q:
17529 return false;
17530 default:
17531 return true;
17532 }
17533 }
17534
17535 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17536
17537 static int
17538 aarch64_compute_pressure_classes (reg_class *classes)
17539 {
17540 int i = 0;
17541 classes[i++] = GENERAL_REGS;
17542 classes[i++] = FP_REGS;
17543 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17544 registers need to go in PR_LO_REGS at some point during their
17545 lifetime. Splitting it into two halves has the effect of making
17546 all predicates count against PR_LO_REGS, so that we try whenever
17547 possible to restrict the number of live predicates to 8. This
17548 greatly reduces the amount of spilling in certain loops. */
17549 classes[i++] = PR_LO_REGS;
17550 classes[i++] = PR_HI_REGS;
17551 return i;
17552 }
17553
17554 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17555
17556 static bool
17557 aarch64_can_change_mode_class (machine_mode from,
17558 machine_mode to, reg_class_t)
17559 {
17560 if (BYTES_BIG_ENDIAN)
17561 {
17562 bool from_sve_p = aarch64_sve_data_mode_p (from);
17563 bool to_sve_p = aarch64_sve_data_mode_p (to);
17564
17565 /* Don't allow changes between SVE data modes and non-SVE modes.
17566 See the comment at the head of aarch64-sve.md for details. */
17567 if (from_sve_p != to_sve_p)
17568 return false;
17569
17570 /* Don't allow changes in element size: lane 0 of the new vector
17571 would not then be lane 0 of the old vector. See the comment
17572 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17573 description.
17574
17575 In the worst case, this forces a register to be spilled in
17576 one mode and reloaded in the other, which handles the
17577 endianness correctly. */
17578 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17579 return false;
17580 }
17581 return true;
17582 }
17583
17584 /* Implement TARGET_EARLY_REMAT_MODES. */
17585
17586 static void
17587 aarch64_select_early_remat_modes (sbitmap modes)
17588 {
17589 /* SVE values are not normally live across a call, so it should be
17590 worth doing early rematerialization even in VL-specific mode. */
17591 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17592 {
17593 machine_mode mode = (machine_mode) i;
17594 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17595 if (vec_flags & VEC_ANY_SVE)
17596 bitmap_set_bit (modes, i);
17597 }
17598 }
17599
17600 /* Target-specific selftests. */
17601
17602 #if CHECKING_P
17603
17604 namespace selftest {
17605
17606 /* Selftest for the RTL loader.
17607 Verify that the RTL loader copes with a dump from
17608 print_rtx_function. This is essentially just a test that class
17609 function_reader can handle a real dump, but it also verifies
17610 that lookup_reg_by_dump_name correctly handles hard regs.
17611 The presence of hard reg names in the dump means that the test is
17612 target-specific, hence it is in this file. */
17613
17614 static void
17615 aarch64_test_loading_full_dump ()
17616 {
17617 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17618
17619 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17620
17621 rtx_insn *insn_1 = get_insn_by_uid (1);
17622 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17623
17624 rtx_insn *insn_15 = get_insn_by_uid (15);
17625 ASSERT_EQ (INSN, GET_CODE (insn_15));
17626 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17627
17628 /* Verify crtl->return_rtx. */
17629 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17630 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17631 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17632 }
17633
17634 /* Run all target-specific selftests. */
17635
17636 static void
17637 aarch64_run_selftests (void)
17638 {
17639 aarch64_test_loading_full_dump ();
17640 }
17641
17642 } // namespace selftest
17643
17644 #endif /* #if CHECKING_P */
17645
17646 #undef TARGET_ADDRESS_COST
17647 #define TARGET_ADDRESS_COST aarch64_address_cost
17648
17649 /* This hook determines whether unnamed bitfields affect the alignment
17650 of the containing structure. The hook returns true if the structure
17651 should inherit the alignment requirements of an unnamed bitfield's
17652 type. */
17653 #undef TARGET_ALIGN_ANON_BITFIELD
17654 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17655
17656 #undef TARGET_ASM_ALIGNED_DI_OP
17657 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17658
17659 #undef TARGET_ASM_ALIGNED_HI_OP
17660 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17661
17662 #undef TARGET_ASM_ALIGNED_SI_OP
17663 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17664
17665 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17666 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17667 hook_bool_const_tree_hwi_hwi_const_tree_true
17668
17669 #undef TARGET_ASM_FILE_START
17670 #define TARGET_ASM_FILE_START aarch64_start_file
17671
17672 #undef TARGET_ASM_OUTPUT_MI_THUNK
17673 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17674
17675 #undef TARGET_ASM_SELECT_RTX_SECTION
17676 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17677
17678 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17679 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17680
17681 #undef TARGET_BUILD_BUILTIN_VA_LIST
17682 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17683
17684 #undef TARGET_CALLEE_COPIES
17685 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17686
17687 #undef TARGET_CAN_ELIMINATE
17688 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17689
17690 #undef TARGET_CAN_INLINE_P
17691 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17692
17693 #undef TARGET_CANNOT_FORCE_CONST_MEM
17694 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17695
17696 #undef TARGET_CASE_VALUES_THRESHOLD
17697 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17698
17699 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17700 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17701
17702 /* Only the least significant bit is used for initialization guard
17703 variables. */
17704 #undef TARGET_CXX_GUARD_MASK_BIT
17705 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17706
17707 #undef TARGET_C_MODE_FOR_SUFFIX
17708 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17709
17710 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17711 #undef TARGET_DEFAULT_TARGET_FLAGS
17712 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17713 #endif
17714
17715 #undef TARGET_CLASS_MAX_NREGS
17716 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17717
17718 #undef TARGET_BUILTIN_DECL
17719 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17720
17721 #undef TARGET_BUILTIN_RECIPROCAL
17722 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17723
17724 #undef TARGET_C_EXCESS_PRECISION
17725 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17726
17727 #undef TARGET_EXPAND_BUILTIN
17728 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17729
17730 #undef TARGET_EXPAND_BUILTIN_VA_START
17731 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17732
17733 #undef TARGET_FOLD_BUILTIN
17734 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17735
17736 #undef TARGET_FUNCTION_ARG
17737 #define TARGET_FUNCTION_ARG aarch64_function_arg
17738
17739 #undef TARGET_FUNCTION_ARG_ADVANCE
17740 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17741
17742 #undef TARGET_FUNCTION_ARG_BOUNDARY
17743 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17744
17745 #undef TARGET_FUNCTION_ARG_PADDING
17746 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17747
17748 #undef TARGET_GET_RAW_RESULT_MODE
17749 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17750 #undef TARGET_GET_RAW_ARG_MODE
17751 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17752
17753 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17754 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17755
17756 #undef TARGET_FUNCTION_VALUE
17757 #define TARGET_FUNCTION_VALUE aarch64_function_value
17758
17759 #undef TARGET_FUNCTION_VALUE_REGNO_P
17760 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17761
17762 #undef TARGET_GIMPLE_FOLD_BUILTIN
17763 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17764
17765 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17766 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17767
17768 #undef TARGET_INIT_BUILTINS
17769 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17770
17771 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17772 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17773 aarch64_ira_change_pseudo_allocno_class
17774
17775 #undef TARGET_LEGITIMATE_ADDRESS_P
17776 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17777
17778 #undef TARGET_LEGITIMATE_CONSTANT_P
17779 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17780
17781 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17782 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17783 aarch64_legitimize_address_displacement
17784
17785 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17786 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17787
17788 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17789 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17790 aarch64_libgcc_floating_mode_supported_p
17791
17792 #undef TARGET_MANGLE_TYPE
17793 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17794
17795 #undef TARGET_MEMORY_MOVE_COST
17796 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17797
17798 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17799 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17800
17801 #undef TARGET_MUST_PASS_IN_STACK
17802 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17803
17804 /* This target hook should return true if accesses to volatile bitfields
17805 should use the narrowest mode possible. It should return false if these
17806 accesses should use the bitfield container type. */
17807 #undef TARGET_NARROW_VOLATILE_BITFIELD
17808 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17809
17810 #undef TARGET_OPTION_OVERRIDE
17811 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17812
17813 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17814 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17815 aarch64_override_options_after_change
17816
17817 #undef TARGET_OPTION_SAVE
17818 #define TARGET_OPTION_SAVE aarch64_option_save
17819
17820 #undef TARGET_OPTION_RESTORE
17821 #define TARGET_OPTION_RESTORE aarch64_option_restore
17822
17823 #undef TARGET_OPTION_PRINT
17824 #define TARGET_OPTION_PRINT aarch64_option_print
17825
17826 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17827 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17828
17829 #undef TARGET_SET_CURRENT_FUNCTION
17830 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17831
17832 #undef TARGET_PASS_BY_REFERENCE
17833 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17834
17835 #undef TARGET_PREFERRED_RELOAD_CLASS
17836 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17837
17838 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17839 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17840
17841 #undef TARGET_PROMOTED_TYPE
17842 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17843
17844 #undef TARGET_SECONDARY_RELOAD
17845 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17846
17847 #undef TARGET_SHIFT_TRUNCATION_MASK
17848 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17849
17850 #undef TARGET_SETUP_INCOMING_VARARGS
17851 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17852
17853 #undef TARGET_STRUCT_VALUE_RTX
17854 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17855
17856 #undef TARGET_REGISTER_MOVE_COST
17857 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17858
17859 #undef TARGET_RETURN_IN_MEMORY
17860 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17861
17862 #undef TARGET_RETURN_IN_MSB
17863 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17864
17865 #undef TARGET_RTX_COSTS
17866 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17867
17868 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17869 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17870
17871 #undef TARGET_SCHED_ISSUE_RATE
17872 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17873
17874 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17875 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17876 aarch64_sched_first_cycle_multipass_dfa_lookahead
17877
17878 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17879 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17880 aarch64_first_cycle_multipass_dfa_lookahead_guard
17881
17882 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17883 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17884 aarch64_get_separate_components
17885
17886 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17887 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17888 aarch64_components_for_bb
17889
17890 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17891 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17892 aarch64_disqualify_components
17893
17894 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17895 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17896 aarch64_emit_prologue_components
17897
17898 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17899 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17900 aarch64_emit_epilogue_components
17901
17902 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17903 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17904 aarch64_set_handled_components
17905
17906 #undef TARGET_TRAMPOLINE_INIT
17907 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17908
17909 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17910 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17911
17912 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17913 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17914
17915 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17916 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17917 aarch64_builtin_support_vector_misalignment
17918
17919 #undef TARGET_ARRAY_MODE
17920 #define TARGET_ARRAY_MODE aarch64_array_mode
17921
17922 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17923 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17924
17925 #undef TARGET_VECTORIZE_ADD_STMT_COST
17926 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17927
17928 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17929 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17930 aarch64_builtin_vectorization_cost
17931
17932 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17933 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17934
17935 #undef TARGET_VECTORIZE_BUILTINS
17936 #define TARGET_VECTORIZE_BUILTINS
17937
17938 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17939 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17940 aarch64_builtin_vectorized_function
17941
17942 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17943 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17944 aarch64_autovectorize_vector_sizes
17945
17946 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17947 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17948 aarch64_atomic_assign_expand_fenv
17949
17950 /* Section anchor support. */
17951
17952 #undef TARGET_MIN_ANCHOR_OFFSET
17953 #define TARGET_MIN_ANCHOR_OFFSET -256
17954
17955 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17956 byte offset; we can do much more for larger data types, but have no way
17957 to determine the size of the access. We assume accesses are aligned. */
17958 #undef TARGET_MAX_ANCHOR_OFFSET
17959 #define TARGET_MAX_ANCHOR_OFFSET 4095
17960
17961 #undef TARGET_VECTOR_ALIGNMENT
17962 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17963
17964 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17965 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17966 aarch64_vectorize_preferred_vector_alignment
17967 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17968 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17969 aarch64_simd_vector_alignment_reachable
17970
17971 /* vec_perm support. */
17972
17973 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17974 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17975 aarch64_vectorize_vec_perm_const
17976
17977 #undef TARGET_VECTORIZE_GET_MASK_MODE
17978 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17979 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17980 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17981 aarch64_empty_mask_is_expensive
17982 #undef TARGET_PREFERRED_ELSE_VALUE
17983 #define TARGET_PREFERRED_ELSE_VALUE \
17984 aarch64_preferred_else_value
17985
17986 #undef TARGET_INIT_LIBFUNCS
17987 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17988
17989 #undef TARGET_FIXED_CONDITION_CODE_REGS
17990 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17991
17992 #undef TARGET_FLAGS_REGNUM
17993 #define TARGET_FLAGS_REGNUM CC_REGNUM
17994
17995 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17996 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17997
17998 #undef TARGET_ASAN_SHADOW_OFFSET
17999 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18000
18001 #undef TARGET_LEGITIMIZE_ADDRESS
18002 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18003
18004 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18005 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18006
18007 #undef TARGET_CAN_USE_DOLOOP_P
18008 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18009
18010 #undef TARGET_SCHED_ADJUST_PRIORITY
18011 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18012
18013 #undef TARGET_SCHED_MACRO_FUSION_P
18014 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18015
18016 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18017 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18018
18019 #undef TARGET_SCHED_FUSION_PRIORITY
18020 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18021
18022 #undef TARGET_UNSPEC_MAY_TRAP_P
18023 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18024
18025 #undef TARGET_USE_PSEUDO_PIC_REG
18026 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18027
18028 #undef TARGET_PRINT_OPERAND
18029 #define TARGET_PRINT_OPERAND aarch64_print_operand
18030
18031 #undef TARGET_PRINT_OPERAND_ADDRESS
18032 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18033
18034 #undef TARGET_OPTAB_SUPPORTED_P
18035 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18036
18037 #undef TARGET_OMIT_STRUCT_RETURN_REG
18038 #define TARGET_OMIT_STRUCT_RETURN_REG true
18039
18040 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18041 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18042 aarch64_dwarf_poly_indeterminate_value
18043
18044 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18045 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18046 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18047
18048 #undef TARGET_HARD_REGNO_NREGS
18049 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18050 #undef TARGET_HARD_REGNO_MODE_OK
18051 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18052
18053 #undef TARGET_MODES_TIEABLE_P
18054 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18055
18056 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18057 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18058 aarch64_hard_regno_call_part_clobbered
18059
18060 #undef TARGET_CONSTANT_ALIGNMENT
18061 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18062
18063 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18064 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18065
18066 #undef TARGET_CAN_CHANGE_MODE_CLASS
18067 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18068
18069 #undef TARGET_SELECT_EARLY_REMAT_MODES
18070 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18071
18072 #if CHECKING_P
18073 #undef TARGET_RUN_TARGET_SELFTESTS
18074 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18075 #endif /* #if CHECKING_P */
18076
18077 struct gcc_target targetm = TARGET_INITIALIZER;
18078
18079 #include "gt-aarch64.h"