/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2018 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#define INCLUDE_STRING
#include "system.h"
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type {
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};
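/* Illustrative assembly forms for each classification (the register
   numbers are arbitrary examples, not requirements):

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, .Lliteral_pool_entry  */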
struct aarch64_address_info {
  enum aarch64_address_type type;
  rtx base;
  rtx offset;
  poly_int64 const_offset;
  int shift;
  enum aarch64_symbol_type symbol_type;
};
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The value of each element if all elements are the same, or the
     first value if the constant is a series.  */
  rtx value;

  /* The value of the step if the constant is a series, null otherwise.  */
  rtx step;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  /* The kind of shift modifier to use, and the number of bits to shift.
     This is (LSL, 0) if no shift is needed.  */
  modifier_type modifier;
  unsigned int shift;
};
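/* Illustrative example (not from the original sources): a V4SI constant
   whose elements all equal 0x5500 could be described as elt_mode SImode,
   value 0x55, insn MOV, modifier LSL and shift 8, matching the Advanced
   SIMD instruction "movi v0.4s, #0x55, lsl #8".  */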
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
    modifier (LSL), shift (0)
{}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
    step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
{}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to VALUE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
  : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
    modifier (LSL), shift (0)
{}
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode
aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  3, /* register_offset  */
  4, /* register_sextend  */
  3, /* register_zextend  */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  5, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3 /* cond_not_taken_branch_cost  */
};
/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  5, /* vec_int_stmt_cost  */
  6, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};
/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,			/* l1_cache_size  */
  -1,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  512,			/* l2_cache_size  */
  false,		/* prefetch_dynamic_strides  */
  2048,			/* minimum_stride  */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  16*1024,		/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "8",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "4",	/* function_align.  */
  "4",	/* jump_align.  */
  "4",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &exynosm1_prefetch_tune
};
static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};
static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};
static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &qdf24xx_prefetch_tune
};
/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};
/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
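/* The XOR trick above relies on the architectural encoding pairing each
   condition with its inverse in adjacent values: flipping bit 0 maps
   EQ (0) <-> NE (1), CS (2) <-> CC (3), MI <-> PL, and so on.  For example,
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) yields AARCH64_LT.  */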
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
/* Report an error when a floating-point or vector mode is used but the
   required FP/SIMD support is not available.  */
static void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
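/* For reference (per the AArch64 DWARF ABI, not defined in this file):
   x0-x30 map to DWARF registers 0-30, sp to 31, v0-v31 to 64-95, the SVE
   predicate registers p0-p15 to 48-63 and vg to 46.  */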
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
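/* Worked examples (illustrative): a single SVE data vector such as VNx4SI
   classifies as VEC_SVE_DATA (2); an SVE predicate such as VNx16BI as
   VEC_SVE_PRED (4); and an Advanced SIMD structure mode such as OImode
   (a pair of 128-bit vectors) as VEC_ADVSIMD | VEC_STRUCT (9).  */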
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  scalar_mode inner = GET_MODE_INNER (mode);
  if (VECTOR_MODE_P (mode)
      && (inner == QImode
	  || inner == HImode
	  || inner == HFmode
	  || inner == SImode
	  || inner == SFmode
	  || inner == DImode
	  || inner == DFmode))
    {
      if (TARGET_SVE)
	{
	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
	    return VEC_SVE_DATA;
	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
	    return VEC_SVE_DATA | VEC_STRUCT;
	}

      /* This includes V1DF but not V1DI (which doesn't exist).  */
      if (TARGET_SIMD
	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
	return VEC_ADVSIMD;
    }

  return 0;
}
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && IN_RANGE (nelems, 2, 4))
    return mode_for_vector (GET_MODE_INNER (mode),
			    GET_MODE_NUNITS (mode) * nelems);

  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
static opt_machine_mode
aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
{
  if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
    {
      unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
      machine_mode pred_mode;
      if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
	return pred_mode;
    }

  return default_get_mask_mode (nunits, nbytes);
}
/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */
static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */
static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      if (aarch64_sve_data_mode_p (mode))
	return exact_div (GET_MODE_SIZE (mode),
			  BYTES_PER_SVE_VECTOR).to_constant ();
      return CEIL (lowest_size, UNITS_PER_VREG);
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */
static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return PR_REGNUM_P (regno);

  if (PR_REGNUM_P (regno))
    return 0;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
    return true;

  if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */
static bool
aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
{
  return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */
static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}
/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}
/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */
bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
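/* Worked example (illustrative): EXTRACT_IMM == 34 and MULT_IMM == 4
   satisfy the checks above (34 & ~7 == 32, an exact power of two;
   34 & 7 == 2 <= 4; and 4 == 1 << 2), corresponding to a 32-bit value
   sign-extended and scaled by 4, i.e. an "sxtw #2" operand.  */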
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  return cc_reg;
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
/* Return the TLS model to use for ADDR.  */
static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but we
	       are using the page base as GOT base, the first page may be
	       wasted, in the worst scenario, there is only 28K space for
	       GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate an initialization
	       insn for every global access, and allow CSE to remove all
	       redundant ones.

	       The final instruction sequences will look like the following
	       for multiple global variable accesses:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern is changed, the code above which calculates mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	rtx insn;
	rtx mem;
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    else
	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_TLSGD:
      {
	rtx_insn *insns;
	machine_mode mode = GET_MODE (dest);
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	start_sequence ();
	if (TARGET_ILP32)
	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	else
	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();
	end_sequence ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
	return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
      return;

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Apply UNOPTAB to OP and store the result in DEST.  */
static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
	    }
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
	    }
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

/* Return true if a 128-bit move needs to be split into two 64-bit
   moves; FP-to-FP register copies can be done in one instruction.  */
bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */
void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  rtx (*gen) (rtx, rtx, rtx);

  switch (src_mode)
    {
    case E_V8QImode:
      gen = gen_aarch64_simd_combinev8qi;
      break;
    case E_V4HImode:
      gen = gen_aarch64_simd_combinev4hi;
      break;
    case E_V2SImode:
      gen = gen_aarch64_simd_combinev2si;
      break;
    case E_V4HFmode:
      gen = gen_aarch64_simd_combinev4hf;
      break;
    case E_V2SFmode:
      gen = gen_aarch64_simd_combinev2sf;
      break;
    case E_DImode:
      gen = gen_aarch64_simd_combinedi;
      break;
    case E_DFmode:
      gen = gen_aarch64_simd_combinedf;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, src1, src2));
}
/* Split a complex SIMD move.  */
void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      rtx (*gen) (rtx, rtx);

      gcc_assert (VECTOR_MODE_P (src_mode));

      switch (src_mode)
	{
	case E_V16QImode:
	  gen = gen_aarch64_split_simd_movv16qi;
	  break;
	case E_V8HImode:
	  gen = gen_aarch64_split_simd_movv8hi;
	  break;
	case E_V4SImode:
	  gen = gen_aarch64_split_simd_movv4si;
	  break;
	case E_V2DImode:
	  gen = gen_aarch64_split_simd_movv2di;
	  break;
	case E_V8HFmode:
	  gen = gen_aarch64_split_simd_movv8hf;
	  break;
	case E_V4SFmode:
	  gen = gen_aarch64_split_simd_movv4sf;
	  break;
	case E_V2DFmode:
	  gen = gen_aarch64_split_simd_movv2df;
	  break;
	default:
	  gcc_unreachable ();
	}

      emit_insn (gen (dst, src));
      return;
    }
}
/* Return true if X (in mode XMODE) equals the zero-extension of Y
   (in mode YMODE).  */
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
/* Force VALUE into a register in MODE, using X as the destination when
   new pseudo registers cannot be created.  */
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */
static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */
bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
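/* Worked example (illustrative): the number of 32-bit elements in an SVE
   vector is the poly_int64 4 + 4x, i.e. both coefficients equal 4.  FACTOR
   is then 4, which is even, lies in [2, 256] and satisfies
   4 <= 16 * (4 & -4), so the value can be loaded with a single CNTW.  */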
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
                                  unsigned int factor,
                                  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
                        prefix, suffix, operands);
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
                        prefix, suffix, operands, factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
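/* For instance (illustrative): PREFIX "cnt" and OPERANDS "%x0" with
   FACTOR 12 and NELTS_PER_VQ 4 gives shift 2, suffix 'w' and a residual
   multiplier of 12 >> 2 == 3, producing "cntw\t%x0, all, mul #3".  */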
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
                                  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands,
                                           value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
          || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
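/* Concretely (illustrative): ADDPL covers even factors in [-64, 62]
   (predicate-width multiples [-32, 31]) and ADDVL covers multiples of 16
   in [-512, 496] (vector-width multiples [-32, 31]).  A factor of 128,
   for example, is out of ADDPL range but is a valid ADDVL immediate.  */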
/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
          && aarch64_sve_addvl_addpl_immediate_p (value));
}
/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
   and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  /* Use INC or DEC if possible.  */
  if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
    {
      if (aarch64_sve_cnt_immediate_p (offset_value))
        return aarch64_output_sve_cnt_immediate ("inc", "%x0",
                                                 offset_value.coeffs[1], 0);
      if (aarch64_sve_cnt_immediate_p (-offset_value))
        return aarch64_output_sve_cnt_immediate ("dec", "%x0",
                                                 -offset_value.coeffs[1], 0);
    }

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
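/* E.g. (illustrative): an offset with factor 128 is emitted as
   "addvl\t%x0, %x1, #8", while a factor of 6 falls through to
   "addpl\t%x0, %x1, #3".  */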
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

static bool
aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
                                 unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */

bool
aarch64_sve_inc_dec_immediate_p (rtx x)
{
  return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
}
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
                                             nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
                                             nelts_per_vq);
}
static int
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
                                scalar_int_mode mode)
{
  int i;
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;
  int num_insns;

  val = INTVAL (imm);

  if (aarch64_move_imm (val, mode))
    {
      if (generate)
        emit_insn (gen_rtx_SET (dest, imm));
      return 1;
    }

  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero). In that case check to see if the move can be done
     in a smaller mode.  */
  val2 = val & 0xffffffff;
  if (mode == DImode
      && aarch64_move_imm (val2, SImode)
      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
    {
      if (generate)
        emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));

      /* Check if we have to emit a second instruction by checking to see
         if any of the upper 32 bits of the original DI mode value is set.  */
      if (val == val2)
        return 1;

      i = (val >> 48) ? 48 : 32;

      if (generate)
        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                   GEN_INT ((val >> i) & 0xffff)));

      return 2;
    }

  if ((val >> 32) == 0 || mode == SImode)
    {
      if (generate)
        {
          emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
          if (mode == SImode)
            emit_insn (gen_insv_immsi (dest, GEN_INT (16),
                                       GEN_INT ((val >> 16) & 0xffff)));
          else
            emit_insn (gen_insv_immdi (dest, GEN_INT (16),
                                       GEN_INT ((val >> 16) & 0xffff)));
        }
      return 2;
    }

  /* Remaining cases are all for DImode.  */

  mask = 0xffff;
  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
    {
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
         For a 64-bit bitmask try whether changing 16 bits to all ones or
         zeroes creates a valid bitmask.  To check any repeated bitmask,
         try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
        {
          val2 = val & ~mask;
          if (val2 != val && aarch64_bitmask_imm (val2, mode))
            break;
          val2 = val | mask;
          if (val2 != val && aarch64_bitmask_imm (val2, mode))
            break;
          val2 = val2 & ~mask;
          val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
          if (val2 != val && aarch64_bitmask_imm (val2, mode))
            break;
        }
      if (i != 64)
        {
          if (generate)
            {
              emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
              emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                         GEN_INT ((val >> i) & 0xffff)));
            }
          return 2;
        }
    }

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
     otherwise skip zero bits.  */

  num_insns = 1;
  mask = 0xffff;
  val2 = one_match > zero_match ? ~val : val;
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

  if (generate)
    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
                                           ? (val | ~(mask << i))
                                           : (val & (mask << i)))));
  for (i += 16; i < 64; i += 16)
    {
      if ((val2 & (mask << i)) == 0)
        continue;
      if (generate)
        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                   GEN_INT ((val >> i) & 0xffff)));
      num_insns ++;
    }

  return num_insns;
}
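/* A worked example (illustrative): for 0x1234000000005678 the low 32 bits
   (0x5678) are a valid SImode move immediate and the only other nonzero
   bits sit in [63:48], so the expansion is "mov x0, #0x5678" followed by
   "movk x0, #0x1234, lsl #48" - two instructions instead of four.  */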
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (GET_CODE (imm) == CONST_INT)
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
         + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
   a non-polynomial OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
   the immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
                      rtx src, HOST_WIDE_INT offset, rtx temp1,
                      bool frame_related_p, bool emit_move_imm)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));

  HOST_WIDE_INT moffset = abs_hwi (offset);
  rtx_insn *insn;

  if (!moffset)
    {
      if (!rtx_equal_p (dest, src))
        {
          insn = emit_insn (gen_rtx_SET (dest, src));
          RTX_FRAME_RELATED_P (insn) = frame_related_p;
        }
      return;
    }

  /* Single instruction adjustment.  */
  if (aarch64_uimm12_shift (moffset))
    {
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
     and either:

     a) the offset cannot be loaded by a 16-bit move or
     b) there is no spare register into which we can move it.  */
  if (moffset < 0x1000000
      && ((!temp1 && !can_create_pseudo_p ())
          || !aarch64_move_imm (moffset, mode)))
    {
      HOST_WIDE_INT low_off = moffset & 0xfff;

      low_off = offset < 0 ? -low_off : low_off;
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit a move immediate if required and an addition/subtraction.  */
  if (emit_move_imm)
    {
      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
      temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
    }
  insn = emit_insn (offset < 0
                    ? gen_sub3_insn (dest, src, temp1)
                    : gen_add3_insn (dest, src, temp1));
  if (frame_related_p)
    {
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      rtx adj = plus_constant (mode, src, offset);
      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
    }
}
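/* For example (illustrative): OFFSET == 0x123456 is below 0x1000000 and is
   not a 16-bit move immediate, so it is split into "add dest, src, #0x456"
   followed by "add dest, dest, #0x123000", both 12-bit shifted immediates,
   avoiding any transient over-adjustment of SP.  */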
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */

static unsigned int
aarch64_offset_temporaries (bool add_p, poly_int64 offset)
{
  /* This follows the same structure as aarch64_add_offset.  */
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
    return 0;

  unsigned int count = 0;
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
  poly_int64 poly_offset (factor, factor);
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    /* Need one register for the ADDVL/ADDPL result.  */
    count += 1;
  else if (factor != 0)
    {
      factor = abs (factor);
      if (factor > 16 * (factor & -factor))
        /* Need one register for the CNT result and one for the multiplication
           factor.  If necessary, the second temporary can be reused for the
           constant part of the offset.  */
        return 2;
      /* Need one register for the CNT result (which might then
         be shifted).  */
      count += 1;
    }
  return count + aarch64_add_offset_1_temporaries (constant);
}
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

int
aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
   If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
   false to avoid emitting the immediate again.

   TEMP2, if nonnull, is a second temporary register that doesn't
   overlap either DEST or REG.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                    poly_int64 offset, rtx temp1, rtx temp2,
                    bool frame_related_p, bool emit_move_imm = true)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
  gcc_assert (temp1 == NULL_RTX
              || !frame_related_p
              || !reg_overlap_mentioned_p (temp1, dest));
  gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));

  /* Try using ADDVL or ADDPL to add the whole value.  */
  if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
    {
      rtx offset_rtx = gen_int_mode (offset, mode);
      rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
     SVE vector register, over and above the minimum size of 128 bits.
     This is equivalent to half the value returned by CNTD with a
     vector shape of ALL.  */
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;

  /* Try using ADDVL or ADDPL to add the VG-based part.  */
  poly_int64 poly_offset (factor, factor);
  if (src != const0_rtx
      && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    {
      rtx offset_rtx = gen_int_mode (poly_offset, mode);
      if (frame_related_p)
        {
          rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
          RTX_FRAME_RELATED_P (insn) = true;
          src = dest;
        }
      else
        {
          rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
          src = aarch64_force_temporary (mode, temp1, addr);
          temp1 = temp2;
          temp2 = NULL_RTX;
        }
    }
  /* Otherwise use a CNT-based sequence.  */
  else if (factor != 0)
    {
      /* Use a subtraction if we have a negative factor.  */
      rtx_code code = PLUS;
      if (factor < 0)
        {
          factor = -factor;
          code = MINUS;
        }

      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
         into the multiplication.  */
      rtx val;
      int shift = 0;
      if (factor & 1)
        /* Use a right shift by 1.  */
        shift = -1;
      else
        factor /= 2;
      HOST_WIDE_INT low_bit = factor & -factor;
      if (factor <= 16 * low_bit)
        {
          if (factor > 16 * 8)
            {
              /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
                 the value with the minimum multiplier and shift it into
                 position.  */
              int extra_shift = exact_log2 (low_bit);
              shift += extra_shift;
              factor >>= extra_shift;
            }
          val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
        }
      else
        {
          /* Use CNTD, then multiply it by FACTOR.  */
          val = gen_int_mode (poly_int64 (2, 2), mode);
          val = aarch64_force_temporary (mode, temp1, val);

          /* Go back to using a negative multiplication factor if we have
             no register from which to subtract.  */
          if (code == MINUS && src == const0_rtx)
            {
              factor = -factor;
              code = PLUS;
            }
          rtx coeff1 = gen_int_mode (factor, mode);
          coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
          val = gen_rtx_MULT (mode, val, coeff1);
        }

      if (shift > 0)
        {
          /* Multiply by 1 << SHIFT.  */
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
        }
      else if (shift == -1)
        {
          /* Divide by 2.  */
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
        }

      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
      if (src != const0_rtx)
        {
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_fmt_ee (code, mode, src, val);
        }
      else if (code == MINUS)
        {
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_NEG (mode, val);
        }

      if (constant == 0 || frame_related_p)
        {
          rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
          if (frame_related_p)
            {
              RTX_FRAME_RELATED_P (insn) = true;
              add_reg_note (insn, REG_CFA_ADJUST_CFA,
                            gen_rtx_SET (dest, plus_constant (Pmode, src,
                                                              poly_offset)));
            }
          src = dest;
          if (constant == 0)
            return;
        }
      else
        {
          src = aarch64_force_temporary (mode, temp1, val);
          temp1 = temp2;
          temp2 = NULL_RTX;
        }

      emit_move_imm = true;
    }

  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
                        frame_related_p, emit_move_imm);
}
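/* As an illustration (values chosen here, not from the original source):
   an offset of "2 * VL + 4" bytes is the poly_int64 (36, 32); the VG-based
   part (32, 32) is handled by a single "addvl dest, src, #2" and the
   remaining constant 4 is left to aarch64_add_offset_1.  */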
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

static void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
                      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
                      temp1, temp2, true, emit_move_imm);
}
/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
                      temp1, temp2, frame_related_p);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
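/* E.g. (illustrative): expanding (vec_series 0 1) in VNx4SImode yields a
   single "index z0.s, #0, #1", since both operands fit the [-16, 15]
   immediate range; out-of-range bases or steps are first forced into
   scalar registers.  */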
/* Try to duplicate SRC into SVE register DEST, given that SRC is an
   integer of mode INT_MODE.  Return true on success.  */

static bool
aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
                                      rtx src)
{
  /* If the constant is smaller than 128 bits, we can do the move
     using a vector of SRC_MODEs.  */
  if (src_mode != TImode)
    {
      poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
                                     GET_MODE_SIZE (src_mode));
      machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
      emit_move_insn (gen_lowpart (dup_mode, dest),
                      gen_const_vec_duplicate (dup_mode, src));
      return true;
    }

  /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
  src = force_const_mem (src_mode, src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1r_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
  emit_insn (gen_rtx_SET (dest, src));
  return true;
}
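/* A concrete case (illustrative, little-endian): the VNx8HI constant
   { 1, 2, 1, 2, ... } repeats every 32 bits, so the caller recasts it as
   the SImode value 0x00020001 and this function moves it as a VNx4SI
   duplicate of that integer.  */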
/* Expand a move of general CONST_VECTOR SRC into DEST, given that it
   isn't a simple duplicate or series.  */

static void
aarch64_expand_sve_const_vector (rtx dest, rtx src)
{
  machine_mode mode = GET_MODE (src);
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
  gcc_assert (npatterns > 1);

  if (nelts_per_pattern == 1)
    {
      /* The constant is a repeating sequence of at least two elements,
         where the repeating elements occupy no more than 128 bits.
         Get an integer representation of the replicated value.  */
      scalar_int_mode int_mode;
      if (BYTES_BIG_ENDIAN)
        /* For now, always use LD1RQ to load the value on big-endian
           targets, since the handling of smaller integers includes a
           subreg that is semantically an element reverse.  */
        int_mode = TImode;
      else
        {
          unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
          gcc_assert (int_bits <= 128);
          int_mode = int_mode_for_size (int_bits, 0).require ();
        }
      rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
      if (int_value
          && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
        return;
    }

  /* Expand each pattern individually.  */
  rtx_vector_builder builder;
  auto_vec<rtx, 16> vectors (npatterns);
  for (unsigned int i = 0; i < npatterns; ++i)
    {
      builder.new_vector (mode, 1, nelts_per_pattern);
      for (unsigned int j = 0; j < nelts_per_pattern; ++j)
        builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
      vectors.quick_push (force_reg (mode, builder.build ()));
    }

  /* Use permutes to interleave the separate vectors.  */
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
        {
          rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
          rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
          emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
          vectors[i] = tmp;
        }
    }
  gcc_assert (vectors[0] == dest);
}
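/* For instance (illustrative): { 0, 10, 1, 11, 2, 12, ... } has two
   patterns, the series { 0, 1, 2, ... } and { 10, 11, 12, ... }; each is
   built in its own register and a single ZIP1 interleaves them into
   DEST.  */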
/* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
   is a pattern that can be used to set DEST to a replicated scalar
   element.  */

void
aarch64_expand_mov_immediate (rtx dest, rtx imm,
                              rtx (*gen_vec_duplicate) (rtx, rtx))
{
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((GET_CODE (imm) == SYMBOL_REF
       || GET_CODE (imm) == LABEL_REF
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
    {
      rtx mem;
      poly_int64 offset;
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
         before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
         folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
        {
          if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
            emit_insn (gen_rtx_SET (dest, imm));
          else
            {
              /* Do arithmetic on 32-bit values if the result is smaller
                 than that.  */
              if (partial_subreg_p (int_mode, SImode))
                {
                  /* It is invalid to do symbol calculations in modes
                     narrower than SImode.  */
                  gcc_assert (base == const0_rtx);
                  dest = gen_lowpart (SImode, dest);
                  int_mode = SImode;
                }
              if (base != const0_rtx)
                {
                  base = aarch64_force_temporary (int_mode, dest, base);
                  aarch64_add_offset (int_mode, dest, base, offset,
                                      NULL_RTX, NULL_RTX, false);
                }
              else
                aarch64_add_offset (int_mode, dest, base, offset,
                                    dest, NULL_RTX, false);
            }
          return;
        }

      sty = aarch64_classify_symbol (base, const_offset);
      switch (sty)
        {
        case SYMBOL_FORCE_TO_MEM:
          if (const_offset != 0
              && targetm.cannot_force_const_mem (int_mode, imm))
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }

          mem = force_const_mem (ptr_mode, imm);
          gcc_assert (mem);

          /* If we aren't generating PC relative literals, then
             we need to expand the literal pool access carefully.
             This is something that needs to be done in a number
             of places, so could well live as a separate function.  */
          if (!aarch64_pcrelative_literal_loads)
            {
              gcc_assert (can_create_pseudo_p ());
              base = gen_reg_rtx (ptr_mode);
              aarch64_expand_mov_immediate (base, XEXP (mem, 0));
              if (ptr_mode != Pmode)
                base = convert_memory_address (Pmode, base);
              mem = gen_rtx_MEM (ptr_mode, base);
            }

          if (int_mode != ptr_mode)
            mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

          emit_insn (gen_rtx_SET (dest, mem));

          return;

        case SYMBOL_SMALL_TLSGD:
        case SYMBOL_SMALL_TLSDESC:
        case SYMBOL_SMALL_TLSIE:
        case SYMBOL_SMALL_GOT_28K:
        case SYMBOL_SMALL_GOT_4G:
        case SYMBOL_TINY_GOT:
        case SYMBOL_TINY_TLSIE:
          if (const_offset != 0)
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }
          /* FALLTHRU */

        case SYMBOL_SMALL_ABSOLUTE:
        case SYMBOL_TINY_ABSOLUTE:
        case SYMBOL_TLSLE12:
        case SYMBOL_TLSLE24:
        case SYMBOL_TLSLE32:
        case SYMBOL_TLSLE48:
          aarch64_load_symref_appropriately (dest, imm, sty);
          return;

        default:
          gcc_unreachable ();
        }
    }

  if (!CONST_INT_P (imm))
    {
      rtx base, step, value;
      if (GET_CODE (imm) == HIGH
          || aarch64_simd_valid_immediate (imm, NULL))
        emit_insn (gen_rtx_SET (dest, imm));
      else if (const_vec_series_p (imm, &base, &step))
        aarch64_expand_vec_series (dest, base, step);
      else if (const_vec_duplicate_p (imm, &value))
        {
          /* If the constant is out of range of an SVE vector move,
             load it from memory if we can, otherwise move it into
             a register and use a DUP.  */
          scalar_mode inner_mode = GET_MODE_INNER (mode);
          rtx op = force_const_mem (inner_mode, value);
          if (!op)
            op = force_reg (inner_mode, value);
          else if (!aarch64_sve_ld1r_operand_p (op))
            {
              rtx addr = force_reg (Pmode, XEXP (op, 0));
              op = replace_equiv_address (op, addr);
            }
          emit_insn (gen_vec_duplicate (dest, op));
        }
      else if (GET_CODE (imm) == CONST_VECTOR
               && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
        aarch64_expand_sve_const_vector (dest, imm);
      else
        {
          rtx mem = force_const_mem (mode, imm);
          gcc_assert (mem);
          emit_move_insn (dest, mem);
        }

      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true,
                                  as_a <scalar_int_mode> (mode));
}
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
                                                gen_rtvec (2, pred, src),
                                                UNSPEC_MERGE_PTRUE)));
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

         msb                                      lsb
     R2: ...... [1].high  [1].low   [0].high  [0].low
     R1: ...... [3]       [2]       [1]       [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);
  if (GET_CODE (dest) == SUBREG)
    dest = SUBREG_REG (dest);
  if (GET_CODE (src) == SUBREG)
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
          == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
                               UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

static rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with narrower elements
     determines the mode of the operands and the mode with the wider
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = GET_MODE (dest);
  machine_mode mode_with_narrower_elts = GET_MODE (src);
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
  unsigned int unspec;
  if (wider_bytes == 8)
    unspec = UNSPEC_REV64;
  else if (wider_bytes == 4)
    unspec = UNSPEC_REV32;
  else if (wider_bytes == 2)
    unspec = UNSPEC_REV16;
  else
    gcc_unreachable ();
  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();

  /* Emit:

       (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
                  UNSPEC_MERGE_PTRUE))

     with the appropriate modes.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
  src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
  src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
  src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
                        UNSPEC_MERGE_PTRUE);
  emit_insn (gen_rtx_SET (dest, src));
}
static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                 tree exp ATTRIBUTE_UNUSED)
{
  /* Currently, always true.  */
  return true;
}
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
                           machine_mode mode,
                           const_tree type,
                           bool named ATTRIBUTE_UNUSED)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (mode == BLKmode && type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (type && AGGREGATE_TYPE_P (type))
    size = int_size_in_bytes (type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &dummymode, &nregs, NULL))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
/* Return TRUE if VALTYPE is padded to its least significant bits.  */
static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL))
    return false;

  return true;
}
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

static rtx
aarch64_function_value (const_tree type, const_tree func,
                        bool outgoing ATTRIBUTE_UNUSED)
{
  machine_mode mode;
  int unsignedp;
  int count;
  machine_mode ag_mode;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  if (aarch64_return_in_msb (type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
        {
          size += UNITS_PER_WORD - size % UNITS_PER_WORD;
          mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
        }
    }

  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &ag_mode, &count, NULL))
    {
      if (!aarch64_composite_type_p (type, mode))
        {
          gcc_assert (count == 1 && mode == ag_mode);
          return gen_rtx_REG (mode, V0_REGNUM);
        }
      else
        {
          int i;
          rtx par;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
          for (i = 0; i < count; i++)
            {
              rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
              rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
              XVECEXP (par, 0, i) = tmp;
            }
          return par;
        }
    }
  else
    return gen_rtx_REG (mode, R0_REGNUM);
}
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                               &ag_mode, &count, NULL))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
                                                  &pcum->aapcs_vfp_rmode,
                                                  nregs, NULL);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
   This is a helper function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type)
{
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      alignment = std::max (alignment, DECL_ALIGN (field));

  return alignment;
}
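/* For example (illustrative): a plain __int128 argument reports 16-byte
   natural alignment here, which makes aarch64_layout_arg round the next
   general register number up to an even pair (rule C.8 below).  */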
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
                    const_tree type,
                    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  pcum->aapcs_arg_processed = true;

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v, mode, type, &nregs);

  /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
     The following code thus handles passing by SIMD/FP registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      if (!TARGET_FLOAT)
        aarch64_err_no_fpadvsimd (mode);

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
        {
          pcum->aapcs_nextnvrn = nvrn + nregs;
          if (!aarch64_composite_type_p (type, mode))
            {
              gcc_assert (nregs == 1);
              pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
            }
          else
            {
              rtx par;
              int i;
              par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
              for (i = 0; i < nregs; i++)
                {
                  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
                                         V0_REGNUM + nvrn + i);
                  rtx offset = gen_int_mode
                    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
                  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
                  XVECEXP (par, 0, i) = tmp;
                }
              pcum->aapcs_reg = par;
            }
          return;
        }
      else
        {
          /* C.3 NSRN is set to 8.  */
          pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
          goto on_stack;
        }
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
         rounded up to the next even number.  */
      if (nregs == 2
          && ncrn % 2
          /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
             comparison is there because for > 16 * BITS_PER_UNIT
             alignment nregs should be > 2 and therefore it should be
             passed by reference rather than value.  */
          && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
        {
          ++ncrn;
          gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
        }

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
         A reg is still generated for it, but the caller should be smart
         enough not to use it.  */
      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
        pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
        {
          rtx par;
          int i;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
          for (i = 0; i < nregs; i++)
            {
              rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                       GEN_INT (i * UNITS_PER_WORD));
              XVECEXP (par, 0, i) = tmp;
            }
          pcum->aapcs_reg = par;
        }

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  /* C.11  */
  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
    pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
                                       16 / UNITS_PER_WORD);
  return;
}
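/* A worked example (illustrative): struct { float x, y, z, w; } is an HFA
   with four members, so allocate_nvrn is true, nregs is 4 and the struct
   travels in s0-s3 as a PARALLEL of four SFmode registers (assuming v0-v3
   are still free).  */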
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
                      const_tree type, bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (mode == VOIDmode)
    return NULL_RTX;

  aarch64_layout_arg (pcum_v, mode, type, named);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
                              const_tree fntype ATTRIBUTE_UNUSED,
                              rtx libname ATTRIBUTE_UNUSED,
                              const_tree fndecl ATTRIBUTE_UNUSED,
                              unsigned n_named ATTRIBUTE_UNUSED)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;

  if (!TARGET_FLOAT
      && fndecl && TREE_PUBLIC (fndecl)
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                                   &mode, &nregs, NULL))
        aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }
  return;
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
                              machine_mode mode,
                              const_tree type,
                              bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
    {
      aarch64_layout_arg (pcum_v, mode, type, named);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
                  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);
  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
        size = int_size_in_bytes (type);
      else
        /* No frontends can create types with variable-sized modes, so we
           shouldn't be asked to pass or return them.  */
        size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
        return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG  9
#define PROBE_STACK_SECOND_REG 10
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx,
                                    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
         it exceeds SIZE.  If only two probes are needed, this will not
         generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
        {
          emit_set_insn (reg1,
                         plus_constant (Pmode, reg1, -PROBE_INTERVAL));
          emit_stack_probe (reg1);
        }

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
        {
          const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

          emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
          emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
        }
      else
        emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;


      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
                     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
        {
          aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
                                          true, Pmode);
          emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
        }
      else
        emit_set_insn (reg2,
                       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

         do
           {
             TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
             probe at TEST_ADDR
           }
         while (TEST_ADDR != LAST_ADDR)

         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
         until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));


      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
         that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
        {
          HOST_WIDE_INT rem = size - rounded_size;

          if (rem > 256)
            {
              const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

              emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
              emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
            }
          else
            emit_stack_probe (plus_constant (Pmode, reg2, -rem));
        }
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
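/* As an illustration (values chosen here): with PROBE_INTERVAL == 4096,
   FIRST == 0 and SIZE == 8192, the second branch emits exactly two probes,
   at 4096 and 8192 bytes below the incoming SP; the unrolled loop body
   itself runs zero times.  */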
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  xops[1] = GEN_INT (PROBE_INTERVAL);
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at TEST_ADDR.  */
  output_asm_insn ("str\txzr, [%0]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
/* Determine whether a frame chain needs to be generated.  */

static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
4030 /* Mark the registers that need to be saved by the callee and calculate
4031 the size of the callee-saved registers area and frame record (both FP
4032 and LR may be omitted). */
4034 aarch64_layout_frame (void)
4036 HOST_WIDE_INT offset
= 0;
4037 int regno
, last_fp_reg
= INVALID_REGNUM
;
4039 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
4042 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4044 #define SLOT_NOT_REQUIRED (-2)
4045 #define SLOT_REQUIRED (-1)
  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
        = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (regno == R30_REGNUM
            || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && !call_used_regs[regno])
      {
        cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
        last_fp_reg = regno;
      }

  if (cfun->machine->frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset = 2 * UNITS_PER_WORD;
    }

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;
      }

  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        /* If there is an alignment gap between integer and fp callee-saves,
           allocate the last fp register to it if possible.  */
        if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
          {
            cfun->machine->frame.reg_offset[regno] = max_int_offset;
            break;
          }

        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
                 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.hard_fp_offset
    = aligned_upper_bound (varargs_and_saved_regs_size
                           + get_frame_size (),
                           STACK_BOUNDARY / BITS_PER_UNIT);

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
                          STACK_BOUNDARY / BITS_PER_UNIT));
  cfun->machine->frame.frame_size
    = (cfun->machine->frame.hard_fp_offset
       + crtl->outgoing_args_size);

  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_fp_offset;
  if (cfun->machine->frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (crtl->outgoing_args_size, 0))
    {
      /* Simple, small frame with no outgoing arguments:
         stp reg1, reg2, [sp, -frame_size]!
         stp reg3, reg4, [sp, 16]  */
      cfun->machine->frame.callee_adjust = const_size;
    }
  else if (known_lt (crtl->outgoing_args_size
                     + cfun->machine->frame.saved_regs_size, 512)
           && !(cfun->calls_alloca
                && known_lt (cfun->machine->frame.hard_fp_offset,
                             max_push_offset)))
    {
      /* Frame with small outgoing arguments:
         sub sp, sp, frame_size
         stp reg1, reg2, [sp, outgoing_args_size]
         stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
        = cfun->machine->frame.frame_size
          - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
           && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
         stp reg1, reg2, [sp, -hard_fp_offset]!
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.callee_adjust = const_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size
          - cfun->machine->frame.callee_adjust;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
         sub sp, sp, hard_fp_offset
         stp x29, x30, [sp, 0]
         add x29, sp, 0
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.initial_adjust
        = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size
          - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
                           HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
                            plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                          HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
                                              reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                         HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
                  rtx *cfi_ops)
{
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
                                          reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
                        rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
                       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
     function if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  */

static void
aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
                           unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
                                                offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
              == cfun->machine->frame.reg_offset[regno2]))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
                                                     offset));
          insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
                                                    reg2));

          /* The first part of a frame-related parallel insn is
             always assumed to be relevant to the frame
             calculations; subsequent parts, are only
             frame-related if explicitly marked.  */
          RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
          regno = regno2;
        }
      else
        insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}
/* Emit code to restore the callee registers of mode MODE from register
   number START up to and including LIMIT.  Restore from the stack offset
   START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
   Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (machine_mode mode,
                              poly_int64 start_offset, unsigned start,
                              unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      rtx reg, mem;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
              == cfun->machine->frame.reg_offset[regno2]))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
          emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

          *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
          regno = regno2;
        }
      else
        emit_move_insn (reg, mem);

      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

static inline bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                               poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
          && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 4095));
}
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  aarch64_layout_frame ();

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
        poly_int64 offset = cfun->machine->frame.reg_offset[regno];
        if (!frame_pointer_needed)
          offset += cfun->machine->frame.frame_size
                    - cfun->machine->frame.hard_fp_offset;
        /* Check that we can access the stack slot of the register with one
           direct load with no adjustments needed.  */
        if (offset_12bit_unsigned_scaled_p (DImode, offset))
          bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If aarch64_layout_frame has chosen registers to store/restore with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_regs[regno])
        && (bitmap_bit_p (in, regno)
            || bitmap_bit_p (gen, regno)
            || bitmap_bit_p (kill, regno)))
      {
        unsigned regno2, offset, offset2;
        bitmap_set_bit (components, regno);

        /* If there is a callee-save at an adjacent offset, add it too
           to increase the use of LDP/STP.  */
        offset = cfun->machine->frame.reg_offset[regno];
        regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

        if (regno2 <= LAST_SAVED_REGNUM)
          {
            offset2 = cfun->machine->frame.reg_offset[regno2];
            if ((offset & ~8) == (offset2 & ~8))
              bitmap_set_bit (components, regno2);
          }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
                             ? HARD_FRAME_POINTER_REGNUM
                             : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
         so DFmode for the vector registers is enough.  */
      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
        offset += cfun->machine->frame.frame_size
                  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
         Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
        {
          insn = emit_insn (set);
          RTX_FRAME_RELATED_P (insn) = 1;
          if (prologue_p)
            add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
          else
            add_reg_note (insn, REG_CFA_RESTORE, reg);
          break;
        }

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
         mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
          || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
          || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
                       GET_MODE_SIZE (mode)))
        {
          insn = emit_insn (set);
          RTX_FRAME_RELATED_P (insn) = 1;
          if (prologue_p)
            add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
          else
            add_reg_note (insn, REG_CFA_RESTORE, reg);

          regno = regno2;
          continue;
        }

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
        offset2 += cfun->machine->frame.frame_size
                   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
                            : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
        insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
        insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
        {
          add_reg_note (insn, REG_CFA_OFFSET, set);
          add_reg_note (insn, REG_CFA_OFFSET, set2);
        }
      else
        {
          add_reg_note (insn, REG_CFA_RESTORE, reg);
          add_reg_note (insn, REG_CFA_RESTORE, reg2);
        }

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
                            rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
                gen_rtx_SET (mem, regno_reg_rtx[reg]));
}
/* AArch64 stack frames generated by this compiler look like:

        +-------------------------------+
        |                               |
        |  incoming stack arguments     |
        |                               |
        +-------------------------------+
        |                               | <-- incoming stack pointer (aligned)
        |  callee-allocated save area   |
        |  for register varargs         |
        |                               |
        +-------------------------------+
        |  local variables              | <-- frame_pointer_rtx
        |                               |
        +-------------------------------+
        |  padding0                     | \
        +-------------------------------+ |
        |  callee-saved registers       | | frame.saved_regs_size
        +-------------------------------+ |
        |  LR'                          | |
        +-------------------------------+ |
        |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
        +-------------------------------+
        |  dynamic allocation           |
        +-------------------------------+
        |  padding                      |
        +-------------------------------+
        |  outgoing stack arguments     | <-- arg_pointer
        |                               |
        +-------------------------------+
        |                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  aarch64_layout_frame ();

  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      insn = emit_insn (gen_pacisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
        {
          if (maybe_gt (frame_size, PROBE_INTERVAL)
              && maybe_gt (frame_size, get_stack_check_protect ()))
            aarch64_emit_probe_stack_range (get_stack_check_protect (),
                                            (frame_size
                                             - get_stack_check_protect ()));
        }
      else if (maybe_gt (frame_size, 0))
        aarch64_emit_probe_stack_range (get_stack_check_protect (),
                                        frame_size);
    }

  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);

  aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
        {
          reg1 = R29_REGNUM;
          reg2 = R30_REGNUM;
          reg_offset = callee_offset;
          aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
        }
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
                          stack_pointer_rtx, callee_offset,
                          ip1_rtx, ip0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
        {
          /* Variable-sized frames need to describe the save slot
             address using DW_CFA_expression rather than DW_CFA_offset.
             This means that, without taking further action, the
             locations of the registers that we've already saved would
             remain based on the stack pointer even after we redefine
             the CFA based on the frame pointer.  We therefore need new
             DW_CFA_expressions to re-express the save slots with addresses
             based on the frame pointer.  */
          rtx_insn *insn = get_last_insn ();
          gcc_assert (RTX_FRAME_RELATED_P (insn));

          /* Add an explicit CFA definition if this was previously
             implicit.  */
          if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
            {
              rtx src = plus_constant (Pmode, stack_pointer_rtx,
                                       callee_offset);
              add_reg_note (insn, REG_CFA_ADJUST_CFA,
                            gen_rtx_SET (hard_frame_pointer_rtx, src));
            }

          /* Change the save slot expressions for the registers that
             we've already saved.  */
          reg_offset -= callee_offset;
          aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
                                      reg_offset + UNITS_PER_WORD);
          aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
                                      reg_offset);
        }
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
                             callee_adjust != 0 || emit_frame_chain);
  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
                             callee_adjust != 0 || emit_frame_chain);
  aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  aarch64_layout_frame ();

  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  aarch64_layout_frame ();

  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left IP0_REGNUM or
     IP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
                        && final_adjust.is_constant ()
                        && !flag_stack_clash_protection);

  /* We need to add a memory barrier to prevent reads from the deallocated
     stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
                + cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
                        hard_frame_pointer_rtx, -callee_offset,
                        ip1_rtx, ip0_rtx, callee_adjust == 0);
  else
    aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
                    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
                                callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
                                callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
                  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

        1) Sibcalls don't return in a normal way, so if we're about to call one
           we must authenticate.

        2) The RETAA instruction is not available before ARMv8.3-A, so if we
           are generating code for !TARGET_ARMV8_3 we can't use it and must
           explicitly authenticate.

        3) On an eh_return path we make extra stack adjustments to update the
           canonical frame address to be the exception handler's CFA.  We want
           to authenticate using the CFA of the function which calls
           eh_return.  */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      insn = emit_insn (gen_autisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
         EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
         to be SP; letting the CFA move during this adjustment
         is just as correct as retaining the CFA from the body
         of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
                         HOST_WIDE_INT delta,
                         HOST_WIDE_INT vcall_offset,
                         tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer maybe bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
        {
          if (delta >= -256 && delta < 256)
            addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
                                       plus_constant (Pmode, this_rtx, delta));
          else
            aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
                                temp1, temp0, false);
        }

      if (Pmode == ptr_mode)
        aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
        aarch64_emit_move (temp0,
                           gen_rtx_ZERO_EXTEND (Pmode,
                                                gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
        addr = plus_constant (Pmode, temp0, vcall_offset);
      else
        {
          aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
                                          Pmode);
          addr = gen_rtx_PLUS (Pmode, temp0, temp1);
        }

      if (Pmode == ptr_mode)
        aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
        aarch64_emit_move (temp1,
                           gen_rtx_SIGN_EXTEND (Pmode,
                                                gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
        return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
         TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
        iter.skip_subrtxes ();
    }
  return false;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
          );
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
        return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}

/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
          (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}
/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
          != SYMBOL_FORCE_TO_MEM)
        return true;
      else
        /* Avoid generating a 64-bit relocation in ILP32; leave
           to aarch64_expand_mov_immediate to handle it properly.  */
        return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}
/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
          || regno == SP_REGNUM
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
                        machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
            || GET_CODE (x) == ZERO_EXTEND)
           && GET_MODE (x) == DImode
           && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
           && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
               || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
           && GET_MODE (XEXP (x, 0)) == DImode
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
           && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
               || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
           && GET_MODE (XEXP (x, 0)) == DImode
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
            || GET_CODE (x) == ZERO_EXTRACT)
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == MULT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
          || INTVAL (XEXP (x, 2)) != 0)
        shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == MULT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
        shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
            || GET_CODE (x) == ZERO_EXTRACT)
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == ASHIFT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
          || INTVAL (XEXP (x, 2)) != 0)
        shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == ASHIFT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
        shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
           && GET_MODE (x) == Pmode
           && GET_MODE (XEXP (x, 0)) == Pmode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
           && GET_MODE (x) == Pmode
           && GET_MODE (XEXP (x, 0)) == Pmode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      if (type != ADDRESS_REG_REG
          || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
        return false;
    }
  else
    {
      if (shift != 0
          && !(IN_RANGE (shift, 1, 3)
               && known_eq (1 << shift, GET_MODE_SIZE (mode))))
        return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
             && (known_eq (GET_MODE_SIZE (mode), 8)
                 || (known_eq (GET_MODE_SIZE (mode), 16)
                     && (aarch64_tune_params.extra_tuning_flags
                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
           && regno <= LAST_VIRTUAL_POINTER_REGISTER)
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid address of type TYPE for machine mode MODE.
   If it is, fill in INFO appropriately.  STRICT_P is true if
   REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_address (struct aarch64_address_info *info,
                          rtx x, machine_mode mode, bool strict_p,
                          aarch64_addr_query_type type = ADDR_QUERY_M)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;
  poly_int64 offset;

  HOST_WIDE_INT const_size;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TFmode may also use a load/store pair.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
                            || type == ADDR_QUERY_LDP_STP_N
                            || mode == TImode
                            || mode == TFmode
                            || (BYTES_BIG_ENDIAN && advsimd_struct_p));

  /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
     corresponds to the actual size of the memory being loaded/stored and the
     mode of the corresponding addressing mode is half of that.  */
  if (type == ADDR_QUERY_LDP_STP_N
      && known_eq (GET_MODE_SIZE (mode), 16))
    mode = DFmode;

  bool allow_reg_index_p = (!load_store_pair_p
                            && (known_lt (GET_MODE_SIZE (mode), 16)
                                || vec_flags == VEC_ADVSIMD
                                || vec_flags == VEC_SVE_DATA));

  /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
     [Rn, #offset, MUL VL].  */
  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
      && (code != REG && code != PLUS))
    return false;

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  gcc_checking_assert (GET_MODE (x) == VOIDmode
                       || SCALAR_INT_MODE_P (GET_MODE (x)));

  switch (code)
    {
    case REG:
    case SUBREG:
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      info->const_offset = 0;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (! strict_p
          && REG_P (op0)
          && virt_or_elim_regno_p (REGNO (op0))
          && poly_int_rtx_p (op1, &offset))
        {
          info->type = ADDRESS_REG_IMM;
          info->base = op0;
          info->offset = op1;
          info->const_offset = offset;

          return true;
        }

      if (maybe_ne (GET_MODE_SIZE (mode), 0)
          && aarch64_base_register_rtx_p (op0, strict_p)
          && poly_int_rtx_p (op1, &offset))
        {
          info->type = ADDRESS_REG_IMM;
          info->base = op0;
          info->offset = op1;
          info->const_offset = offset;

          /* TImode and TFmode values are allowed in both pairs of X
             registers and individual Q registers.  The available
             address modes are:
             X,X:   7-bit signed scaled offset
             Q:     9-bit signed offset
             We conservatively require an offset representable in either mode.
             When performing the check for pairs of X registers i.e.  LDP/STP
             pass down DImode since that is the natural size of the LDP/STP
             instruction memory accesses.  */
          if (mode == TImode || mode == TFmode)
            return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
                    && (offset_9bit_signed_unscaled_p (mode, offset)
                        || offset_12bit_unsigned_scaled_p (mode, offset)));

          /* A 7bit offset check because OImode will emit a ldp/stp
             instruction (only big endian will get here).
             For ldp/stp instructions, the offset is scaled for the size of a
             single element of the pair.  */
          if (mode == OImode)
            return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

          /* Three 9/12 bit offsets checks because CImode will emit three
             ldr/str instructions (only big endian will get here).  */
          if (mode == CImode)
            return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
                    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
                        || offset_12bit_unsigned_scaled_p (V16QImode,
                                                           offset + 32)));

          /* Two 7bit offsets checks because XImode will emit two ldp/stp
             instructions (only big endian will get here).  */
          if (mode == XImode)
            return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
                    && aarch64_offset_7bit_signed_scaled_p (TImode,
                                                            offset + 32));

          /* Make "m" use the LD1 offset range for SVE data modes, so
             that pre-RTL optimizers like ivopts will work to that
             instead of the wider LDR/STR range.  */
          if (vec_flags == VEC_SVE_DATA)
            return (type == ADDR_QUERY_M
                    ? offset_4bit_signed_scaled_p (mode, offset)
                    : offset_9bit_signed_scaled_p (mode, offset));

          if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
            {
              poly_int64 end_offset = (offset
                                       + GET_MODE_SIZE (mode)
                                       - BYTES_PER_SVE_VECTOR);
              return (type == ADDR_QUERY_M
                      ? offset_4bit_signed_scaled_p (mode, offset)
                      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
                         && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
                                                         end_offset)));
            }

          if (vec_flags == VEC_SVE_PRED)
            return offset_9bit_signed_scaled_p (mode, offset);

          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
                     || known_eq (GET_MODE_SIZE (mode), 8)
                     || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
            return (offset_9bit_signed_unscaled_p (mode, offset)
                    || offset_12bit_unsigned_scaled_p (mode, offset));
        }

      if (allow_reg_index_p)
        {
          /* Look for base + (scaled/extended) index register.  */
          if (aarch64_base_register_rtx_p (op0, strict_p)
              && aarch64_classify_index (info, op1, mode, strict_p))
            {
              info->base = op0;
              return true;
            }
          if (aarch64_base_register_rtx_p (op1, strict_p)
              && aarch64_classify_index (info, op0, mode, strict_p))
            {
              info->base = op1;
              return true;
            }
        }

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
          && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
          && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
          && aarch64_base_register_rtx_p (info->base, strict_p))
        {
          info->offset = XEXP (XEXP (x, 1), 1);
          info->const_offset = offset;

          /* TImode and TFmode values are allowed in both pairs of X
             registers and individual Q registers.  The available
             address modes are:
             X,X:   7-bit signed scaled offset
             Q:     9-bit signed offset
             We conservatively require an offset representable in either
             mode.  */
          if (mode == TImode || mode == TFmode)
            return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
                    && offset_9bit_signed_unscaled_p (mode, offset));

          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
                     || known_eq (GET_MODE_SIZE (mode), 8)
                     || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
            return offset_9bit_signed_unscaled_p (mode, offset);
        }
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
         for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p
          && GET_MODE_SIZE (mode).is_constant (&const_size)
          && const_size >= 4)
        {
          rtx sym, addend;

          split_const (x, &sym, &addend);
          return ((GET_CODE (sym) == LABEL_REF
                   || (GET_CODE (sym) == SYMBOL_REF
                       && CONSTANT_POOL_ADDRESS_P (sym)
                       && aarch64_pcrelative_literal_loads)));
        }
      return false;

    case LO_SUM:
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
          && aarch64_base_register_rtx_p (info->base, strict_p))
        {
          rtx sym, offs;
          split_const (info->offset, &sym, &offs);
          if (GET_CODE (sym) == SYMBOL_REF
              && (aarch64_classify_symbol (sym, INTVAL (offs))
                  == SYMBOL_SMALL_ABSOLUTE))
            {
              /* The symbol and offset must be aligned to the access size.  */
              unsigned int align;

              if (CONSTANT_POOL_ADDRESS_P (sym))
                align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
              else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
                {
                  tree exp = SYMBOL_REF_DECL (sym);
                  align = TYPE_ALIGN (TREE_TYPE (exp));
                  align = aarch64_constant_alignment (exp, align);
                }
              else if (SYMBOL_REF_DECL (sym))
                align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
              else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
                       && SYMBOL_REF_BLOCK (sym) != NULL)
                align = SYMBOL_REF_BLOCK (sym)->alignment;
              else
                align = BITS_PER_UNIT;

              poly_int64 ref_size = GET_MODE_SIZE (mode);
              if (known_eq (ref_size, 0))
                ref_size = GET_MODE_SIZE (DImode);

              return (multiple_p (INTVAL (offs), ref_size)
                      && multiple_p (align / BITS_PER_UNIT, ref_size));
            }
        }
      return false;

    default:
      return false;
    }
}
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}
/* Return true if X is a symbolic constant (a SYMBOL_REF or LABEL_REF,
   possibly with a constant offset).  */

bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
					 poly_int64 orig_offset,
					 machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
	 coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise to increase opportunities for sharing the base
	 address of different sizes.  Unaligned accesses use the signed
	 9-bit range; TImode/TFmode use the intersection of signed
	 scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode)
	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      if (second_offset == 0 || known_eq (orig_offset, second_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
	 modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
      else
	/* Predicates are only handled singly, so we might as well use
	   the full range.  */
	vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
	return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{

  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      scalar_int_mode imode = (mode == HFmode
			       ? SImode
			       : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
/* Return TRUE if rtx X is immediate constant 0.0  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}

/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
    {
      switch (code)
	{
	case EQ:
	case NE:
	case UNORDERED:
	case ORDERED:
	case UNLT:
	case UNLE:
	case UNGT:
	case UNGE:
	case UNEQ:
	  return CCFPmode;

	case LT:
	case LE:
	case GT:
	case GE:
	case LTGT:
	  return CCFPEmode;

	default:
	  gcc_unreachable ();
	}
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && REG_P (x)
      && (code == EQ || code == NE)
      && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
      && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
	  || GET_CODE (x) == NEG
	  || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
	      && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
	  || GET_CODE (x) == LSHIFTRT
	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && GET_CODE (x) == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow.  */
  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
      && code == NE
      && GET_CODE (x) == PLUS
      && GET_CODE (y) == ZERO_EXTEND)
    return CC_Cmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
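/* Example (added for illustration; not part of the original source):
   for (ne (plus x y) (const_int 0)) in DImode, the PLUS case above
   yields CC_NZmode, which allows the comparison to be folded into an
   ADDS instruction that sets the N and Z flags directly.  */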
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
static int
aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
  switch (mode)
    {
    case E_CCFPmode:
    case E_CCFPEmode:
      switch (comp_code)
	{
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LS;
	case LT: return AARCH64_MI;
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case ORDERED: return AARCH64_VC;
	case UNORDERED: return AARCH64_VS;
	case UNLT: return AARCH64_LT;
	case UNLE: return AARCH64_LE;
	case UNGT: return AARCH64_HI;
	case UNGE: return AARCH64_PL;
	default: return -1;
	}
      break;

    case E_CCmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LE;
	case LT: return AARCH64_LT;
	case GEU: return AARCH64_CS;
	case GTU: return AARCH64_HI;
	case LEU: return AARCH64_LS;
	case LTU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_SWPmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_LE;
	case GT: return AARCH64_LT;
	case LE: return AARCH64_GE;
	case LT: return AARCH64_GT;
	case GEU: return AARCH64_LS;
	case GTU: return AARCH64_CC;
	case LEU: return AARCH64_CS;
	case LTU: return AARCH64_HI;
	default: return -1;
	}
      break;

    case E_CC_NZmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_PL;
	case LT: return AARCH64_MI;
	default: return -1;
	}
      break;

    case E_CC_Zmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	default: return -1;
	}
      break;

    case E_CC_Cmode:
      switch (comp_code)
	{
	case NE: return AARCH64_CS;
	case EQ: return AARCH64_CC;
	default: return -1;
	}
      break;

    default:
      return -1;
    }
}
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
				       HOST_WIDE_INT minval,
				       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && IN_RANGE (INTVAL (elt), minval, maxval));
}

bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
				  HOST_WIDE_INT minval,
				  HOST_WIDE_INT maxval)
{
  if (GET_CODE (vec) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
	return false;
    }
  return true;
}
/* N Z C V.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N,	/* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V,	/* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
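/* Usage example (added for illustration; not part of the original source):
   the '%k' operand code below prints aarch64_nzcv_codes[cond_code], so for
   a conditional-compare operand holding AARCH64_NE it prints "4"
   (AARCH64_CC_Z) - the NZCV immediate whose flag state makes a subsequent
   NE test fail when the ccmp guard condition does not hold.  */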
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* We only handle the SVE single-bit immediates here.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    return false;

  return true;
}
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Print operand X to file F in a target specific manner according to CODE.
   The acceptable formatting commands given by CODE are:
     'c':	An integer or symbol address without a preceding #
		sign.
     'C':	Take the duplicated element in a vector constant
		and print it in hex.
     'D':	Take the duplicated element in a vector constant
		and print it as an unsigned integer, in decimal.
     'e':	Print the sign/zero-extend size as a character 8->b,
		16->h, 32->w.
     'p':	Prints N such that 2^N == X (X must be power of 2 and
		const int).
     'P':	Print the number of non-zero bits in X (a const_int).
     'H':	Print the higher numbered register of a pair (TImode)
		of regs.
     'm':	Print a condition (eq, ne, etc).
     'M':	Same as 'm', but invert condition.
     'N':	Take the duplicated element in a vector constant
		and print the negative of it in decimal.
     'b/h/s/d/q':  Print a scalar FP/SIMD register name.
     'S/T/U/V':	Print a FP/SIMD register name for a register list.
		The register printed is the FP/SIMD register name
		of X + 0/1/2/3 for S/T/U/V.
     'R':	Print a scalar FP/SIMD register name + 1.
     'X':	Print bottom 16 bits of integer constant in hex.
     'w/x':	Print a general register name or the zero register
		(32-bit or 64-bit).
     '0':	Print a normal operand, if it's a general register,
		then we assume DImode.
     'k':	Print NZCV for conditional compare instructions.
     'A':	Output address constant representing the first
		argument of X, specifying a relocation offset
		if appropriate.
     'L':	Output constant address specified by X
		with a relocation offset if appropriate.
     'G':	Prints address of X, specifying a PC relative
		relocation mode if appropriate.
     'y':	Output address of LDP or STP - this is used for
		some LDP/STPs which don't use a PARALLEL in their
		pattern (so the mode needs to be adjusted).
     'z':	Output address of a typical LDP or STP.  */
static void
aarch64_print_operand (FILE *f, rtx x, int code)
{
  rtx elt;
  switch (code)
    {
    case 'c':
      switch (GET_CODE (x))
	{
	case CONST_INT:
	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
	  break;

	case SYMBOL_REF:
	  output_addr_const (f, x);
	  break;

	case CONST:
	  if (GET_CODE (XEXP (x, 0)) == PLUS
	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
	    {
	      output_addr_const (f, x);
	      break;
	    }
	  /* Fall through.  */

	default:
	  output_operand_lossage ("unsupported operand for code '%c'", code);
	}
      break;

    case 'e':
      {
	int n;

	if (!CONST_INT_P (x)
	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	switch (n)
	  {
	  case 3:
	    fputc ('b', f);
	    break;
	  case 4:
	    fputc ('h', f);
	    break;
	  case 5:
	    fputc ('w', f);
	    break;
	  default:
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
      }
      break;

    case 'p':
      {
	int n;

	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	asm_fprintf (f, "%d", n);
      }
      break;
    case 'P':
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
      break;

    case 'H':
      if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
      break;

    case 'M':
    case 'm':
      {
	int cond_code;
	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
	if (x == const_true_rtx)
	  {
	    if (code == 'M')
	      fputs ("nv", f);
	    return;
	  }

	if (!COMPARISON_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = aarch64_get_condition_code (x);
	gcc_assert (cond_code >= 0);
	if (code == 'M')
	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
	fputs (aarch64_condition_codes[cond_code], f);
      }
      break;
    case 'N':
      if (!const_vec_duplicate_p (x, &elt))
	{
	  output_operand_lossage ("invalid vector constant");
	  return;
	}

      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	asm_fprintf (f, "%wd", -INTVAL (elt));
      else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
	       && aarch64_print_vector_float_operand (f, x, true))
	;
      else
	{
	  output_operand_lossage ("invalid vector constant");
	  return;
	}
      break;

    case 'b':
    case 'h':
    case 's':
    case 'd':
    case 'q':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
      break;

    case 'S':
    case 'T':
    case 'U':
    case 'V':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d",
		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
		   REGNO (x) - V0_REGNUM + (code - 'S'));
      break;

    case 'R':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
      break;

    case 'X':
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
      break;
    case 'C':
      {
	/* Print a replicated constant in hex.  */
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'D':
      {
	/* Print a replicated constant in decimal, treating it as
	   unsigned.  */
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'w':
    case 'x':
      if (x == const0_rtx
	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
	{
	  asm_fprintf (f, "%czr", code);
	  break;
	}

      if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
	{
	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
	  break;
	}

      if (REG_P (x) && REGNO (x) == SP_REGNUM)
	{
	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
	  break;
	}

      /* Fall through */

    case 0:
      if (x == NULL)
	{
	  output_operand_lossage ("missing operand");
	  return;
	}
      switch (GET_CODE (x))
	{
	case REG:
	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
	    {
	      if (REG_NREGS (x) == 1)
		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
	      else
		{
		  char suffix
		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
			       REGNO (x) - V0_REGNUM, suffix,
			       END_REGNO (x) - V0_REGNUM - 1, suffix);
		}
	    }
	  else
	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
	  break;

	case MEM:
	  output_address (GET_MODE (x), XEXP (x, 0));
	  break;

	case LABEL_REF:
	case SYMBOL_REF:
	  output_addr_const (asm_out_file, x);
	  break;

	case CONST_INT:
	  asm_fprintf (f, "%wd", INTVAL (x));
	  break;

	case CONST:
	  if (!VECTOR_MODE_P (GET_MODE (x)))
	    {
	      output_addr_const (asm_out_file, x);
	      break;
	    }
	  /* fall through */

	case CONST_VECTOR:
	  if (!const_vec_duplicate_p (x, &elt))
	    {
	      output_operand_lossage ("invalid vector constant");
	      return;
	    }

	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	    asm_fprintf (f, "%wd", INTVAL (elt));
	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
		   && aarch64_print_vector_float_operand (f, x, false))
	    ;
	  else
	    {
	      output_operand_lossage ("invalid vector constant");
	      return;
	    }
	  break;

	case CONST_DOUBLE:
	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
	     be getting CONST_DOUBLEs holding integers.  */
	  gcc_assert (GET_MODE (x) != VOIDmode);
	  if (aarch64_float_const_zero_rtx_p (x))
	    {
	      fputc ('0', f);
	      break;
	    }
	  else if (aarch64_float_const_representable_p (x))
	    {
#define buf_size 20
	      char float_buf[buf_size] = {'\0'};
	      real_to_decimal_for_mode (float_buf,
					CONST_DOUBLE_REAL_VALUE (x),
					buf_size, buf_size,
					1, GET_MODE (x));
	      asm_fprintf (asm_out_file, "%s", float_buf);
	      break;
#undef buf_size
	    }
	  output_operand_lossage ("invalid constant");
	  return;
	default:
	  output_operand_lossage ("invalid operand");
	  return;
	}
      break;
    case 'A':
      if (GET_CODE (x) == HIGH)
	x = XEXP (x, 0);

      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel:");
	  break;

	case SYMBOL_TINY_GOT:
	  gcc_unreachable ();
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'L':
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":lo12:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
	  break;

	case SYMBOL_TLSLE12:
	  asm_fprintf (asm_out_file, ":tprel_lo12:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
	  break;

	case SYMBOL_TINY_GOT:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_TINY_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'G':
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_hi12:");
	  break;
	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'k':
      {
	HOST_WIDE_INT cond_code;

	if (!CONST_INT_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = INTVAL (x);
	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
      }
      break;

    case 'y':
    case 'z':
      {
	machine_mode mode = GET_MODE (x);

	if (GET_CODE (x) != MEM
	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
					     code == 'y'
					     ? ADDR_QUERY_LDP_STP_N
					     : ADDR_QUERY_LDP_STP))
	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
      }
      break;

    default:
      output_operand_lossage ("invalid operand prefix '%%%c'", code);
      return;
    }
}
/* Print address 'x' of a memory access with mode 'mode'.
   TYPE is the context required by aarch64_classify_address.  It can either
   be MEM for a normal memory access or PARALLEL for LDP/STP.  */
static bool
aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
				aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;
  unsigned int size;

  /* Check all addresses are Pmode - including ILP32.  */
  if (GET_MODE (x) != Pmode)
    output_operand_lossage ("invalid address mode");

  if (aarch64_classify_address (&addr, x, mode, true, type))
    switch (addr.type)
      {
      case ADDRESS_REG_IMM:
	if (known_eq (addr.const_offset, 0))
	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
	else if (aarch64_sve_data_mode_p (mode))
	  {
	    HOST_WIDE_INT vnum
	      = exact_div (addr.const_offset,
			   BYTES_PER_SVE_VECTOR).to_constant ();
	    asm_fprintf (f, "[%s, #%wd, mul vl]",
			 reg_names[REGNO (addr.base)], vnum);
	  }
	else if (aarch64_sve_pred_mode_p (mode))
	  {
	    HOST_WIDE_INT vnum
	      = exact_div (addr.const_offset,
			   BYTES_PER_SVE_PRED).to_constant ();
	    asm_fprintf (f, "[%s, #%wd, mul vl]",
			 reg_names[REGNO (addr.base)], vnum);
	  }
	else
	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
		       INTVAL (addr.offset));
	return true;

      case ADDRESS_REG_REG:
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)]);
	else
	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)], addr.shift);
	return true;

      case ADDRESS_REG_UXTW:
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return true;

      case ADDRESS_REG_SXTW:
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return true;

      case ADDRESS_REG_WB:
	/* Writeback is only supported for fixed-width modes.  */
	size = GET_MODE_SIZE (mode).to_constant ();
	switch (GET_CODE (x))
	  {
	  case PRE_INC:
	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
	    return true;
	  case POST_INC:
	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
	    return true;
	  case PRE_DEC:
	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
	    return true;
	  case POST_DEC:
	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
	    return true;
	  case PRE_MODIFY:
	    asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return true;
	  case POST_MODIFY:
	    asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return true;
	  default:
	    break;
	  }
	break;

      case ADDRESS_LO_SUM:
	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
	output_addr_const (f, addr.offset);
	asm_fprintf (f, "]");
	return true;

      case ADDRESS_SYMBOLIC:
	output_addr_const (f, x);
	return true;
      }

  return false;
}
/* Print address 'x' of a memory access with mode 'mode'.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
    output_addr_const (f, x);
}

bool
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (GET_CODE (x) == LABEL_REF)
    return true;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return false;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
	{
	  int j;

	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
	      return 1;
	}
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
	return 1;
    }

  return 0;
}
7304 aarch64_regno_regclass (unsigned regno
)
7306 if (GP_REGNUM_P (regno
))
7307 return GENERAL_REGS
;
7309 if (regno
== SP_REGNUM
)
7312 if (regno
== FRAME_POINTER_REGNUM
7313 || regno
== ARG_POINTER_REGNUM
)
7314 return POINTER_REGS
;
7316 if (FP_REGNUM_P (regno
))
7317 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
7319 if (PR_REGNUM_P (regno
))
7320 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      HOST_WIDE_INT size;
      if (GET_MODE_SIZE (mode).is_constant (&size))
	{
	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
							     mode);
	  if (base_offset != 0)
	    {
	      base = plus_constant (Pmode, base, base_offset);
	      base = force_operand (base, NULL_RTX);
	      return plus_constant (Pmode, base, offset - base_offset);
	    }
	}
    }

  return x;
}
7425 static enum insn_code
7426 aarch64_constant_pool_reload_icode (machine_mode mode
)
7431 return CODE_FOR_aarch64_reload_movcpsfdi
;
7434 return CODE_FOR_aarch64_reload_movcpdfdi
;
7437 return CODE_FOR_aarch64_reload_movcptfdi
;
7440 return CODE_FOR_aarch64_reload_movcpv8qidi
;
7443 return CODE_FOR_aarch64_reload_movcpv16qidi
;
7446 return CODE_FOR_aarch64_reload_movcpv4hidi
;
7449 return CODE_FOR_aarch64_reload_movcpv8hidi
;
7452 return CODE_FOR_aarch64_reload_movcpv2sidi
;
7455 return CODE_FOR_aarch64_reload_movcpv4sidi
;
7458 return CODE_FOR_aarch64_reload_movcpv2didi
;
7461 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
static reg_class_t
aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
			  reg_class_t rclass,
			  machine_mode mode,
			  secondary_reload_info *sri)
{
  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
     directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
     comment at the head of aarch64-sve.md for more details about the
     big-endian handling.  */
  if (BYTES_BIG_ENDIAN
      && reg_class_subset_p (rclass, FP_REGS)
      && !((REG_P (x) && HARD_REGISTER_P (x))
	   || aarch64_simd_valid_immediate (x, NULL))
      && aarch64_sve_data_mode_p (mode))
    {
      sri->icode = CODE_FOR_aarch64_sve_reload_be;
      return NO_REGS;
    }

  /* If we have to disable direct literal pool loads and stores because the
     function is too big, then we need a scratch register.  */
  if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
      && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
	  || targetm.vector_mode_supported_p (GET_MODE (x)))
      && !aarch64_pcrelative_literal_loads)
    {
      sri->icode = aarch64_constant_pool_reload_icode (mode);
      return NO_REGS;
    }

  /* Without the TARGET_SIMD instructions we cannot move a Q register
     to a Q register directly.  We need a scratch.  */
  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
      && reg_class_subset_p (rclass, FP_REGS))
    {
      if (mode == TFmode)
	sri->icode = CODE_FOR_aarch64_reload_movtf;
      else if (mode == TImode)
	sri->icode = CODE_FOR_aarch64_reload_movti;
      return NO_REGS;
    }

  /* A TFmode or TImode memory access should be handled via an FP_REGS
     because AArch64 has richer addressing modes for LDR/STR instructions
     than LDP/STP instructions.  */
  if (TARGET_FLOAT && rclass == GENERAL_REGS
      && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
    return FP_REGS;

  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
    return GENERAL_REGS;

  return NO_REGS;
}
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}

poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}

/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}
static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}

static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
		     ptr_mode);
}
static unsigned char
aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
     can hold MODE, but at the moment we need to handle all modes.
     Just ignore any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  unsigned int nregs;
  switch (regclass)
    {
    case TAILCALL_ADDR_REGS:
    case POINTER_REGS:
    case GENERAL_REGS:
    case ALL_REGS:
    case POINTER_AND_FP_REGS:
    case FP_REGS:
    case FP_LO_REGS:
      if (aarch64_sve_data_mode_p (mode)
	  && constant_multiple_p (GET_MODE_SIZE (mode),
				  BYTES_PER_SVE_VECTOR, &nregs))
	return nregs;
      return (aarch64_vector_data_mode_p (mode)
	      ? CEIL (lowest_size, UNITS_PER_VREG)
	      : CEIL (lowest_size, UNITS_PER_WORD));
    case STACK_REG:
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;

    case NO_REGS:
      return 0;

    default:
      break;
    }
  gcc_unreachable ();
}
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     right now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}

static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}

static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
const char *
aarch64_output_casesi (rtx *operands)
{
  char buf[100];
  char label[100];
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
  int index;
  static const char *const patterns[4][2] =
  {
    {
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
    },
    {
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
    },
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    },
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    }
  };

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  assemble_label (asm_out_file, label);
  return "";
}
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
			    rtx x,
			    unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Costs.  */

/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  /* Zero and sign extraction of a widened value.  */
  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
      && XEXP (op, 2) == const0_rtx
      && GET_CODE (XEXP (op, 0)) == MULT
      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
					 XEXP (op, 1)))
    return XEXP (XEXP (op, 0), 0);

  /* It can also be represented (for zero-extend) as an AND with an
     immediate.  */
  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  if (op != x)
    return op;

  return x;
}
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */

static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}


/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
/* Helper function for rtx cost calculation.  Calculate the cost of
   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
   Return the calculated cost of the expression, recursing manually in to
   operands where needed.  */

static int
aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
  rtx op0, op1;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int cost = 0;
  bool compound_p = (outer == PLUS || outer == MINUS);
  machine_mode mode = GET_MODE (x);

  gcc_checking_assert (code == MULT);

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (VECTOR_MODE_P (mode))
    mode = GET_MODE_INNER (mode);

  /* Integer multiply/fma.  */
  if (GET_MODE_CLASS (mode) == MODE_INT)
    {
      /* The multiply will be canonicalized as a shift, cost it as such.  */
      if (aarch64_shift_p (GET_CODE (x))
	  || (CONST_INT_P (op1)
	      && exact_log2 (INTVAL (op1)) > 0))
	{
	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
			   || GET_CODE (op0) == SIGN_EXTEND;
	  if (speed)
	    {
	      if (compound_p)
		{
		  /* If the shift is considered cheap,
		     then don't add any cost.  */
		  if (aarch64_cheap_mult_shift_p (x))
		    ;
		  else if (REG_P (op1))
		    /* ARITH + shift-by-register.  */
		    cost += extra_cost->alu.arith_shift_reg;
		  else if (is_extend)
		    /* ARITH + extended register.  We don't have a cost field
		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
		    cost += extra_cost->alu.extend_arith;
		  else
		    /* ARITH + shift-by-immediate.  */
		    cost += extra_cost->alu.arith_shift;
		}
	      else
		/* LSL (immediate).  */
		cost += extra_cost->alu.shift;

	    }
	  /* Strip extends as we will have costed them in the case above.  */
	  if (is_extend)
	    op0 = aarch64_strip_extend (op0, true);

	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);

	  return cost;
	}

      /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
	 compound and let the below cases handle it.  After all, MNEG is a
	 special-case alias of MSUB.  */
      if (GET_CODE (op0) == NEG)
	{
	  op0 = XEXP (op0, 0);
	  compound_p = true;
	}

      /* Integer multiplies or FMAs have zero/sign extending variants.  */
      if ((GET_CODE (op0) == ZERO_EXTEND
	   && GET_CODE (op1) == ZERO_EXTEND)
	  || (GET_CODE (op0) == SIGN_EXTEND
	      && GET_CODE (op1) == SIGN_EXTEND))
	{
	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);

	  if (speed)
	    {
	      if (compound_p)
		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
		cost += extra_cost->mult[0].extend_add;
	      else
		/* MUL/SMULL/UMULL.  */
		cost += extra_cost->mult[0].extend;
	    }

	  return cost;
	}

      /* This is either an integer multiply or a MADD.  In both cases
	 we want to recurse and cost the operands.  */
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);

      if (speed)
	{
	  if (compound_p)
	    /* MADD/MSUB.  */
	    cost += extra_cost->mult[mode == DImode].add;
	  else
	    /* MUL.  */
	    cost += extra_cost->mult[mode == DImode].simple;
	}

      return cost;
    }
  else
    {
      if (speed)
	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward in
	     which case FNMUL is different than FMUL with operand negation.  */
	  bool neg0 = GET_CODE (op0) == NEG;
	  bool neg1 = GET_CODE (op1) == NEG;
	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
	    {
	      if (neg0)
		op0 = XEXP (op0, 0);
	      if (neg1)
		op1 = XEXP (op1, 0);
	    }

	  if (compound_p)
	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
	    cost += extra_cost->fp[mode == DFmode].fma;
	  else
	    /* FMUL/FNMUL.  */
	    cost += extra_cost->fp[mode == DFmode].mult;
	}

      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      return cost;
    }
}
static int
aarch64_address_cost (rtx x,
		      machine_mode mode,
		      addr_space_t as ATTRIBUTE_UNUSED,
		      bool speed)
{
  enum rtx_code c = GET_CODE (x);
  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
  struct aarch64_address_info info;
  int cost = 0;
  info.shift = 0;

  if (!aarch64_classify_address (&info, x, mode, false))
    {
      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
	{
	  /* This is a CONST or SYMBOL ref which will be split
	     in a different way depending on the code model in use.
	     Cost it through the generic infrastructure.  */
	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
	  /* Divide through by the cost of one instruction to
	     bring it to the same units as the address costs.  */
	  cost_symbol_ref /= COSTS_N_INSNS (1);
	  /* The cost is then the cost of preparing the address,
	     followed by an immediate (possibly 0) offset.  */
	  return cost_symbol_ref + addr_cost->imm_offset;
	}
      else
	{
	  /* This is most likely a jump table from a case
	     statement.  */
	  return addr_cost->register_offset;
	}
    }

  switch (info.type)
    {
      case ADDRESS_LO_SUM:
      case ADDRESS_SYMBOLIC:
      case ADDRESS_REG_IMM:
	cost += addr_cost->imm_offset;
	break;

      case ADDRESS_REG_WB:
	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
	  cost += addr_cost->pre_modify;
	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
	  cost += addr_cost->post_modify;
	else
	  gcc_unreachable ();

	break;

      case ADDRESS_REG_REG:
	cost += addr_cost->register_offset;
	break;

      case ADDRESS_REG_SXTW:
	cost += addr_cost->register_sextend;
	break;

      case ADDRESS_REG_UXTW:
	cost += addr_cost->register_zextend;
	break;

      default:
	gcc_unreachable ();
    }

  if (info.shift > 0)
    {
      /* For the sake of calculating the cost of the shifted register
	 component, we can treat same sized modes in the same way.  */
      if (known_eq (GET_MODE_BITSIZE (mode), 16))
	cost += addr_cost->addr_scale_costs.hi;
      else if (known_eq (GET_MODE_BITSIZE (mode), 32))
	cost += addr_cost->addr_scale_costs.si;
      else if (known_eq (GET_MODE_BITSIZE (mode), 64))
	cost += addr_cost->addr_scale_costs.di;
      else
	/* We can't tell, or this is a 128-bit vector.  */
	cost += addr_cost->addr_scale_costs.ti;
    }

  return cost;
}
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}

/* Return true if the RTX X in mode MODE is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
{
  /* Catch add with a sign extract.
     This is add_<optab><mode>_multp2.  */
  if (GET_CODE (x) == SIGN_EXTRACT
      || GET_CODE (x) == ZERO_EXTRACT)
    {
      rtx op0 = XEXP (x, 0);
      rtx op1 = XEXP (x, 1);
      rtx op2 = XEXP (x, 2);

      if (GET_CODE (op0) == MULT
	  && CONST_INT_P (op1)
	  && op2 == const0_rtx
	  && CONST_INT_P (XEXP (op0, 1))
	  && aarch64_is_extend_from_extract (mode,
					     XEXP (op0, 1),
					     op1))
	return true;
    }
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  else if (GET_CODE (x) == SIGN_EXTEND
	   || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}
/* Return true iff U is one of the FRINT rounding unspecs.  */

static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
	return true;

      default:
	return false;
    }
}
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	return true;
      else
	{
	  if (cmpcode == NE || cmpcode == EQ)
	    {
	      if (comparator == const0_rtx)
		{
		  /* TBZ/TBNZ/CBZ/CBNZ.  */
		  if (GET_CODE (inner) == ZERO_EXTRACT)
		    /* TBZ/TBNZ.  */
		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				       ZERO_EXTRACT, 0, speed);
		  else
		    /* CBZ/CBNZ.  */
		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

		  return true;
		}
	    }
	  else if (cmpcode == LT || cmpcode == GE)
	    {
	      /* TBZ/TBNZ.  */
	      if (comparator == const0_rtx)
		return true;
	    }
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
        if (CONST_INT_P (XEXP (inner, 1))
            && (inner_mode == QImode || inner_mode == HImode))
          op = XEXP (inner, 0);
        break;
      case LSHIFTRT:
        if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
            && (inner_mode == QImode || inner_mode == HImode))
          op = XEXP (inner, 0);
        break;
      case ASHIFTRT:
        if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
            && (inner_mode == QImode || inner_mode == HImode))
          op = XEXP (inner, 0);
        break;
      default:
        break;
    }

  return op;
}
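/* For example (an illustrative sketch, not from the original source):
   (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))) is recognised
   here and (reg:HI r) is returned; the shift + zero-extend pair maps onto
   a single UBFX extracting the field of r that starts at bit 3.  */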
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
                                    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
         && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
         && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
         && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
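/* A worked example (illustrative, not from the original source): in SImode
   with shft_amnt = 4 and mask = 0x3f0 we have (0x3f0 >> 4) + 1 == 0x40,
   a power of two, and the low four bits of the mask are clear, so
   (x << 4) & 0x3f0 can become UBFIZ w0, w1, #4, #6 (insert a 6-bit field
   at bit position 4).  */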
/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */
static bool
aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
                   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
{
  rtx op0, op1, op2;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int code = GET_CODE (x);
  scalar_int_mode int_mode;

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);

  switch (code)
    {
    case SET:
      /* The cost depends entirely on the operands to SET.  */
      *cost = 0;
      op0 = SET_DEST (x);
      op1 = SET_SRC (x);

      switch (GET_CODE (op0))
        {
        case MEM:
          if (speed)
            {
              rtx address = XEXP (op0, 0);
              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->ldst.storev;
              else if (GET_MODE_CLASS (mode) == MODE_INT)
                *cost += extra_cost->ldst.store;
              else if (mode == SFmode)
                *cost += extra_cost->ldst.storef;
              else if (mode == DFmode)
                *cost += extra_cost->ldst.stored;

              *cost +=
                COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                     0, speed));
            }

          *cost += rtx_cost (op1, mode, SET, 1, speed);
          return true;

        case SUBREG:
          if (! REG_P (SUBREG_REG (op0)))
            *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);

          /* Fall through.  */
        case REG:
          /* The cost is one per vector-register copied.  */
          if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
            {
              int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
              *cost = COSTS_N_INSNS (nregs);
            }
          /* const0_rtx is in general free, but we will use an
             instruction to set a register to 0.  */
          else if (REG_P (op1) || op1 == const0_rtx)
            {
              /* The cost is 1 per register copied.  */
              int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
              *cost = COSTS_N_INSNS (nregs);
            }
          else
            /* Cost is just the cost of the RHS of the set.  */
            *cost += rtx_cost (op1, mode, SET, 1, speed);
          return true;

        case ZERO_EXTRACT:
        case SIGN_EXTRACT:
          /* Bit-field insertion.  Strip any redundant widening of
             the RHS to meet the width of the target.  */
          if (GET_CODE (op1) == SUBREG)
            op1 = SUBREG_REG (op1);
          if ((GET_CODE (op1) == ZERO_EXTEND
               || GET_CODE (op1) == SIGN_EXTEND)
              && CONST_INT_P (XEXP (op0, 1))
              && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
              && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
            op1 = XEXP (op1, 0);

          if (CONST_INT_P (op1))
            {
              /* MOV immediate is assumed to always be cheap.  */
              *cost = COSTS_N_INSNS (1);
            }
          else
            {
              /* BFM.  */
              if (speed)
                *cost += extra_cost->alu.bfi;
              *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
            }

          return true;

        default:
          /* We can't make sense of this, assume default cost.  */
          *cost = COSTS_N_INSNS (1);
          return false;
        }
      return false;

    case CONST_INT:
      /* If an instruction can incorporate a constant within the
         instruction, the instruction's expression avoids calling
         rtx_cost() on the constant.  If rtx_cost() is called on a
         constant, then it is usually because the constant must be
         moved into a register by one or more instructions.

         The exception is constant 0, which can be expressed
         as XZR/WZR and is therefore free.  The exception to this is
         if we have (set (reg) (const0_rtx)) in which case we must cost
         the move.  However, we can catch that when we cost the SET, so
         we don't need to consider that here.  */
      if (x == const0_rtx)
        *cost = 0;
      else
        {
          /* To an approximation, building any other constant is
             proportionally expensive to the number of instructions
             required to build that constant.  This is true whether we
             are compiling for SPEED or otherwise.  */
          if (!is_a <scalar_int_mode> (mode, &int_mode))
            int_mode = word_mode;
          *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
                                 (NULL_RTX, x, false, int_mode));
        }
      return true;

    case CONST_DOUBLE:
      /* First determine number of instructions to do the move
         as an integer constant.  */
      if (!aarch64_float_const_representable_p (x)
          && !aarch64_can_const_movi_rtx_p (x, mode)
          && aarch64_float_const_rtx_p (x))
        {
          unsigned HOST_WIDE_INT ival;
          bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
          gcc_assert (succeed);

          scalar_int_mode imode = (mode == HFmode
                                   ? SImode
                                   : int_mode_for_mode (mode).require ());
          int ncost = aarch64_internal_mov_immediate
                (NULL_RTX, gen_int_mode (ival, imode), false, imode);
          *cost += COSTS_N_INSNS (ncost);
          return true;
        }

      if (speed)
        {
          /* mov[df,sf]_aarch64.  */
          if (aarch64_float_const_representable_p (x))
            /* FMOV (scalar immediate).  */
            *cost += extra_cost->fp[mode == DFmode].fpconst;
          else if (!aarch64_float_const_zero_rtx_p (x))
            {
              /* This will be a load from memory.  */
              if (mode == DFmode)
                *cost += extra_cost->ldst.loadd;
              else
                *cost += extra_cost->ldst.loadf;
            }
          /* Otherwise this is +0.0.  We get this using MOVI d0, #0
             or MOV v0.s[0], wzr - neither of which are modeled by the
             cost tables.  Just use the default cost.  */
        }
      return true;

    case MEM:
      if (speed)
        {
          /* For loads we want the base cost of a load, plus an
             approximation for the additional cost of the addressing
             mode.  */
          rtx address = XEXP (x, 0);
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->ldst.loadv;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            *cost += extra_cost->ldst.load;
          else if (mode == SFmode)
            *cost += extra_cost->ldst.loadf;
          else if (mode == DFmode)
            *cost += extra_cost->ldst.loadd;

          *cost +=
            COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                 0, speed));
        }
      return true;

    case NEG:
      op0 = XEXP (x, 0);

      if (VECTOR_MODE_P (mode))
        {
          if (speed)
            /* FNEG.  */
            *cost += extra_cost->vect.alu;
          return false;
        }

      if (GET_MODE_CLASS (mode) == MODE_INT)
        {
          if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
              || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
            {
              /* CSETM.  */
              *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
              return true;
            }

          /* Cost this as SUB wzr, X.  */
          op0 = CONST0_RTX (mode);
          op1 = XEXP (x, 0);
          goto cost_minus;
        }

      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
        {
          /* Support (neg(fma...)) as a single instruction only if
             sign of zeros is unimportant.  This matches the decision
             making in aarch64.md.  */
          if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
            {
              /* FNMADD.  */
              *cost = rtx_cost (op0, mode, NEG, 0, speed);
              return true;
            }
          if (GET_CODE (op0) == MULT)
            {
              /* FNMUL.  */
              *cost = rtx_cost (op0, mode, NEG, 0, speed);
              return true;
            }
          if (speed)
            /* FNEG.  */
            *cost += extra_cost->fp[mode == DFmode].neg;
          return false;
        }
      return false;

    case CLRSB:
    case CLZ:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.clz;
        }
      return false;

    case COMPARE:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (op1 == const0_rtx
          && GET_CODE (op0) == AND)
        {
          x = op0;
          mode = GET_MODE (op0);
          goto cost_logic;
        }

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
        {
          /* TODO: A write to the CC flags possibly costs extra, this
             needs encoding in the cost tables.  */

          mode = GET_MODE (op0);
          /* ANDS.  */
          if (GET_CODE (op0) == AND)
            {
              x = op0;
              goto cost_logic;
            }

          if (GET_CODE (op0) == PLUS)
            {
              /* ADDS (and CMN alias).  */
              x = op0;
              goto cost_plus;
            }

          if (GET_CODE (op0) == MINUS)
            {
              /* SUBS.  */
              x = op0;
              goto cost_minus;
            }

          if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
              && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
              && CONST_INT_P (XEXP (op0, 2)))
            {
              /* COMPARE of ZERO_EXTRACT form of TST-immediate.
                 Handle it here directly rather than going to cost_logic
                 since we know the immediate generated for the TST is valid
                 so we can avoid creating an intermediate rtx for it only
                 for costing purposes.  */
              if (speed)
                *cost += extra_cost->alu.logical;

              *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
                                 ZERO_EXTRACT, 0, speed);
              return true;
            }

          if (GET_CODE (op1) == NEG)
            {
              /* CMN.  */
              if (speed)
                *cost += extra_cost->alu.arith;

              *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
              *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
              return true;
            }

          /* CMP.

             Compare can freely swap the order of operands, and
             canonicalization puts the more complex operation first.
             But the integer MINUS logic expects the shift/extend
             operation in op1.  */
          if (! (REG_P (op0)
                 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
            {
              op0 = XEXP (x, 1);
              op1 = XEXP (x, 0);
            }
          goto cost_minus;
        }

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
        {
          /* FCMP.  */
          if (speed)
            *cost += extra_cost->fp[mode == DFmode].compare;

          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
            {
              *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
              /* FCMP supports constant 0.0 for no extra cost.  */
              return true;
            }
          return false;
        }

      if (VECTOR_MODE_P (mode))
        {
          /* Vector compare.  */
          if (speed)
            *cost += extra_cost->vect.alu;

          if (aarch64_float_const_zero_rtx_p (op1))
            /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
               cost.  */
            return true;

          return false;
        }
      return false;

    case MINUS:
      {
        op0 = XEXP (x, 0);
        op1 = XEXP (x, 1);

cost_minus:
        *cost += rtx_cost (op0, mode, MINUS, 0, speed);

        /* Detect valid immediates.  */
        if ((GET_MODE_CLASS (mode) == MODE_INT
             || (GET_MODE_CLASS (mode) == MODE_CC
                 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
            && CONST_INT_P (op1)
            && aarch64_uimm12_shift (INTVAL (op1)))
          {
            if (speed)
              /* SUB(S) (immediate).  */
              *cost += extra_cost->alu.arith;
            return true;
          }

        /* Look for SUB (extended register).  */
        if (is_a <scalar_int_mode> (mode, &int_mode)
            && aarch64_rtx_arith_op_extract_p (op1, int_mode))
          {
            if (speed)
              *cost += extra_cost->alu.extend_arith;

            op1 = aarch64_strip_extend (op1, true);
            *cost += rtx_cost (op1, VOIDmode,
                               (enum rtx_code) GET_CODE (op1), 0, speed);
            return true;
          }

        rtx new_op1 = aarch64_strip_extend (op1, false);

        /* Cost this as an FMA-alike operation.  */
        if ((GET_CODE (new_op1) == MULT
             || aarch64_shift_p (GET_CODE (new_op1)))
            && code != COMPARE)
          {
            *cost += aarch64_rtx_mult_cost (new_op1, MULT,
                                            (enum rtx_code) code,
                                            speed);
            return true;
          }

        *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);

        if (speed)
          {
            if (VECTOR_MODE_P (mode))
              /* Vector SUB.  */
              *cost += extra_cost->vect.alu;
            else if (GET_MODE_CLASS (mode) == MODE_INT)
              /* SUB(S).  */
              *cost += extra_cost->alu.arith;
            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
              /* FSUB.  */
              *cost += extra_cost->fp[mode == DFmode].addsub;
          }
        return true;
      }

    case PLUS:
      {
        rtx new_op0;

        op0 = XEXP (x, 0);
        op1 = XEXP (x, 1);

cost_plus:
        if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
            || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
          {
            /* CSINC.  */
            *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
            *cost += rtx_cost (op1, mode, PLUS, 1, speed);
            return true;
          }

        if (GET_MODE_CLASS (mode) == MODE_INT
            && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
                || aarch64_sve_addvl_addpl_immediate (op1, mode)))
          {
            *cost += rtx_cost (op0, mode, PLUS, 0, speed);

            if (speed)
              /* ADD (immediate).  */
              *cost += extra_cost->alu.arith;
            return true;
          }

        *cost += rtx_cost (op1, mode, PLUS, 1, speed);

        /* Look for ADD (extended register).  */
        if (is_a <scalar_int_mode> (mode, &int_mode)
            && aarch64_rtx_arith_op_extract_p (op0, int_mode))
          {
            if (speed)
              *cost += extra_cost->alu.extend_arith;

            op0 = aarch64_strip_extend (op0, true);
            *cost += rtx_cost (op0, VOIDmode,
                               (enum rtx_code) GET_CODE (op0), 0, speed);
            return true;
          }

        /* Strip any extend, leave shifts behind as we will
           cost them through mult_cost.  */
        new_op0 = aarch64_strip_extend (op0, false);

        if (GET_CODE (new_op0) == MULT
            || aarch64_shift_p (GET_CODE (new_op0)))
          {
            *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
                                            speed);
            return true;
          }

        *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);

        if (speed)
          {
            if (VECTOR_MODE_P (mode))
              /* Vector ADD.  */
              *cost += extra_cost->vect.alu;
            else if (GET_MODE_CLASS (mode) == MODE_INT)
              /* ADD.  */
              *cost += extra_cost->alu.arith;
            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
              /* FADD.  */
              *cost += extra_cost->fp[mode == DFmode].addsub;
          }
        return true;
      }

    case BSWAP:
      *cost = COSTS_N_INSNS (1);

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.rev;
        }
      return false;

    case IOR:
      if (aarch_rev16_p (x))
        {
          *cost = COSTS_N_INSNS (1);

          if (speed)
            {
              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->vect.alu;
              else
                *cost += extra_cost->alu.rev;
            }
          return true;
        }

      if (aarch64_extr_rtx_p (x, &op0, &op1))
        {
          *cost += rtx_cost (op0, mode, IOR, 0, speed);
          *cost += rtx_cost (op1, mode, IOR, 1, speed);
          if (speed)
            *cost += extra_cost->alu.shift;
          return true;
        }
      /* Fall through.  */
    case XOR:
    case AND:
    cost_logic:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (VECTOR_MODE_P (mode))
        {
          if (speed)
            *cost += extra_cost->vect.alu;
          return true;
        }

      if (code == AND
          && GET_CODE (op0) == MULT
          && CONST_INT_P (XEXP (op0, 1))
          && CONST_INT_P (op1)
          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
                               INTVAL (op1)) != 0)
        {
          /* This is a UBFM/SBFM.  */
          *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
          if (speed)
            *cost += extra_cost->alu.bfx;
          return true;
        }

      if (is_int_mode (mode, &int_mode))
        {
          if (CONST_INT_P (op1))
            {
              /* We have a mask + shift version of a UBFIZ
                 i.e. the *andim_ashift<mode>_bfiz pattern.  */
              if (GET_CODE (op0) == ASHIFT
                  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
                                                         XEXP (op0, 1)))
                {
                  *cost += rtx_cost (XEXP (op0, 0), int_mode,
                                     (enum rtx_code) code, 0, speed);
                  if (speed)
                    *cost += extra_cost->alu.bfx;
                  return true;
                }
              else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
                {
                  /* We possibly get the immediate for free, this is not
                     modelled.  */
                  *cost += rtx_cost (op0, int_mode,
                                     (enum rtx_code) code, 0, speed);
                  if (speed)
                    *cost += extra_cost->alu.logical;
                  return true;
                }
            }
          else
            {
              rtx new_op0 = op0;

              /* Handle ORN, EON, or BIC.  */
              if (GET_CODE (op0) == NOT)
                op0 = XEXP (op0, 0);

              new_op0 = aarch64_strip_shift (op0);

              /* If we had a shift on op0 then this is a logical-shift-
                 by-register/immediate operation.  Otherwise, this is just
                 a logical operation.  */
              if (speed)
                {
                  if (new_op0 != op0)
                    {
                      /* Shift by immediate.  */
                      if (CONST_INT_P (XEXP (op0, 1)))
                        *cost += extra_cost->alu.log_shift;
                      else
                        *cost += extra_cost->alu.log_shift_reg;
                    }
                  else
                    *cost += extra_cost->alu.logical;
                }

              /* In both cases we want to cost both operands.  */
              *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
                                 0, speed);
              *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
                                 1, speed);
              return true;
            }
        }
      return false;

    case NOT:
      x = XEXP (x, 0);
      op0 = aarch64_strip_shift (x);

      if (VECTOR_MODE_P (mode))
        {
          /* Vector NOT.  */
          *cost += extra_cost->vect.alu;
          return false;
        }

      /* MVN-shifted-reg.  */
      if (op0 != x)
        {
          *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

          if (speed)
            *cost += extra_cost->alu.log_shift;
          return true;
        }
      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
         Handle the second form here taking care that 'a' in the above can
         be a shift.  */
      else if (GET_CODE (op0) == XOR)
        {
          rtx newop0 = XEXP (op0, 0);
          rtx newop1 = XEXP (op0, 1);
          rtx op0_stripped = aarch64_strip_shift (newop0);

          *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
          *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);

          if (speed)
            {
              if (op0_stripped != newop0)
                *cost += extra_cost->alu.log_shift;
              else
                *cost += extra_cost->alu.logical;
            }
          return true;
        }

      /* MVN.  */
      if (speed)
        *cost += extra_cost->alu.logical;
      return false;

    case ZERO_EXTEND:
      op0 = XEXP (x, 0);
      /* If a value is written in SI mode, then zero extended to DI
         mode, the operation will in general be free as a write to
         a 'w' register implicitly zeroes the upper bits of an 'x'
         register.  However, if this is

           (set (reg) (zero_extend (reg)))

         we must cost the explicit register move.  */
      if (mode == DImode
          && GET_MODE (op0) == SImode
          && outer == SET)
        {
          int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);

          /* If OP_COST is non-zero, then the cost of the zero extend
             is effectively the cost of the inner operation.  Otherwise
             we have a MOV instruction and we take the cost from the MOV
             itself.  This is true independently of whether we are
             optimizing for space or time.  */
          if (op_cost)
            *cost = op_cost;

          return true;
        }
      else if (MEM_P (op0))
        {
          /* All loads can zero extend to any size for free.  */
          *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
          return true;
        }

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
        {
          *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
          if (speed)
            *cost += extra_cost->alu.bfx;
          return true;
        }

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            /* UMOV.  */
            *cost += extra_cost->vect.alu;
          else
            /* We generate an AND instead of UXTB/UXTH.  */
            *cost += extra_cost->alu.logical;
        }
      return false;

    case SIGN_EXTEND:
      if (MEM_P (XEXP (x, 0)))
        {
          /* LDRSH.  */
          if (speed)
            {
              rtx address = XEXP (XEXP (x, 0), 0);
              *cost += extra_cost->ldst.load_sign_extend;

              *cost +=
                COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                     0, speed));
            }
          return true;
        }

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
        {
          *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
          if (speed)
            *cost += extra_cost->alu.bfx;
          return true;
        }

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.extend;
        }
      return false;

    case ASHIFT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
        {
          if (speed)
            {
              if (VECTOR_MODE_P (mode))
                /* Vector shift (immediate).  */
                *cost += extra_cost->vect.alu;
              else
                /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
                   aliases.  */
                *cost += extra_cost->alu.shift;
            }

          /* We can incorporate zero/sign extend for free.  */
          if (GET_CODE (op0) == ZERO_EXTEND
              || GET_CODE (op0) == SIGN_EXTEND)
            op0 = XEXP (op0, 0);

          *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
          return true;
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            {
              if (speed)
                /* Vector shift (register).  */
                *cost += extra_cost->vect.alu;
            }
          else
            {
              if (speed)
                /* LSLV.  */
                *cost += extra_cost->alu.shift_reg;

              if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
                  && CONST_INT_P (XEXP (op1, 1))
                  && known_eq (INTVAL (XEXP (op1, 1)),
                               GET_MODE_BITSIZE (mode) - 1))
                {
                  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
                  /* We already demanded XEXP (op1, 0) to be REG_P, so
                     don't recurse into it.  */
                  return true;
                }
            }
          return false;  /* All arguments need to be in registers.  */
        }

    case ROTATE:
    case ROTATERT:
    case LSHIFTRT:
    case ASHIFTRT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
        {
          /* ASR (immediate) and friends.  */
          if (speed)
            {
              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->vect.alu;
              else
                *cost += extra_cost->alu.shift;
            }

          *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
          return true;
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            {
              if (speed)
                /* Vector shift (register).  */
                *cost += extra_cost->vect.alu;
            }
          else
            {
              if (speed)
                /* ASR (register) and friends.  */
                *cost += extra_cost->alu.shift_reg;

              if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
                  && CONST_INT_P (XEXP (op1, 1))
                  && known_eq (INTVAL (XEXP (op1, 1)),
                               GET_MODE_BITSIZE (mode) - 1))
                {
                  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
                  /* We already demanded XEXP (op1, 0) to be REG_P, so
                     don't recurse into it.  */
                  return true;
                }
            }
          return false;  /* All arguments need to be in registers.  */
        }

    case SYMBOL_REF:
      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
          || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
        {
          /* LDR.  */
          if (speed)
            *cost += extra_cost->ldst.load;
        }
      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
               || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
        {
          /* ADRP, followed by ADD.  */
          *cost += COSTS_N_INSNS (1);
          if (speed)
            *cost += 2 * extra_cost->alu.arith;
        }
      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
               || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
        {
          /* ADR.  */
          if (speed)
            *cost += extra_cost->alu.arith;
        }

      if (flag_pic)
        {
          /* One extra load instruction, after accessing the GOT.  */
          *cost += COSTS_N_INSNS (1);
          if (speed)
            *cost += extra_cost->ldst.load;
        }
      return true;

    case HIGH:
    case LO_SUM:
      /* ADRP/ADD (immediate).  */
      if (speed)
        *cost += extra_cost->alu.arith;
      return true;

    case ZERO_EXTRACT:
    case SIGN_EXTRACT:
      /* UBFX/SBFX.  */
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.bfx;
        }

      /* We can trust that the immediates used will be correct (there
         are no by-register forms), so we need only cost op0.  */
      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
      return true;

    case MULT:
      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its
         operands.  */
      return true;

    case MOD:
      /* We can expand signed mod by power of 2 using a NEGS, two parallel
         ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
         an unconditional negate.  This case should only ever be reached through
         the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
          && exact_log2 (INTVAL (XEXP (x, 1))) > 0
          && (mode == SImode || mode == DImode))
        {
          /* We expand to 4 instructions.  Reset the baseline.  */
          *cost = COSTS_N_INSNS (4);

          if (speed)
            *cost += 2 * extra_cost->alu.logical
                     + 2 * extra_cost->alu.arith;

          return true;
        }
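      /* A worked illustration (a sketch, not from the original source):
         x % 8 in SImode expands to roughly

             negs  w1, w0
             and   w2, w0, #7
             and   w1, w1, #7
             csneg w0, w2, w1, mi

         i.e. four instructions, matching the COSTS_N_INSNS (4) baseline
         set above.  */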
      /* Fall through.  */
    case UMOD:
      if (speed)
        {
          /* Slightly prefer UMOD over SMOD.  */
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            *cost += (extra_cost->mult[mode == DImode].add
                      + extra_cost->mult[mode == DImode].idiv
                      + (code == MOD ? 1 : 0));
        }
      return false;  /* All arguments need to be in registers.  */

    case DIV:
    case UDIV:
    case SQRT:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            /* There is no integer SQRT, so only DIV and UDIV can get
               here.  */
            *cost += (extra_cost->mult[mode == DImode].idiv
                      /* Slightly prefer UDIV over SDIV.  */
                      + (code == DIV ? 1 : 0));
          else
            *cost += extra_cost->fp[mode == DFmode].div;
        }
      return false;  /* All arguments need to be in registers.  */

    case IF_THEN_ELSE:
      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
                                         XEXP (x, 2), cost, speed);

    case EQ:
    case NE:
    case GT:
    case GTU:
    case LT:
    case LTU:
    case GE:
    case GEU:
    case LE:
    case LEU:
      return false; /* All arguments must be in registers.  */

    case FMA:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);
      op2 = XEXP (x, 2);

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[mode == DFmode].fma;
        }

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
        op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
        op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
         and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
        op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
         either be (vec_duplicate (vec_select (x))) or just
         (vec_select (x)), depending on whether we are multiplying by
         a vector or a scalar.

         Canonicalization is not very good in these cases, FMA4 will put the
         by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
        op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
        op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
        op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
        op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
         get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);
      return true;

    case FLOAT:
    case UNSIGNED_FLOAT:
      if (speed)
        *cost += extra_cost->fp[mode == DFmode].fromint;
      return false;

    case FLOAT_EXTEND:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            /* Vector truncate.  */
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[mode == DFmode].widen;
        }
      return false;

    case FLOAT_TRUNCATE:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            /* Vector conversion.  */
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[mode == DFmode].narrow;
        }
      return false;

    case FIX:
    case UNSIGNED_FIX:
      x = XEXP (x, 0);
      /* Strip the rounding part.  They will all be implemented
         by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)
        {
          unsigned int uns_code = XINT (x, 1);

          if (uns_code == UNSPEC_FRINTA
              || uns_code == UNSPEC_FRINTM
              || uns_code == UNSPEC_FRINTN
              || uns_code == UNSPEC_FRINTP
              || uns_code == UNSPEC_FRINTZ)
            x = XVECEXP (x, 0, 0);
        }

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
        }

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
         fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
          && ((VECTOR_MODE_P (mode)
               && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
              || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
        {
          *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
                             0, speed);
          return true;
        }

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
      return true;

    case ABS:
      if (VECTOR_MODE_P (mode))
        {
          /* ABS (vector).  */
          if (speed)
            *cost += extra_cost->vect.alu;
        }
      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
        {
          op0 = XEXP (x, 0);

          /* FABD, which is analogous to FADD.  */
          if (GET_CODE (op0) == MINUS)
            {
              *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
              *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
              if (speed)
                *cost += extra_cost->fp[mode == DFmode].addsub;

              return true;
            }
          /* Simple FABS is analogous to FNEG.  */
          if (speed)
            *cost += extra_cost->fp[mode == DFmode].neg;
        }
      else
        {
          /* Integer ABS will either be split to
             two arithmetic instructions, or will be an ABS
             (scalar), which we don't model.  */
          *cost = COSTS_N_INSNS (2);
          if (speed)
            *cost += 2 * extra_cost->alu.arith;
        }
      return false;

    case SMAX:
    case SMIN:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            /* FMAXNM/FMINNM/FMAX/FMIN.
               TODO: This may not be accurate for all implementations, but
               we do not model this in the cost tables.  */
            *cost += extra_cost->fp[mode == DFmode].addsub;
        }
      return false;

    case UNSPEC:
      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))
        {
          if (speed)
            *cost += extra_cost->fp[mode == DFmode].roundint;
          return false;
        }

      if (XINT (x, 1) == UNSPEC_RBIT)
        {
          if (speed)
            *cost += extra_cost->alu.rev;
          return false;
        }
      break;

    case TRUNCATE:
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI  */
          mode == DImode
          /*   (lshiftrt:TI  */
          && GET_MODE (XEXP (x, 0)) == TImode
          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
          /*      (mult:TI  */
          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
          /*        (ANY_EXTEND:TI (reg:DI))
                    (ANY_EXTEND:TI (reg:DI)))  */
          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
          /*     (const_int 64)  */
          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
        {
          /* UMULH/SMULH.  */
          if (speed)
            *cost += extra_cost->mult[mode == DImode].extend;
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
                             mode, MULT, 0, speed);
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
                             mode, MULT, 1, speed);
          return true;
        }
      break;

    default:
      break;
    }

  if (dump_file
      && flag_aarch64_verbose_cost)
    fprintf (dump_file,
             "\nFailed to cost RTX.  Assuming default cost.\n");

  return true;
}
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
                           int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
               speed ? "Hot" : "Cold",
               *cost, result ? "final" : "partial");
    }

  return result;
}
static int
aarch64_register_move_cost (machine_mode mode,
                            reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
            + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
        return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
        return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
        return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
         a 128-bit value directly between Q registers.  This is handled in
         secondary reload.  A general register is used as a scratch to move
         the upper DI value and the lower DI value is moved directly,
         hence the cost is the sum of three moves.  */
      if (! TARGET_SIMD)
        return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
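/* To illustrate (not from the original source): when AdvSIMD is disabled,
   a 128-bit FP-to-FP move is costed as GP2FP + FP2GP + FP2FP, reflecting
   the bounce through a general-purpose scratch register that secondary
   reload introduces for the upper 64 bits.  */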
static int
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
                          reg_class_t rclass ATTRIBUTE_UNUSED,
                          bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
          && flag_unsafe_math_optimizations
          && ((aarch64_tune_params.approx_modes->recip_sqrt
               & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));
}
/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}
typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine
   mode.  */

static rsqrte_type
get_rsqrte_type (machine_mode mode)
{
  switch (mode)
  {
    case E_DFmode:   return gen_aarch64_rsqrtedf;
    case E_SFmode:   return gen_aarch64_rsqrtesf;
    case E_V2DFmode: return gen_aarch64_rsqrtev2df;
    case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();
  }
}

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */

static rsqrts_type
get_rsqrts_type (machine_mode mode)
{
  switch (mode)
  {
    case E_DFmode:   return gen_aarch64_rsqrtsdf;
    case E_SFmode:   return gen_aarch64_rsqrtssf;
    case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
    case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();
  }
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
            || (aarch64_tune_params.approx_modes->sqrt
                & AARCH64_APPROX_MODE (mode))))
        return false;

      if (flag_finite_math_only
          || flag_trapping_math
          || !flag_unsafe_math_optimizations
          || optimize_function_for_size_p (cfun))
        return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  machine_mode mmsk = mode_for_int_vector (mode).require ();
  rtx xmsk = gen_reg_rtx (mmsk);
  if (!recp)
    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
                            gen_rtx_NEG (mmsk,
                                         gen_rtx_EQ (mmsk, src,
                                                     CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

      if (iterations > 0)
        emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
    }

  if (!recp)
    {
      /* Qualify the approximate reciprocal square root when the argument is
         0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
                                        gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
    }

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));

  return true;
}
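/* For reference (illustrative): FRSQRTE gives an initial estimate e of
   1/sqrt(d), and each FRSQRTS step refines it via the Newton-Raphson
   recurrence e' = e * (3 - d * e * e) / 2, roughly doubling the number of
   accurate bits per iteration, hence the two steps for SF and three for DF
   above.  */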
typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */

static recpe_type
get_recpe_type (machine_mode mode)
{
  switch (mode)
  {
    case E_SFmode:   return (gen_aarch64_frecpesf);
    case E_V2SFmode: return (gen_aarch64_frecpev2sf);
    case E_V4SFmode: return (gen_aarch64_frecpev4sf);
    case E_DFmode:   return (gen_aarch64_frecpedf);
    case E_V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();
  }
}

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */

static recps_type
get_recps_type (machine_mode mode)
{
  switch (mode)
  {
    case E_SFmode:   return (gen_aarch64_frecpssf);
    case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
    case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
    case E_DFmode:   return (gen_aarch64_frecpsdf);
    case E_V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();
  }
}
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
                                || (aarch64_tune_params.approx_modes->division
                                    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      if (iterations > 0)
        emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
         calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
    }

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
  return true;
}
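/* For reference (illustrative): FRECPE gives an initial estimate e of
   1/den, and each FRECPS step computes 2 - den * e, so e' = e * (2 - den * e)
   is one Newton-Raphson refinement of the reciprocal; the quotient is then
   formed as num * (1/den).  */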
/* Return the number of instructions that can be issued per cycle.  */

static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
                                                   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                    tree vectype,
                                    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  switch (type_of_cost)
    {
      case scalar_stmt:
        return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

      case scalar_load:
        return costs->scalar_load_cost;

      case scalar_store:
        return costs->scalar_store_cost;

      case vector_stmt:
        return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vector_load:
        return costs->vec_align_load_cost;

      case vector_store:
        return costs->vec_store_cost;

      case vec_to_scalar:
        return costs->vec_to_scalar_cost;

      case scalar_to_vec:
        return costs->scalar_to_vec_cost;

      case unaligned_load:
      case vector_gather_load:
        return costs->vec_unalign_load_cost;

      case unaligned_store:
      case vector_scatter_store:
        return costs->vec_unalign_store_cost;

      case cond_branch_taken:
        return costs->cond_taken_branch_cost;

      case cond_branch_not_taken:
        return costs->cond_not_taken_branch_cost;

      case vec_perm:
        return costs->vec_permute_cost;

      case vec_promote_demote:
        return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vec_construct:
        elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
        return elements / 2 + 1;

      default:
        gcc_unreachable ();
    }
}
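/* For example (illustrative): constructing a V4SI vector element by element
   is costed as 4 / 2 + 1 = 3 units, approximating the chain of insert
   operations needed to build the vector.  */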
/* Implement targetm.vectorize.add_stmt_cost.  */
static unsigned
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
                       struct _stmt_vec_info *stmt_info, int misalign,
                       enum vect_cost_model_location where)
{
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
    {
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
      int stmt_cost =
            aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
         vectorized are weighted more heavily.  The value here is
         arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
        count *= 50; /*  FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
    }

  return retval;
}
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse the TO_PARSE string and put the architecture struct that it
   selects into RES and the architectural features into ISA_FLAGS.
   Return an aarch64_parse_opt_result describing the parse result.
   If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
                    unsigned long *isa_flags)
{
  char *ext;
  const struct processor *arch;
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)
    {
      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
        {
          unsigned long isa_temp = arch->flags;

          if (ext != NULL)
            {
              /* TO_PARSE string contains at least one extension.  */
              enum aarch64_parse_opt_result ext_res
                = aarch64_parse_extension (ext, &isa_temp);

              if (ext_res != AARCH64_PARSE_OK)
                return ext_res;
            }
          /* Extension parsing was successful.  Confirm the result
             arch and ISA flags.  */
          *res = arch;
          *isa_flags = isa_temp;
          return AARCH64_PARSE_OK;
        }
    }

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse the TO_PARSE string and put the result tuning in RES and the
   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
                   unsigned long *isa_flags)
{
  char *ext;
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
        {
          unsigned long isa_temp = cpu->flags;

          if (ext != NULL)
            {
              /* TO_PARSE string contains at least one extension.  */
              enum aarch64_parse_opt_result ext_res
                = aarch64_parse_extension (ext, &isa_temp);

              if (ext_res != AARCH64_PARSE_OK)
                return ext_res;
            }
          /* Extension parsing was successful.  Confirm the result
             cpu and ISA flags.  */
          *res = cpu;
          *isa_flags = isa_temp;
          return AARCH64_PARSE_OK;
        }
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails, RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, str) == 0)
        {
          *res = cpu;
          return AARCH64_PARSE_OK;
        }
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse TOKEN, which has length LENGTH, to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
                                size_t length,
                                const struct aarch64_flag_desc *flag,
                                const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
          && !strncmp (flag->name, token, length))
        return flag->flag;
    }

  error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
  return 0;
}
/* Parse OPTION, which is a dot-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
                               const struct aarch64_flag_desc *flags,
                               unsigned int initial_state,
                               const char *option_name)
{
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
    {
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
                                                           token_length,
                                                           flags,
                                                           option_name);
      /* If we find "none" (or, for simplicity's sake, an error) anywhere
         in the token stream, reset the supported operations.  So:

           adrp+add.cmp+branch.none.adrp+add

         would have the result of turning on only adrp+add fusion.  */
      if (!token_ops)
        found_flags = 0;

      found_flags |= token_ops;
      specs = ++ntoken;
    }

  /* We ended with a trailing separator, print something.  */
  if (!(*specs))
    {
      error ("%s string ill-formed\n", option_name);
      return 0;
    }

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
                                                       token_length,
                                                       flags,
                                                       option_name);
  if (!token_ops)
    found_flags = 0;

  found_flags |= token_ops;
  return found_flags;
}
/* Support for overriding instruction fusion.  */

static void
aarch64_parse_fuse_string (const char *fuse_string,
                           struct tune_params *tune)
{
  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
                                                     aarch64_fusible_pairs,
                                                     tune->fusible_ops,
                                                     "fuse=");
}

/* Support for overriding other tuning flags.  */

static void
aarch64_parse_tune_string (const char *tune_string,
                           struct tune_params *tune)
{
  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
                                     aarch64_tuning_flags,
                                     tune->extra_tuning_flags,
                                     "tune=");
}
/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
   we understand.  If it is, extract the option string and hand off to
   the appropriate function.  */

void
aarch64_parse_one_override_token (const char* token,
                                  size_t length,
                                  struct tune_params *tune)
{
  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');
  if (!option_part)
    {
      error ("tuning string missing in option (%s)", token);
      return;
    }

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */
  option_part++;

  for (; fn->name != NULL; fn++)
    {
      if (!strncmp (fn->name, token, length))
        {
          fn->parse_override (option_part, tune);
          return;
        }
    }

  error ("unknown tuning option (%s)", token);
  return;
}
/* A checking mechanism for the implementation of the tls size.  */

static void
initialize_aarch64_tls_size (struct gcc_options *opts)
{
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny is 1M which
         needs two instructions to address, so we clamp the size to 24.  */
      if (aarch64_tls_size > 24)
        aarch64_tls_size = 24;
      break;
    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
        aarch64_tls_size = 32;
      break;
    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
         FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
        aarch64_tls_size = 48;
      break;
    default:
      gcc_unreachable ();
    }

  return;
}
/* Parse STRING looking for options in the format:
     string    :: option:string
     option    :: name=substring
     substring :: defined by option.  */

static void
aarch64_parse_override_string (const char* input_string,
                               struct tune_params* tune)
{
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
    {
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      *ntoken = '\0';
      aarch64_parse_one_override_token (string, token_length, tune);
      string = ++ntoken;
    }

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
}
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
     Disable the frame pointer flag so the mid-end will not use a frame
     pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
     Set x_flag_omit_frame_pointer to the special value 2 to differentiate
     between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
  aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_flag_align_loops && !opts->x_str_align_loops)
        opts->x_str_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
        opts->x_str_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_flag_align_functions && !opts->x_str_align_functions)
        opts->x_str_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' up the internal tuning structs and update the options
   in OPTS.  The caller must have set up selected_tune and selected_arch
   as all the other target-specific codegen decisions are
   derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
                                   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
      case tune_params::AUTOPREFETCHER_OFF:
        queue_depth = -1;
        break;
      case tune_params::AUTOPREFETCHER_WEAK:
        queue_depth = 0;
        break;
      case tune_params::AUTOPREFETCHER_STRONG:
        queue_depth = max_insn_queue_index + 1;
        break;
      default:
        gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
                         queue_depth,
                         opts->x_param_values,
                         global_options_set.x_param_values);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
                           aarch64_tune_params.prefetch->num_slots,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
                           aarch64_tune_params.prefetch->l1_cache_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
                           aarch64_tune_params.prefetch->l1_cache_line_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
                           aarch64_tune_params.prefetch->l2_cache_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
    maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
                           0,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
    maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
                           aarch64_tune_params.prefetch->minimum_stride,
                           opts->x_param_values,
                           global_options_set.x_param_values);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
                         opts->x_param_values,
                         global_options_set.x_param_values);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  aarch64_override_options_after_change_1 (opts);
}
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

#ifdef HAVE_LOCAL_CPU_DETECT
  /* Add also "native" as possible value.  */
  candidates.safe_push ("native");
#endif

  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
            " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s", s);

  XDELETEVEC (s);
}

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mcpu=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mcpu", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-mcpu=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
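
/* Illustrative note (added commentary, not from the original source):
   a string such as "cortex-a57+crypto" names a core plus a feature
   modifier and parses as AARCH64_PARSE_OK, whereas "cortex-a57+bogus"
   would yield AARCH64_PARSE_INVALID_FEATURE, and a bare modifier string
   with no cpu name would yield AARCH64_PARSE_MISSING_ARG, producing the
   corresponding diagnostics above.  */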
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in %<-march=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -march", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-march=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mtune=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mtune", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }
  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option
     in config.gcc.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
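
/* Worked example (added commentary, not from the original source):
   -msve-vector-bits=256 gives VALUE == 256, so the function returns
   256 / 64 = 4, i.e. four 64-bit granules (VG) per SVE vector;
   -msve-vector-bits=512 likewise gives VG = 8.  */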
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  unsigned long cpu_isa = 0;
  unsigned long arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well as we will need it when outputting
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support -mabi=ilp32");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for -mabi=lp64");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}

static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
/* A checking mechanism for the implementation of the various code models.  */

static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with -f%s", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
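
/* Illustrative mapping (added commentary, not from the original source),
   assuming the assembler supports the small PIC relocations:

     -mcmodel=small -fpic  ->  AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small -fPIC  ->  AARCH64_CMODEL_SMALL_PIC  (flag_pic == 2)
     -mcmodel=tiny  -fpic  ->  AARCH64_CMODEL_TINY_PIC
     -mcmodel=large -fpic  ->  rejected with sorry ().  */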
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
}

/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);
}

/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  unsigned long isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}

/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}

/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch;
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}

/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu;
      explicit_tune_core = selected_tune->ident;

      selected_arch = &all_architectures[tmp_cpu->arch];
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune;
      explicit_tune_core = selected_tune->ident;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_INVALID_ARG:
	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}

/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   modified.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  unsigned long isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  parse_res = aarch64_parse_extension (str, &isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing value in %<target()%> pragma or attribute");
	break;

      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
	break;

      default:
	gcc_unreachable ();
    }

  return false;
}
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
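
/* Example usages of the table above (added commentary, not from the
   original source):

     __attribute__ ((target ("arch=armv8.1-a+crc")))          // custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  // negated bool
     __attribute__ ((target ("cmodel=tiny")))                 // enum option
     __attribute__ ((target ("strict-align")))                // target_flags mask  */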
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* Skip leading whitespace.  */
  while (*str_to_check == ' ' || *str_to_check == '\t')
    str_to_check++;

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			     || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	  case aarch64_attr_custom:
	    gcc_assert (p_attr->handler);
	    if (!p_attr->handler (arg))
	      return false;
	    break;

	  /* Either set or unset a boolean option.  */
	  case aarch64_attr_bool:
	    {
	      struct cl_decoded_option decoded;

	      generate_option (p_attr->opt_num, NULL, !invert,
			       CL_TARGET, &decoded);
	      aarch64_handle_option (&global_options, &global_options_set,
				     &decoded, input_location);
	      break;
	    }
	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
	     should know what mask to apply given the option number.  */
	  case aarch64_attr_mask:
	    {
	      struct cl_decoded_option decoded;
	      /* We only need to specify the option number.
		 aarch64_handle_option will know which mask to apply.  */
	      decoded.opt_index = p_attr->opt_num;
	      decoded.value = !invert;
	      aarch64_handle_option (&global_options, &global_options_set,
				     &decoded, input_location);
	      break;
	    }
	  /* Use the option setting machinery to set an option to an enum.  */
	  case aarch64_attr_enum:
	    {
	      gcc_assert (arg);
	      bool valid;
	      int value;
	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					     &value, CL_TARGET);
	      if (valid)
		set_option (&global_options, NULL, p_attr->opt_num, value,
			    NULL, DK_UNSPECIFIED, input_location,
			    global_dc);
	      else
		error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	      break;
	    }
	  default:
	    gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
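
/* Walk-through (added commentary, not from the original source): for the
   attribute string "no-strict-align" the code above strips the "no-"
   prefix, sets INVERT, matches the "strict-align" entry (which allows
   negation) and clears the corresponding target_flags bit; for
   "arch=armv8-a" it splits at '=' and passes "armv8-a" to the custom
   handler aarch64_handle_attr_arch.  */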
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}

/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok (str_to_check, ",");

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok (NULL, ",");
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize = build_optimization_node (&global_options);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, existing_options);
    }
  else
    cl_target_option_restore (&global_options,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      /* Initialize SIMD builtins if we haven't already.
	 Set current_target_pragma to NULL for the duration so that
	 the builtin initialization code doesn't try to tag the functions
	 being built with the attributes specified by any current pragma, thus
	 going into an infinite recursion.  */
      if (TARGET_SIMD)
	{
	  tree saved_current_target_pragma = current_target_pragma;
	  current_target_pragma = NULL;
	  aarch64_init_simd_builtins ();
	  current_target_pragma = saved_current_target_pragma;
	}
      new_target = build_target_option_node (&global_options);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options);

  if (fndecl && ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
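
/* Truth table (added commentary, not from the original source), with
   DONT_CARE == 2 as used by the callers below: inlining is allowed when
   CALLEE == 2, or CALLER == 2, or CALLEE == CALLER, or CALLEE == DEF;
   e.g. caller=1/callee=0/def=1 is rejected, while caller=1/callee=2
   is accepted.  */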
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
					: target_option_default_node);

  struct cl_target_option *callee_opts
    = TREE_TARGET_OPTION (callee_tree ? callee_tree
					: target_option_default_node);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
      != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					 DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
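
/* Illustrative case (added commentary, not from the original source):
   a callee compiled with target("+simd+crypto") cannot be inlined into
   a caller whose ISA flags lack +crypto, because the callee's flags are
   not a subset of the caller's; even an always_inline callee is
   rejected in that situation, since the subset check comes first.  */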
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}

/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, -1048575, 1048575))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}

/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || GET_CODE (x) == CONST_VECTOR)
    return true;

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible to handle rematerialization of other
     constants via secondary reloads.  */
  if (vec_flags & VEC_ANY_SVE)
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (GET_CODE (x) == LABEL_REF)
    return true;

  return false;
}
rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}

/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type
   defined as:

     struct  __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes to identify whether the code is updating va_list internal
     offset fields through irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
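
/* Worked layout example (added commentary, not from the original source):
   for a variadic callee that consumed two named GP arguments,
   gr_save_area_size is (8 - 2) * 8 = 48 bytes and, with all 8 vector
   argument registers unused by named arguments, vr_save_area_size is
   8 * 16 = 128 bytes, so __gr_offs is initialized to -48 and __vr_offs
   to -128.  */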
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);
  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* No frontends can create types with variable-sized modes, so we
	 shouldn't be asked to pass or return them.  */
      unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();

      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode);

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      if (is_ha)
	{
	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - ag_size;
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      if (align > 8)
	dw_align = true;

      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top level node will store in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_build_pointer_plus_hwi (arg, 15);
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_build_pointer_plus_hwi (arg, size + 7);
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg  */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i <nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case E_SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case E_DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case E_TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case E_HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case E_V2SImode:
	case E_V4SImode:
	  {
	    tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	    field_t = build_vector_type_for_mode (innertype, ag_mode);
	    field_ptr_t = build_pointer_type (field_t);
	  }
	  break;
	default:
	  gcc_assert (0);
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
/* Implement TARGET_SETUP_INCOMING_VARARGS.  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);

  /* Found out how many registers we need to save.
     Honor tree-stdvar analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
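
/* Worked example (added commentary, not from the original source):
   with gr_saved == 6 and vr_saved == 8, the 48-byte GR block is stored
   just below virtual_incoming_args_rtx, the VR area starts at
   -ROUND_UP (48, 16) - 8 * 16 == -176, and saved_varargs_size is
   ROUND_UP (48, 16) + 128 == 176 bytes.  */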
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }
}
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */

static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		  - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
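
/* Examples (added commentary, not from the original source):
   "struct { float x, y, z; }" yields count 3 with *MODEP == SFmode;
   "float[4]" yields count 4; "struct { float f; double d; }" returns -1
   because the element modes differ; and a union of a float and a
   float[2] counts as 2, the maximum across its members.  */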
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}
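/* For example, a 64-bit float32x2_t and a 128-bit int32x4_t both qualify
   as short vectors, while a 256-bit generic GNU vector type does not:
   its size is neither 8 nor 16 bytes.  */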
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}
/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
/* Return appropriate SIMD container
   for MODE within a vector of WIDTH bits.  */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
  if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
    switch (mode)
      {
      case E_DFmode:
	return VNx2DFmode;
      case E_SFmode:
	return VNx4SFmode;
      case E_HFmode:
	return VNx8HFmode;
      case E_DImode:
	return VNx2DImode;
      case E_SImode:
	return VNx4SImode;
      case E_HImode:
	return VNx8HImode;
      case E_QImode:
	return VNx16QImode;
      default:
	return word_mode;
      }

  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
  if (TARGET_SIMD)
    {
      if (known_eq (width, 128))
	switch (mode)
	  {
	  case E_DFmode:
	    return V2DFmode;
	  case E_SFmode:
	    return V4SFmode;
	  case E_HFmode:
	    return V8HFmode;
	  case E_SImode:
	    return V4SImode;
	  case E_HImode:
	    return V8HImode;
	  case E_QImode:
	    return V16QImode;
	  case E_DImode:
	    return V2DImode;
	  default:
	    break;
	  }
      else
	switch (mode)
	  {
	  case E_SFmode:
	    return V2SFmode;
	  case E_HFmode:
	    return V4HFmode;
	  case E_SImode:
	    return V2SImode;
	  case E_HImode:
	    return V4HImode;
	  case E_QImode:
	    return V8QImode;
	  default:
	    break;
	  }
    }
  return word_mode;
}
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}
/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn *insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}
/* Return true if X is a valid immediate for the SVE ADD and SUB
   instructions.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}
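/* Worked example: a duplicated 0x101 (257) is rejected, since its low
   byte is nonzero and it exceeds 0xff, whereas 0x100 (256) is accepted
   through the shifted range [0, 0xff00] (all such values are multiples
   of 0x100, matching the 8-bit-immediate-LSL-8 encoding).  */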
/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && aarch64_bitmask_imm (INTVAL (elt),
				  GET_MODE_INNER (GET_MODE (x))));
}
/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */

bool
aarch64_sve_dup_immediate_p (rtx x)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (val & 0xff)
    return IN_RANGE (val, -0x80, 0x7f);
  return IN_RANGE (val, -0x8000, 0x7f00);
}
/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && (signed_p
	      ? IN_RANGE (INTVAL (elt), -16, 15)
	      : IN_RANGE (INTVAL (elt), 0, 127)));
}
/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
   instruction.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;
  REAL_VALUE_TYPE r;

  if (!const_vec_duplicate_p (x, &elt)
      || GET_CODE (elt) != CONST_DOUBLE)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (elt);

  if (negate_p)
    r = real_value_negate (&r);

  if (real_equal (&r, &dconst1))
    return true;
  if (real_equal (&r, &dconsthalf))
    return true;
  return false;
}
/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  /* GCC will never generate a multiply with an immediate of 2, so there is no
     point testing for it (even though it is a valid constant).  */
  return (const_vec_duplicate_p (x, &elt)
	  && GET_CODE (elt) == CONST_DOUBLE
	  && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
}
/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
   for the Advanced SIMD operation described by WHICH and INSN.  If INFO
   is nonnull, use it to describe valid immediates.  */
static bool
aarch64_advsimd_valid_immediate_hs (unsigned int val32,
				    simd_immediate_info *info,
				    enum simd_immediate_check which,
				    simd_immediate_info::insn_type insn)
{
  /* Try a 4-byte immediate with LSL.  */
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xff << shift)) == val32)
      {
	if (info)
	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
				       simd_immediate_info::LSL, shift);
	return true;
      }

  /* Try a 2-byte immediate with LSL.  */
  unsigned int imm16 = val32 & 0xffff;
  if (imm16 == (val32 >> 16))
    for (unsigned int shift = 0; shift < 16; shift += 8)
      if ((imm16 & (0xff << shift)) == imm16)
	{
	  if (info)
	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
					 simd_immediate_info::LSL, shift);
	  return true;
	}

  /* Try a 4-byte immediate with MSL, except for cases that MVN
     can handle.  */
  if (which == AARCH64_CHECK_MOV)
    for (unsigned int shift = 8; shift < 24; shift += 8)
      {
	unsigned int low = (1 << shift) - 1;
	if (((val32 & (0xff << shift)) | low) == val32)
	  {
	    if (info)
	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
					   simd_immediate_info::MSL, shift);
	    return true;
	  }
      }

  return false;
}
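/* Worked example: VAL32 == 0x0000abff matches the MSL case with
   SHIFT == 8, since (0xab << 8) | 0xff == 0xabff; the resulting
   encoding is MOVI with immediate 0xab and modifier MSL #8.  */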
/* Return true if replicating VAL64 is a valid immediate for the
   Advanced SIMD operation described by WHICH.  If INFO is nonnull,
   use it to describe valid immediates.  */
static bool
aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
				 simd_immediate_info *info,
				 enum simd_immediate_check which)
{
  unsigned int val32 = val64 & 0xffffffff;
  unsigned int val16 = val64 & 0xffff;
  unsigned int val8 = val64 & 0xff;

  if (val32 == (val64 >> 32))
    {
      if ((which & AARCH64_CHECK_ORR) != 0
	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
						 simd_immediate_info::MOV))
	return true;

      if ((which & AARCH64_CHECK_BIC) != 0
	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
						 simd_immediate_info::MVN))
	return true;

      /* Try using a replicated byte.  */
      if (which == AARCH64_CHECK_MOV
	  && val16 == (val32 >> 16)
	  && val8 == (val16 >> 8))
	{
	  if (info)
	    *info = simd_immediate_info (QImode, val8);
	  return true;
	}
    }

  /* Try using a bit-to-bytemask.  */
  if (which == AARCH64_CHECK_MOV)
    {
      unsigned int i;
      for (i = 0; i < 64; i += 8)
	{
	  unsigned char byte = (val64 >> i) & 0xff;
	  if (byte != 0 && byte != 0xff)
	    break;
	}
      if (i == 64)
	{
	  if (info)
	    *info = simd_immediate_info (DImode, val64);
	  return true;
	}
    }
  return false;
}
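/* Worked example: VAL64 == 0x00ff00ff00ff00ff contains only 0x00 and
   0xff bytes, so the bit-to-bytemask case above accepts it as a
   64-bit MOVI immediate even though no smaller replication matches.  */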
/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
   instruction.  If INFO is nonnull, use it to describe valid immediates.  */
static bool
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
			     simd_immediate_info *info)
{
  scalar_int_mode mode = DImode;
  unsigned int val32 = val64 & 0xffffffff;
  if (val32 == (val64 >> 32))
    {
      mode = SImode;
      unsigned int val16 = val32 & 0xffff;
      if (val16 == (val32 >> 16))
	{
	  mode = HImode;
	  unsigned int val8 = val16 & 0xff;
	  if (val8 == (val16 >> 8))
	    mode = QImode;
	}
    }
  HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
  if (IN_RANGE (val, -0x80, 0x7f))
    {
      /* DUP with no shift.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
    {
      /* DUP with LSL #8.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  if (aarch64_bitmask_imm (val64, mode))
    {
      /* DUPM.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  return false;
}
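/* Worked example: VAL64 == 0x8181818181818181 replicates the byte 0x81,
   which truncates to -127 in QImode; -127 lies in [-0x80, 0x7f], so it
   is encodable as an SVE DUP with no shift.  */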
/* Return true if OP is a valid SIMD immediate for the operation
   described by WHICH.  If INFO is nonnull, use it to describe valid
   immediates.  */
bool
aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
			      enum simd_immediate_check which)
{
  machine_mode mode = GET_MODE (op);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  scalar_mode elt_mode = GET_MODE_INNER (mode);
  rtx base, step;
  unsigned int n_elts;
  if (GET_CODE (op) == CONST_VECTOR
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);
  else if ((vec_flags & VEC_SVE_DATA)
	   && const_vec_series_p (op, &base, &step))
    {
      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
      if (!aarch64_sve_index_immediate_p (base)
	  || !aarch64_sve_index_immediate_p (step))
	return false;

      if (info)
	*info = simd_immediate_info (elt_mode, base, step);
      return true;
    }
  else if (GET_CODE (op) == CONST_VECTOR
	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
    /* N_ELTS set above.  */;
  else
    return false;

  /* Handle PFALSE and PTRUE.  */
  if (vec_flags & VEC_SVE_PRED)
    return (op == CONST0_RTX (mode)
	    || op == CONSTM1_RTX (mode));

  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
	  || aarch64_float_const_representable_p (elt))
	{
	  if (info)
	    *info = simd_immediate_info (elt_float_mode, elt);
	  return true;
	}
    }

  unsigned int elt_size = GET_MODE_SIZE (elt_mode);
  if (elt_size > 8)
    return false;

  scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();

  /* Expand the vector constant out into a byte vector, with the least
     significant byte of the register first.  */
  auto_vec<unsigned char, 16> bytes;
  bytes.reserve (n_elts * elt_size);
  for (unsigned int i = 0; i < n_elts; i++)
    {
      /* The vector is provided in gcc endian-neutral fashion.
	 For aarch64_be Advanced SIMD, it must be laid out in the vector
	 register in reverse order.  */
      bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
      rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);

      if (elt_mode != elt_int_mode)
	elt = gen_lowpart (elt_int_mode, elt);

      if (!CONST_INT_P (elt))
	return false;

      unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
      for (unsigned int byte = 0; byte < elt_size; byte++)
	{
	  bytes.quick_push (elt_val & 0xff);
	  elt_val >>= BITS_PER_UNIT;
	}
    }

  /* The immediate must repeat every eight bytes.  */
  unsigned int nbytes = bytes.length ();
  for (unsigned i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Get the repeating 8-byte value as an integer.  No endian correction
     is needed here because bytes is already in lsb-first order.  */
  unsigned HOST_WIDE_INT val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
	      << (i * BITS_PER_UNIT));

  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_valid_immediate (val64, info);
  else
    return aarch64_advsimd_valid_immediate (val64, info, which);
}
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */
rtx
aarch64_check_zero_based_sve_index_immediate (rtx x)
{
  rtx base, step;
  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))
    return step;
  return NULL_RTX;
}
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
bool
aarch64_mov_operand_p (rtx x, machine_mode mode)
{
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  if (CONST_INT_P (x))
    return true;

  if (VECTOR_MODE_P (GET_MODE (x)))
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
    return true;

  if (aarch64_sve_cnt_immediate_p (x))
    return true;

  return aarch64_classify_symbolic_expression (x)
    == SYMBOL_TINY_ABSOLUTE;
}
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}
/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target must be described using
   a mask selecting GCC high-lanes.

		 Big-Endian             Little-Endian

GCC             0   1   2   3           3   2   1   0
              | x | x | x | x |       | x | x | x | x |
Architecture    3   2   1   0           3   2   1   0

Low Mask:         { 2, 3 }                { 0, 1 }
High Mask:        { 0, 1 }                { 2, 3 }

   MODE is the mode of the vector and NUNITS is the number of units in it.  */

rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
{
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
/* Check OP for validity as a PARALLEL RTX vector with elements
   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
   from the perspective of the architecture.  See the diagram above
   aarch64_simd_vect_par_cnst_half for more details.  */

bool
aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
				       bool high)
{
  int nelts;
  if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
    return false;

  rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
  HOST_WIDE_INT count_op = XVECLEN (op, 0);
  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
  int i = 0;

  if (count_op != count_ideal)
    return false;

  for (i = 0; i < count_ideal; i++)
    {
      rtx elt_op = XVECEXP (op, 0, i);
      rtx elt_ideal = XVECEXP (ideal, 0, i);

      if (!CONST_INT_P (elt_op)
	  || INTVAL (elt_ideal) != INTVAL (elt_op))
	return false;
    }
  return true;
}
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
			  const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
    {
      if (exp)
	error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
      else
	error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
    }
}
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
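/* For example, on a big-endian target lane 0 of a V4SImode vector maps
   to architectural lane 3, so this returns GEN_INT (3); on little-endian
   the lane number is returned unchanged.  */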
/* Return TRUE if OP is a valid vector addressing mode.  */

bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}
/* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */

bool
aarch64_sve_ld1r_operand_p (rtx op)
{
  struct aarch64_address_info addr;
  scalar_mode mode;

  return (MEM_P (op)
	  && is_a <scalar_mode> (GET_MODE (op), &mode)
	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
	  && addr.type == ADDRESS_REG_IMM
	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}
/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
   The conditions for STR are the same.  */
bool
aarch64_sve_ldr_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
				       false, ADDR_QUERY_ANY)
	  && addr.type == ADDRESS_REG_IMM);
}
/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
   We need to be able to access the individual pieces, so the range
   is different from LD[234] and ST[234].  */
bool
aarch64_sve_struct_memory_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  machine_mode mode = GET_MODE (op);
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
				 ADDR_QUERY_ANY)
      || addr.type != ADDRESS_REG_IMM)
    return false;

  poly_int64 first = addr.const_offset;
  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
}
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be
   decomposed.  */
void
aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
				unsigned int count)
{
  unsigned int i;
  int rdest = REGNO (operands[0]);
  int rsrc = REGNO (operands[1]);

  if (!reg_overlap_mentioned_p (operands[0], operands[1])
      || rdest < rsrc)
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + i),
		      gen_rtx_REG (mode, rsrc + i));
  else
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
		      gen_rtx_REG (mode, rsrc + count - i - 1));
}
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
    /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
       be set for non-predicate vectors of booleans.  Modes are the most
       direct way we have of identifying real SVE predicate types.  */
    return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
  return MIN (align, 128);
}
/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
static HOST_WIDE_INT
aarch64_vectorize_preferred_vector_alignment (const_tree type)
{
  if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
    {
      /* If the length of the vector is fixed, try to align to that length,
	 otherwise don't try to align at all.  */
      HOST_WIDE_INT result;
      if (!BITS_PER_SVE_VECTOR.is_constant (&result))
	result = TYPE_ALIGN (TREE_TYPE (type));
      return result;
    }
  return TYPE_ALIGN (type);
}
/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
static bool
aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
{
  if (is_packed)
    return false;

  /* For fixed-length vectors, check that the vectorizer will aim for
     full-vector alignment.  This isn't true for generic GCC vectors
     that are wider than the ABI maximum of 128 bits.  */
  if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
      && (wi::to_widest (TYPE_SIZE (type))
	  != aarch64_vectorize_preferred_vector_alignment (type)))
    return false;

  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
  return true;
}
/* Return true if the vector misalignment factor is supported by the
   target.  */
static bool
aarch64_builtin_support_vector_misalignment (machine_mode mode,
					     const_tree type, int misalignment,
					     bool is_packed)
{
  if (TARGET_SIMD && STRICT_ALIGNMENT)
    {
      /* Return if movmisalign pattern is not supported for this mode.  */
      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
	return false;

      /* Misalignment factor is unknown at compile time.  */
      if (misalignment == -1)
	return false;
    }
  return default_builtin_support_vector_misalignment (mode, type, misalignment,
						      is_packed);
}
/* If VALS is a vector constant that can be loaded into a register
   using DUP, generate instructions to do so and return an RTX to
   assign to the register.  Otherwise return NULL_RTX.  */
static rtx
aarch64_simd_dup_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  rtx x;

  if (!const_vec_duplicate_p (vals, &x))
    return NULL_RTX;

  /* We can load this constant by using DUP and a constant in a
     single ARM register.  This will be cheaper than a vector
     load.  */
  x = copy_to_mode_reg (inner_mode, x);
  return gen_vec_duplicate (mode, x);
}
/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that can not be converted into a CONST_VECTOR.  */
static rtx
aarch64_simd_make_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  rtx const_dup;
  rtx const_vec = NULL_RTX;
  int n_const = 0;
  int i;

  if (GET_CODE (vals) == CONST_VECTOR)
    const_vec = vals;
  else if (GET_CODE (vals) == PARALLEL)
    {
      /* A CONST_VECTOR must contain only CONST_INTs and
	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
	 Only store valid constants in a CONST_VECTOR.  */
      int n_elts = XVECLEN (vals, 0);
      for (i = 0; i < n_elts; ++i)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	    n_const++;
	}
      if (n_const == n_elts)
	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
    }
  else
    gcc_unreachable ();

  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We can not take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We can not construct an initializer.  */
    return NULL_RTX;
}
/* Expand a vector initialisation sequence, such that TARGET is
   initialised to contain VALS.  */

void
aarch64_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode inner_mode = GET_MODE_INNER (mode);
  /* The number of vector elements.  */
  int n_elts = XVECLEN (vals, 0);
  /* The number of vector elements which are not constant.  */
  int n_var = 0;
  rtx any_const = NULL_RTX;
  /* The first element of vals.  */
  rtx v0 = XVECEXP (vals, 0, 0);
  bool all_same = true;

  /* Count the number of variable elements to initialise.  */
  for (int i = 0; i < n_elts; ++i)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
	++n_var;
      else
	any_const = x;

      all_same &= rtx_equal_p (x, v0);
    }

  /* No variable elements, hand off to aarch64_simd_make_constant which knows
     how best to handle this.  */
  if (n_var == 0)
    {
      rtx constant = aarch64_simd_make_constant (vals);
      if (constant != NULL_RTX)
	{
	  emit_move_insn (target, constant);
	  return;
	}
    }

  /* Splat a single non-constant element if we can.  */
  if (all_same)
    {
      rtx x = copy_to_mode_reg (inner_mode, v0);
      aarch64_emit_move (target, gen_vec_duplicate (mode, x));
      return;
    }

  enum insn_code icode = optab_handler (vec_set_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);

  /* If there are only variable elements, try to optimize
     the insertion using dup for the most common element
     followed by insertions.  */

  /* The algorithm will fill matches[*][0] with the earliest matching element,
     and matches[X][1] with the count of duplicate elements (if X is the
     earliest element which has duplicates).  */

  if (n_var == n_elts && n_elts <= 16)
    {
      int matches[16][2] = {0};
      for (int i = 0; i < n_elts; i++)
	{
	  for (int j = 0; j <= i; j++)
	    {
	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
		{
		  matches[i][0] = j;
		  matches[j][1]++;
		  break;
		}
	    }
	}
      int maxelement = 0;
      int maxv = 0;
      for (int i = 0; i < n_elts; i++)
	if (matches[i][1] > maxv)
	  {
	    maxelement = i;
	    maxv = matches[i][1];
	  }

      /* Create a duplicate of the most common element, unless all elements
	 are equally useless to us, in which case just immediately set the
	 vector register using the first element.  */

      if (maxv == 1)
	{
	  /* For vectors of two 64-bit elements, we can do even better.  */
	  if (n_elts == 2
	      && (inner_mode == E_DImode
		  || inner_mode == E_DFmode))

	    {
	      rtx x0 = XVECEXP (vals, 0, 0);
	      rtx x1 = XVECEXP (vals, 0, 1);
	      /* Combine can pick up this case, but handling it directly
		 here leaves clearer RTL.

		 This is load_pair_lanes<mode>, and also gives us a clean-up
		 for store_pair_lanes<mode>.  */
	      if (memory_operand (x0, inner_mode)
		  && memory_operand (x1, inner_mode)
		  && !STRICT_ALIGNMENT
		  && rtx_equal_p (XEXP (x1, 0),
				  plus_constant (Pmode,
						 XEXP (x0, 0),
						 GET_MODE_SIZE (inner_mode))))
		{
		  rtx t;
		  if (inner_mode == DFmode)
		    t = gen_load_pair_lanesdf (target, x0, x1);
		  else
		    t = gen_load_pair_lanesdi (target, x0, x1);
		  emit_insn (t);
		  return;
		}
	    }
	  /* The subreg-move sequence below will move into lane zero of the
	     vector register.  For big-endian we want that position to hold
	     the last element of VALS.  */
	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
	}
      else
	{
	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
	}

      /* Insert the rest.  */
      for (int i = 0; i < n_elts; i++)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (matches[i][0] == maxelement)
	    continue;
	  x = copy_to_mode_reg (inner_mode, x);
	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
	}
      return;
    }

  /* Initialise a vector which is part-variable.  We want to first try
     to build those lanes which are constant in the most efficient way we
     can.  */
  if (n_var != n_elts)
    {
      rtx copy = copy_rtx (vals);

      /* Load constant part of vector.  We really don't care what goes into the
	 parts we will overwrite, but we're more likely to be able to load the
	 constant efficiently if it has fewer, larger, repeating parts
	 (see aarch64_simd_valid_immediate).  */
      for (int i = 0; i < n_elts; i++)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	    continue;
	  rtx subst = any_const;
	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
	    {
	      /* Look in the copied vector, as more elements are const.  */
	      rtx test = XVECEXP (copy, 0, i ^ bit);
	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
		{
		  subst = test;
		  break;
		}
	    }
	  XVECEXP (copy, 0, i) = subst;
	}
      aarch64_expand_vector_init (target, copy);
    }

  /* Insert the variable lanes directly.  */
  for (int i = 0; i < n_elts; i++)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	continue;
      x = copy_to_mode_reg (inner_mode, x);
      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
    }
}
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
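/* For example, for scalar SImode this returns 31, so a shift count can
   be reduced modulo 32; for vector modes it returns 0 because SIMD
   shifts do not truncate their counts.  */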
/* Select a format to encode pointers in exception handling data.  */
int
aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
{
  int type;
  switch (aarch64_cmodel)
    {
    case AARCH64_CMODEL_TINY:
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
	 for everything.  */
      type = DW_EH_PE_sdata4;
      break;
    default:
      /* No assumptions here.  8-byte relocs required.  */
      type = DW_EH_PE_sdata8;
      break;
    }
  return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
/* The last .arch and .tune assembly strings that we printed.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;

/* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
   by the function fndecl.  */

void
aarch64_declare_function_name (FILE *stream, const char* name,
			       tree fndecl)
{
  tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  struct cl_target_option *targ_options;
  if (target_parts)
    targ_options = TREE_TARGET_OPTION (target_parts);
  else
    targ_options = TREE_TARGET_OPTION (target_option_current_node);
  gcc_assert (targ_options);

  const struct processor *this_arch
    = aarch64_get_arch (targ_options->x_explicit_arch);

  unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags,
						  this_arch->flags);
  /* Only update the assembler .arch string if it is distinct from the last
     such string we printed.  */
  std::string to_print = this_arch->name + extension;
  if (to_print != aarch64_last_printed_arch_string)
    {
      asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
      aarch64_last_printed_arch_string = to_print;
    }

  /* Print the cpu name we're tuning for in the comments, might be
     useful to readers of the generated asm.  Do it only when it changes
     from function to function and verbose assembly is requested.  */
  const struct processor *this_tune
    = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);

  if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
    {
      asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
		   this_tune->name);
      aarch64_last_printed_tune_string = this_tune->name;
    }

  /* Don't forget the type directive for ELF.  */
  ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
  ASM_OUTPUT_LABEL (stream, name);
}
/* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */

static void
aarch64_start_file (void)
{
  struct cl_target_option *default_options
    = TREE_TARGET_OPTION (target_option_default_node);

  const struct processor *default_arch
    = aarch64_get_arch (default_options->x_explicit_arch);
  unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
						  default_arch->flags);

  aarch64_last_printed_arch_string = default_arch->name + extension;
  aarch64_last_printed_tune_string = "";
  asm_fprintf (asm_out_file, "\t.arch %s\n",
	       aarch64_last_printed_arch_string.c_str ());

  default_file_start ();
}
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
    case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
    case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
    case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (rval, mem, model_rtx));
}
/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx rval, rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
    case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
    case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
    case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (bval, rval, mem, model_rtx));
}
/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
/* Expand a compare and swap pattern.  */

void
aarch64_expand_compare_and_swap (rtx operands[])
{
  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
  machine_mode mode, cmp_mode;
  typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
  int idx;
  gen_cas_fn gen;
  const gen_cas_fn split_cas[] =
  {
    gen_aarch64_compare_and_swapqi,
    gen_aarch64_compare_and_swaphi,
    gen_aarch64_compare_and_swapsi,
    gen_aarch64_compare_and_swapdi
  };
  const gen_cas_fn atomic_cas[] =
  {
    gen_aarch64_compare_and_swapqi_lse,
    gen_aarch64_compare_and_swaphi_lse,
    gen_aarch64_compare_and_swapsi_lse,
    gen_aarch64_compare_and_swapdi_lse
  };

  bval = operands[0];
  rval = operands[1];
  mem = operands[2];
  oldval = operands[3];
  newval = operands[4];
  is_weak = operands[5];
  mod_s = operands[6];
  mod_f = operands[7];
  mode = GET_MODE (mem);
  cmp_mode = mode;

  /* Normally the succ memory model must be stronger than fail, but in the
     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */

  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
    mod_s = GEN_INT (MEMMODEL_ACQ_REL);

  switch (mode)
    {
    case E_QImode:
    case E_HImode:
      /* For short modes, we're going to perform the comparison in SImode,
	 so do the zero-extension now.  */
      cmp_mode = SImode;
      rval = gen_reg_rtx (SImode);
      oldval = convert_modes (SImode, mode, oldval, true);
      /* Fall through.  */

    case E_SImode:
    case E_DImode:
      /* Force the value into a register if needed.  */
      if (!aarch64_plus_operand (oldval, mode))
	oldval = force_reg (cmp_mode, oldval);
      break;

    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    case E_QImode: idx = 0; break;
    case E_HImode: idx = 1; break;
    case E_SImode: idx = 2; break;
    case E_DImode: idx = 3; break;
    default:
      gcc_unreachable ();
    }
  if (TARGET_LSE)
    gen = atomic_cas[idx];
  else
    gen = split_cas[idx];

  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));

  if (mode == QImode || mode == HImode)
    emit_move_insn (operands[1], gen_lowpart (mode, rval));

  x = gen_rtx_REG (CCmode, CC_REGNUM);
  x = gen_rtx_EQ (SImode, x, const0_rtx);
  emit_insn (gen_rtx_SET (bval, x));
}
/* Test whether the target supports using an atomic load-operate instruction.
   CODE is the operation.  Returns FALSE if the operation isn't supported by
   the architecture.  */

bool
aarch64_atomic_ldop_supported_p (enum rtx_code code)
{
  if (!TARGET_LSE)
    return false;

  switch (code)
    {
    case SET:
    case AND:
    case IOR:
    case XOR:
    case MINUS:
    case PLUS:
      return true;
    default:
      return false;
    }
}
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}
/* Emit an atomic compare-and-swap operation.  RVAL is the destination register
   for the data in memory.  EXPECTED is the value expected to be in memory.
   DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
   is the memory ordering to use.  */

void
aarch64_gen_atomic_cas (rtx rval, rtx mem,
			rtx expected, rtx desired,
			rtx model)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);
  machine_mode mode;

  mode = GET_MODE (mem);

  switch (mode)
    {
    case E_QImode: gen = gen_aarch64_atomic_casqi; break;
    case E_HImode: gen = gen_aarch64_atomic_cashi; break;
    case E_SImode: gen = gen_aarch64_atomic_cassi; break;
    case E_DImode: gen = gen_aarch64_atomic_casdi; break;
    default:
      gcc_unreachable ();
    }

  /* Move the expected value into the CAS destination register.  */
  emit_insn (gen_rtx_SET (rval, expected));

  /* Emit the CAS.  */
  emit_insn (gen (rval, mem, desired, model));

  /* Compare the expected value with the value loaded by the CAS, to establish
     whether the swap was made.  */
  aarch64_gen_compare_reg (EQ, rval, expected);
}
/* Split a compare and swap pattern.  */

void
aarch64_split_compare_and_swap (rtx operands[])
{
  rtx rval, mem, oldval, newval, scratch;
  machine_mode mode;
  bool is_weak;
  rtx_code_label *label1, *label2;
  rtx x, cond;
  enum memmodel model;
  rtx model_rtx;

  rval = operands[0];
  mem = operands[1];
  oldval = operands[2];
  newval = operands[3];
  is_weak = (operands[4] != const0_rtx);
  model_rtx = operands[5];
  scratch = operands[7];
  mode = GET_MODE (mem);
  model = memmodel_from_int (INTVAL (model_rtx));

  /* When OLDVAL is zero and we want the strong version we can emit a tighter
    loop:
    .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = !is_weak && oldval == const0_rtx;

  label1 = NULL;
  if (!is_weak)
    {
      label1 = gen_label_rtx ();
      emit_label (label1);
    }
  label2 = gen_label_rtx ();

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_mm_sync (model))
    aarch64_emit_load_exclusive (mode, rval, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

  if (strong_zero_p)
    {
      x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    {
      cond = aarch64_gen_compare_reg (NE, rval, oldval);
      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }

  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

  if (!is_weak)
    {
      x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    {
      cond = gen_rtx_REG (CCmode, CC_REGNUM);
      x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
      emit_insn (gen_rtx_SET (cond, x));
    }

  emit_label (label2);
  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    {
      cond = gen_rtx_REG (CCmode, CC_REGNUM);
      x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
      emit_insn (gen_rtx_SET (cond, x));
    }

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_mm_sync (model))
    aarch64_emit_post_barrier (model);
}
/* Emit a BIC instruction.  */

static void
aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
{
  rtx shift_rtx = GEN_INT (shift);
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
    case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, s2, shift_rtx, s1));
}
/* Emit an atomic swap.  */

static void
aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
			  rtx mem, rtx model)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
    case E_HImode: gen = gen_aarch64_atomic_swphi; break;
    case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
    case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, mem, value, model));
}
/* Operations supported by aarch64_emit_atomic_load_op.  */

enum aarch64_atomic_load_op_code
{
  AARCH64_LDOP_PLUS,	/* A + B  */
  AARCH64_LDOP_XOR,	/* A ^ B  */
  AARCH64_LDOP_OR,	/* A | B  */
  AARCH64_LDOP_BIC	/* A & ~B  */
};
/* Emit an atomic load-operate.  */

static void
aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
			     machine_mode mode, rtx dst, rtx src,
			     rtx mem, rtx model)
{
  typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
  const aarch64_atomic_load_op_fn plus[] =
  {
    gen_aarch64_atomic_loadaddqi,
    gen_aarch64_atomic_loadaddhi,
    gen_aarch64_atomic_loadaddsi,
    gen_aarch64_atomic_loadadddi
  };
  const aarch64_atomic_load_op_fn eor[] =
  {
    gen_aarch64_atomic_loadeorqi,
    gen_aarch64_atomic_loadeorhi,
    gen_aarch64_atomic_loadeorsi,
    gen_aarch64_atomic_loadeordi
  };
  const aarch64_atomic_load_op_fn ior[] =
  {
    gen_aarch64_atomic_loadsetqi,
    gen_aarch64_atomic_loadsethi,
    gen_aarch64_atomic_loadsetsi,
    gen_aarch64_atomic_loadsetdi
  };
  const aarch64_atomic_load_op_fn bic[] =
  {
    gen_aarch64_atomic_loadclrqi,
    gen_aarch64_atomic_loadclrhi,
    gen_aarch64_atomic_loadclrsi,
    gen_aarch64_atomic_loadclrdi
  };
  aarch64_atomic_load_op_fn gen;
  int idx = 0;

  switch (mode)
    {
    case E_QImode: idx = 0; break;
    case E_HImode: idx = 1; break;
    case E_SImode: idx = 2; break;
    case E_DImode: idx = 3; break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
    case AARCH64_LDOP_XOR: gen = eor[idx]; break;
    case AARCH64_LDOP_OR: gen = ior[idx]; break;
    case AARCH64_LDOP_BIC: gen = bic[idx]; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, mem, src, model));
}
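/* With LSE these patterns correspond to the LDADD, LDEOR, LDSET and
   LDCLR instructions; for example, an atomic fetch-and is performed by
   complementing the operand first and then using LDCLR (A & ~B), as
   arranged by aarch64_gen_atomic_ldop below.  */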
/* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
   location to store the data read from memory.  OUT_RESULT is the location to
   store the result of the operation.  MEM is the memory location to read and
   modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
   operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
   be NULL.  */

void
aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
			 rtx mem, rtx value, rtx model_rtx)
{
  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const bool short_mode = (mode < SImode);
  aarch64_atomic_load_op_code ldop_code;
  rtx src;
  rtx x;

  if (out_data)
    out_data = gen_lowpart (mode, out_data);

  if (out_result)
    out_result = gen_lowpart (mode, out_result);

  /* Make sure the value is in a register, putting it into a destination
     register if it needs to be manipulated.  */
  if (!register_operand (value, mode)
      || code == AND || code == MINUS)
    {
      src = out_result ? out_result : out_data;
      emit_move_insn (src, gen_lowpart (mode, value));
    }
  else
    src = value;
  gcc_assert (register_operand (src, mode));

  /* Preprocess the data for the operation as necessary.  If the operation is
     a SET then emit a swap instruction and finish.  */
  switch (code)
    {
    case SET:
      aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
      return;

    case MINUS:
      /* Negate the value and treat it as a PLUS.  */
      {
	rtx neg_src;

	/* Resize the value if necessary.  */
	if (short_mode)
	  src = gen_lowpart (wmode, src);

	neg_src = gen_rtx_NEG (wmode, src);
	emit_insn (gen_rtx_SET (src, neg_src));

	if (short_mode)
	  src = gen_lowpart (mode, src);
      }
      /* Fall-through.  */
    case PLUS:
      ldop_code = AARCH64_LDOP_PLUS;
      break;

    case IOR:
      ldop_code = AARCH64_LDOP_OR;
      break;

    case XOR:
      ldop_code = AARCH64_LDOP_XOR;
      break;

    case AND:
      {
	rtx not_src;

	/* Resize the value if necessary.  */
	if (short_mode)
	  src = gen_lowpart (wmode, src);

	not_src = gen_rtx_NOT (wmode, src);
	emit_insn (gen_rtx_SET (src, not_src));

	if (short_mode)
	  src = gen_lowpart (mode, src);
      }
      ldop_code = AARCH64_LDOP_BIC;
      break;

    default:
      /* The operation can't be done with atomic instructions.  */
      gcc_unreachable ();
    }

  aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);

  /* If necessary, calculate the data in memory after the update by redoing the
     operation from values in registers.  */
  if (!out_result)
    return;

  if (short_mode)
    {
      src = gen_lowpart (wmode, src);
      out_data = gen_lowpart (wmode, out_data);
      out_result = gen_lowpart (wmode, out_result);
    }

  x = NULL_RTX;

  switch (code)
    {
    case MINUS:
    case PLUS:
      x = gen_rtx_PLUS (wmode, out_data, src);
      break;
    case IOR:
      x = gen_rtx_IOR (wmode, out_data, src);
      break;
    case XOR:
      x = gen_rtx_XOR (wmode, out_data, src);
      break;
    case AND:
      aarch64_emit_bic (wmode, out_result, out_data, src, 0);
      return;
    default:
      gcc_unreachable ();
    }

  emit_set_insn (out_result, x);
}
/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-INTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
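/* Schematically, for a relaxed __atomic_fetch_add on an int this split
   produces a loop along these lines (register numbers illustrative):

	.L1:	ldxr	w0, [x1]
		add	w2, w0, w3
		stxr	w4, w2, [x1]
		cbnz	w4, .L1  */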
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */

/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */
bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  if (!CONST_DOUBLE_P (x))
    return false;

  /* We don't support HFmode constants yet.  */
  if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  return (exponent >= 0 && exponent <= 7);
}
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
   output MOVI/MVNI, ORR or BIC immediate.  */

char*
aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
				   enum simd_immediate_check which)
{
  bool is_valid;
  static char templ[40];
  const char *mnemonic;
  const char *shift_op;
  unsigned int lane_count = 0;
  char element_char;

  struct simd_immediate_info info;

  /* This will return true to show const_vector is legal for use as either
     an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
     It will also update INFO to show how the immediate should be generated.
     WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
  is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  lane_count = width / GET_MODE_BITSIZE (info.elt_mode);

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
      /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
	 move immediate path.  */
      if (aarch64_float_const_zero_rtx_p (info.value))
	info.value = GEN_INT (0);
      else
	{
	  const unsigned int buf_size = 20;
	  char float_buf[buf_size] = {'\0'};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  if (lane_count == 1)
	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
	  else
	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
		      lane_count, element_char, float_buf);
	  return templ;
	}
    }

  gcc_assert (CONST_INT_P (info.value));

  if (which == AARCH64_CHECK_MOV)
    {
      mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
      shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
      if (lane_count == 1)
	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
		  mnemonic, UINTVAL (info.value));
      else if (info.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
		  element_char, UINTVAL (info.value), shift_op, info.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
		  element_char, UINTVAL (info.value));
    }
  else
    {
      /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
      mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
      if (info.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
		  element_char, UINTVAL (info.value), "lsl", info.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
		  element_char, UINTVAL (info.value));
    }
  return templ;
}
char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
{
  /* If a floating point number was passed and we desire to use it in an
     integer mode do the conversion to integer.  */
  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
    {
      unsigned HOST_WIDE_INT ival;
      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
	gcc_unreachable ();
      immediate = gen_int_mode (ival, mode);
    }

  machine_mode vmode;
  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (mode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, width);
}
/* Return the output string to use for moving immediate CONST_VECTOR
   into an SVE register.  */

char *
aarch64_output_sve_mov_immediate (rtx const_vector)
{
  static char templ[40];
  struct simd_immediate_info info;
  char element_char;

  bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));

  if (info.step)
    {
      snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
		element_char, INTVAL (info.value), INTVAL (info.step));
      return templ;
    }

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      if (aarch64_float_const_zero_rtx_p (info.value))
	info.value = GEN_INT (0);
      else
	{
	  const int buf_size = 20;
	  char float_buf[buf_size] = {};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
		    element_char, float_buf);
	  return templ;
	}
    }

  snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
	    element_char, INTVAL (info.value));
  return templ;
}
/* Return the asm format for a PTRUE instruction whose destination has
   mode MODE.  SUFFIX is the element size suffix.  */

char *
aarch64_output_ptrue (machine_mode mode, char suffix)
{
  unsigned int nunits;
  static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
  if (GET_MODE_NUNITS (mode).is_constant (&nunits))
    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
  else
    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
  return buf;
}
/* Split operands into moves from op[1] + op[2] into op[0].  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  unsigned int dest = REGNO (operands[0]);
  unsigned int src1 = REGNO (operands[1]);
  unsigned int src2 = REGNO (operands[2]);
  machine_mode halfmode = GET_MODE (operands[1]);
  unsigned int halfregs = REG_NREGS (operands[1]);
  rtx destlo, desthi;

  gcc_assert (halfmode == V16QImode);

  if (src1 == dest && src2 == dest + halfregs)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Preserve register attributes for variable tracking.  */
  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
			       GET_MODE_SIZE (halfmode));

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
    }
}
/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  unsigned int vec_flags;
  bool one_vector_p;
  bool testing_p;
};
/* Generate a variable permutation.  */

static void
aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);

  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
  gcc_checking_assert (GET_MODE (op0) == vmode);
  gcc_checking_assert (GET_MODE (op1) == vmode);
  gcc_checking_assert (GET_MODE (sel) == vmode);
  gcc_checking_assert (TARGET_SIMD);

  if (one_vector_p)
    {
      if (vmode == V8QImode)
	{
	  /* Expand the argument to a V16QI mode by duplicating it.  */
	  rtx pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
	}
      else
	emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
    }
  else
    {
      rtx pair;

      if (vmode == V8QImode)
	{
	  pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
	}
      else
	{
	  pair = gen_reg_rtx (OImode);
	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
	}
    }
}
/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
   NELT is the number of elements in the vector.  */

void
aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
			 unsigned int nelt)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);
  rtx mask;

  /* The TBL instruction does not use a modulo index, so we must take care
     of that ourselves.  */
  mask = aarch64_simd_gen_const_vector_dup (vmode,
					    one_vector_p ? nelt - 1 : 2 * nelt - 1);
  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);

  /* For big-endian, we also need to reverse the index within the vector
     (but not which vector).  */
  if (BYTES_BIG_ENDIAN)
    {
      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
      if (!one_vector_p)
	mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
      sel = expand_simple_binop (vmode, XOR, sel, mask,
				 NULL, 0, OPTAB_LIB_WIDEN);
    }
  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
}
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}
/* Expand an SVE vec_perm with the given operands.  */

void
aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  /* Enforced by the pattern condition.  */
  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  SVE TBL instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range.  */
  rtx sel_reg = force_reg (sel_mode, sel);

  /* Check if the sel only references the first values vector.  */
  if (GET_CODE (sel) == CONST_VECTOR
      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
      return;
    }

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
					 NULL, 0, OPTAB_DIRECT);
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
      return;
    }

  /* Run TBL on each value vector and combine the results.  */

  rtx res0 = gen_reg_rtx (data_mode);
  rtx res1 = gen_reg_rtx (data_mode);
  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
  if (GET_CODE (sel) != CONST_VECTOR
      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
						       2 * nunits - 1);
      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
				     NULL, 0, OPTAB_DIRECT);
    }
  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
				     NULL, 0, OPTAB_DIRECT);
  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
  else
    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
}
/* Recognize patterns suitable for the TRN instructions.  */
static bool
aarch64_evpc_trn (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 2, odd, 2)
      || !d->perm.series_p (1, 2, nelt + odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
  return true;
}
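/* A worked example (added for illustration; not part of the original
   source): with V4SImode inputs {a0,a1,a2,a3} and {b0,b1,b2,b3}, the
   permutation {0, 4, 2, 6} passes the tests above with odd == 0 and is
   emitted as TRN1, producing {a0, b0, a2, b2}.  */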
/* Recognize patterns suitable for the UZP instructions.  */
static bool
aarch64_evpc_uzp (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 1, odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
  return true;
}
/* Recognize patterns suitable for the ZIP instructions.  */
static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  unsigned int high;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;
  high = maybe_ne (first, 0U);

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
  return true;
}
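/* A worked example (added for illustration; not part of the original
   source): with V4SImode inputs {a0,a1,a2,a3} and {b0,b1,b2,b3}, the
   permutation {0, 4, 1, 5} satisfies the series tests with first == 0 and
   is emitted as ZIP1, producing the interleaved result {a0, b0, a1, b1}.  */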
/* Recognize patterns for the EXT insn.  */

static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT location;
  rtx offset;

  /* The first element always refers to the first vector.
     Check if the extracted indices are increasing by one.  */
  if (d->vec_flags == VEC_SVE_PRED
      || !d->perm[0].is_constant (&location)
      || !d->perm.series_p (0, 1, location, 1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.

     We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
    {
      /* After setup, we want the high elements of the first vector (stored
	 at the LSB end of the register), and the low elements of the second
	 vector (stored at the MSB end of the register).  So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
	 to_constant () is safe since this is restricted to Advanced SIMD
	 vectors.  */
      location = d->perm.length ().to_constant () - location;
    }

  offset = GEN_INT (location);
  emit_set_insn (d->target,
		 gen_rtx_UNSPEC (d->vmode,
				 gen_rtvec (3, d->op0, d->op1, offset),
				 UNSPEC_EXT));
  return true;
}
/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
   within each 64-bit, 32-bit or 16-bit granule.  */

static bool
aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned int i, size, unspec;
  machine_mode pred_mode;

  if (d->vec_flags == VEC_SVE_PRED
      || !d->one_vector_p
      || !d->perm[0].is_constant (&diff)
      || !diff)
    return false;

  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
  if (size == 8)
    {
      unspec = UNSPEC_REV64;
      pred_mode = VNx2BImode;
    }
  else if (size == 4)
    {
      unspec = UNSPEC_REV32;
      pred_mode = VNx4BImode;
    }
  else if (size == 2)
    {
      unspec = UNSPEC_REV16;
      pred_mode = VNx8BImode;
    }
  else
    return false;

  unsigned int step = diff + 1;
  for (i = 0; i < step; ++i)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
  if (d->vec_flags == VEC_SVE_DATA)
    {
      rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
      src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
			    UNSPEC_MERGE_PTRUE);
    }
  emit_set_insn (d->target, src);
  return true;
}
/* Recognize patterns for the REV insn, which reverses elements within
   a full vector.  */

static bool
aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();

  if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
    return false;

  if (!d->perm.series_p (0, 1, nelt - 1, -1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
  emit_set_insn (d->target, src);
  return true;
}
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  if (d->vec_flags == VEC_SVE_PRED
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();
  for (unsigned int i = 0; i < nelt; ++i)
    /* If big-endian and two vectors we end up with a weird mixed-endian
       mode on NEON.  Reverse the index within each word but not the word
       itself.  to_constant is safe because we checked is_constant above.  */
    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
			? d->perm[i].to_constant () ^ (nelt - 1)
			: d->perm[i].to_constant ());

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
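/* A worked example of the big-endian correction above (added for
   illustration; not part of the original source): for V16QImode,
   nelt - 1 is 15, so a requested index of 0 becomes 15 and 15 becomes 0,
   i.e. the index is reversed within the 16-byte word to compensate for
   the mixed-endian lane ordering of the TBL input registers.  */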
/* Try to implement D using an SVE TBL instruction.  */

static bool
aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
{
  unsigned HOST_WIDE_INT nelt;

  /* Permuting two variable-length vectors could overflow the
     index range.  */
  if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
    return false;

  if (d->testing_p)
    return true;

  machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
  return true;
}
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if ((d->vec_flags == VEC_ADVSIMD
       || d->vec_flags == VEC_SVE_DATA
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
      if (aarch64_evpc_rev_local (d))
	return true;
      else if (aarch64_evpc_rev_global (d))
	return true;
      else if (aarch64_evpc_ext (d))
	return true;
      else if (aarch64_evpc_dup (d))
	return true;
      else if (aarch64_evpc_zip (d))
	return true;
      else if (aarch64_evpc_uzp (d))
	return true;
      else if (aarch64_evpc_trn (d))
	return true;
      if (d->vec_flags == VEC_SVE_DATA)
	return aarch64_evpc_sve_tbl (d);
      else if (d->vec_flags == VEC_ADVSIMD)
	return aarch64_evpc_tbl (d);
    }
  return false;
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
				  rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (op0 && rtx_equal_p (op0, op1))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
		     sel.nelts_per_input ());
  d.vmode = vmode;
  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
  d.target = target;
  d.op0 = op0;
  d.op1 = op1;
  d.testing_p = !target;

  if (!d.testing_p)
    return aarch64_expand_vec_perm_const_1 (&d);

  rtx_insn *last = get_last_insn ();
  bool ret = aarch64_expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
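/* A worked example (added for illustration; not part of the original
   source): for V4SImode, usize is 4 and nunits is 4, so the loop above
   builds the byte selector
     { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   which byte-reverses each 4-byte element in place.  */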
/* Return true if X is a valid second operand for the SVE instruction
   that implements integer comparison OP_CODE.  */

static bool
aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
{
  if (register_operand (x, VOIDmode))
    return true;

  switch (op_code)
    {
    case LTU:
    case LEU:
    case GEU:
    case GTU:
      return aarch64_sve_cmp_immediate_p (x, false);

    case LT:
    case LE:
    case GE:
    case GT:
    case NE:
    case EQ:
      return aarch64_sve_cmp_immediate_p (x, true);

    default:
      gcc_unreachable ();
    }
}
/* Use predicated SVE instructions to implement the equivalent of:

     (set TARGET OP)

   given that PTRUE is an all-true predicate of the appropriate mode.  */

static void
aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
{
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
			       gen_rtvec (2, ptrue, op),
			       UNSPEC_MERGE_PTRUE);
  rtx_insn *insn = emit_set_insn (target, unspec);
  set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
}
/* Likewise, but also clobber the condition codes.  */

static void
aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
{
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
			       gen_rtvec (2, ptrue, op),
			       UNSPEC_MERGE_PTRUE);
  rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
  set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
}
/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_NE;
    case EQ:
      return UNSPEC_COND_EQ;
    case LT:
      return UNSPEC_COND_LT;
    case GT:
      return UNSPEC_COND_GT;
    case LE:
      return UNSPEC_COND_LE;
    case GE:
      return UNSPEC_COND_GE;
    default:
      gcc_unreachable ();
    }
}
/* Emit:

      (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.  This form
   of instruction is used when (and (CODE OP0 OP1) PRED) would have different
   semantics, such as when PRED might not be all-true and when comparing
   inactive lanes could have side effects.  */

static void
aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
				  rtx pred, rtx op0, rtx op1)
{
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (3, pred, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  if (!aarch64_sve_cmp_operand_p (code, op1))
    op1 = force_reg (data_mode, op1);

  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
  aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
}
/* Emit the SVE equivalent of:

      (set TMP1 (CODE1 OP0 OP1))
      (set TMP2 (CODE2 OP0 OP1))
      (set TARGET (ior:PRED_MODE TMP1 TMP2))

   PTRUE is an all-true predicate with the same mode as TARGET.  */

static void
aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
			   rtx ptrue, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (ptrue);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_ptrue_op (tmp1, ptrue,
			     gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_ptrue_op (tmp2, ptrue,
			     gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}
/* Emit the SVE equivalent of:

      (set TMP (CODE OP0 OP1))
      (set TARGET (not TMP))

   PTRUE is an all-true predicate with the same mode as TARGET.  */

static void
aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
				rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (ptrue);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_ptrue_op (tmp, ptrue,
			     gen_rtx_fmt_ee (code, pred_mode, op0, op1));
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}
/* Expand an SVE floating-point comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1))

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */

bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
				  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      /* fall through */
    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      {
	/* There is native support for the comparison.  */
	rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
	aarch64_emit_sve_ptrue_op (target, ptrue, cond);
	return false;
      }

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
	{
	  /* This would trap for signaling NaNs.  */
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
	  return false;
	}
      /* fall through */
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      if (flag_trapping_math)
	{
	  /* Work out which elements are ordered.  */
	  rtx ordered = gen_reg_rtx (pred_mode);
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);

	  /* Test the opposite condition for the ordered elements,
	     then invert the result.  */
	  if (code == UNEQ)
	    code = NE;
	  else
	    code = reverse_condition_maybe_unordered (code);
	  if (can_invert_p)
	    {
	      aarch64_emit_sve_predicated_cond (target, code,
						ordered, op0, op1);
	      return true;
	    }
	  rtx tmp = gen_reg_rtx (pred_mode);
	  aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
	  aarch64_emit_unop (target, one_cmpl_optab, tmp);
	  return false;
	}
      break;

    case ORDERED:
      /* ORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      break;

    default:
      gcc_unreachable ();
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (can_invert_p)
    {
      rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
      aarch64_emit_sve_ptrue_op (target, ptrue, cond);
      return true;
    }
  aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
  return false;
}
/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
   of the data being selected and CMP_MODE is the mode of the values being
   compared.  */

void
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
			  rtx *ops)
{
  machine_mode pred_mode
    = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
			     GET_MODE_SIZE (cmp_mode)).require ();
  rtx pred = gen_reg_rtx (pred_mode);
  if (FLOAT_MODE_P (cmp_mode))
    {
      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
					    ops[4], ops[5], true))
	std::swap (ops[1], ops[2]);
    }
  else
    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);

  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
}
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
/* Expand movmem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_movmem (rtx *operands)
{
  int n, mode_bits;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  machine_mode cur_mode = BLKmode, next_mode;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = INTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Convert n to bits to make the rest of the code simpler.  */
  n = n * BITS_PER_UNIT;

  while (n > 0)
    {
      /* Find the largest mode in which to do the copy in without over reading
	 or writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
	  cur_mode = mode_iter.require ();

      gcc_assert (cur_mode != BLKmode);

      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);

      n -= mode_bits;

      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper, i.e. fewer instructions to do so.  For instance doing a 15
	 byte copy it's more efficient to do two overlapping 8 byte copies than
	 8 + 4 + 2 + 1.  */
      next_mode = smallest_mode_for_size (n, MODE_INT);
      int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
      if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
	{
	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
	  n = n_bits;
	}
    }

  return true;
}
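/* A worked example of the overlap trick (added for illustration; not part
   of the original source): for a 15-byte copy the loop first emits one
   8-byte (DImode) move for bytes [0, 8).  The remaining 7 bytes would
   otherwise need 4 + 2 + 1 byte moves, but the code above instead backs
   both pointers up by one byte and emits a single 8-byte move covering
   bytes [7, 15), so the whole copy is just two load/store pairs.  */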
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
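/* A hypothetical C-level store that benefits from this split (added for
   illustration; not part of the original source):

     void f (unsigned long long *p) { *p = 0x0140c0da0140c0daULL; }

   The low and high 32-bit halves are both 0x0140c0da, so the constant is
   built once in a w register and stored twice, exactly as in the MOV/MOVK
   example in the comment above.  */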
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  return (HOST_WIDE_INT_1 << 36);
}
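/* For reference (added for illustration; not part of the original source):
   AddressSanitizer maps an application address to its shadow byte as

     shadow = (addr >> 3) + offset

   so on AArch64 the shadow region starts at 1 << 36.  */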
static rtx
aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
			int code, tree treeop0, tree treeop1)
{
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  rtx op0, op1;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[4];

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_cmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_cmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  create_fixed_operand (&ops[0], op0);
  create_fixed_operand (&ops[1], op1);

  start_sequence ();
  if (!maybe_expand_insn (icode, 2, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
}
static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_ccmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_ccmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
						GET_MODE (XEXP (prev, 0))),
			     VOIDmode, XEXP (prev, 0), const0_rtx);
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx set_dest;
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
	 prev (mov)  == (set (reg r0) (const_int imm16))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 16))
			     (const_int imm16_1))  */

      set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
	  && CONST_INT_P (SET_SRC (curr_set))
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (XEXP (set_dest, 2))
	  && INTVAL (XEXP (set_dest, 2)) == 16
	  && REG_P (XEXP (set_dest, 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r1)
			     (high (symbol_ref ("SYM"))))
	 curr (add) == (set (reg r0)
			    (lo_sum (reg r1)
				    (symbol_ref ("SYM"))))
	 Note that r0 need not necessarily be the same as r1, especially
	 during pre-regalloc scheduling.  */

      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
	      && REG_P (XEXP (SET_SRC (curr_set), 0))
	      && REGNO (XEXP (SET_SRC (curr_set), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
			      XEXP (SET_SRC (curr_set), 1)))
	    return true;
	}
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {
      /* We're trying to match:
	 prev (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 32))
			     (const_int imm16_1))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 48))
			     (const_int imm16_2))  */

      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
	  && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
	  && REG_P (XEXP (SET_DEST (prev_set), 0))
	  && REG_P (XEXP (SET_DEST (curr_set), 0))
	  && REGNO (XEXP (SET_DEST (prev_set), 0))
	     == REGNO (XEXP (SET_DEST (curr_set), 0))
	  && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
	  && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
	  && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
	  && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (SET_SRC (curr_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r0)
			     (high (symbol_ref ("SYM"))))
	 curr (ldr) == (set (reg r1)
			    (mem (lo_sum (reg r0)
					 (symbol_ref ("SYM")))))
		 or
	 curr (ldr) == (set (reg r1)
			    (zero_extend (mem
					  (lo_sum (reg r0)
						  (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  rtx curr_src = SET_SRC (curr_set);

	  if (GET_CODE (curr_src) == ZERO_EXTEND)
	    curr_src = XEXP (curr_src, 0);

	  if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
	      && REG_P (XEXP (XEXP (curr_src, 0), 0))
	      && REGNO (XEXP (XEXP (curr_src, 0), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
			      XEXP (SET_SRC (prev_set), 0)))
	    return true;
	}
    }

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
      && aarch_crypto_can_dual_issue (prev, curr))
    return true;

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && any_condjump_p (curr))
    {
      enum attr_type prev_type = get_attr_type (prev);

      unsigned int condreg1, condreg2;
      rtx cc_reg_1;
      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);

      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
	  && prev
	  && modified_in_p (cc_reg_1, prev))
	{
	  /* FIXME: this misses some which are considered simple arithmetic
	     instructions for ThunderX.  Simple shifts are missed here.  */
	  if (prev_type == TYPE_ALUS_SREG
	      || prev_type == TYPE_ALUS_IMM
	      || prev_type == TYPE_LOGICS_REG
	      || prev_type == TYPE_LOGICS_IMM)
	    return true;
	}
    }

  if (prev_set && curr_set
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
      && any_condjump_p (curr))
    {
      /* We're trying to match:
	 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
	 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
						       (const_int 0))
					       (label_ref ("SYM"))
					       (pc)))  */
      if (SET_DEST (curr_set) == (pc_rtx)
	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (SET_DEST (prev_set))
	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
	{
	  /* Fuse ALU operations followed by conditional branch instruction.  */
	  switch (get_attr_type (prev))
	    {
	    case TYPE_ALU_SREG:
	    case TYPE_ADCS_REG:
	    case TYPE_ADCS_IMM:
	    case TYPE_LOGIC_REG:
	    case TYPE_LOGIC_IMM:
	    case TYPE_SHIFT_REG:
	    case TYPE_SHIFT_IMM:
	      return true;

	    default:;
	    }
	}
    }

  return false;
}
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }

  if (GET_CODE (src) == MEM && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other types of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */

static void
aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
			       int *fusion_pri, int *pri)
{
  int tmp, off_val;
  rtx base, offset;
  enum sched_fusion_type fusion;

  gcc_assert (INSN_P (insn));

  tmp = max_pri - 1;
  fusion = fusion_load_store (insn, &base, &offset);
  if (fusion == SCHED_FUSION_NONE)
    {
      *pri = tmp;
      *fusion_pri = tmp;
      return;
    }

  /* Set FUSION_PRI according to fusion type and base register.  */
  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);

  /* Calculate PRI.  */
  tmp /= 2;

  /* INSN with smaller offset goes first.  */
  off_val = (int)(INTVAL (offset));
  if (off_val >= 0)
    tmp -= (off_val & 0xfffff);
  else
    tmp += ((- off_val) & 0xfffff);

  *pri = tmp;
  return;
}
16817 Adjust priority of sha1h instructions so they are scheduled before
16818 other SHA1 instructions. */
16821 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
16823 rtx x
= PATTERN (insn
);
16825 if (GET_CODE (x
) == SET
)
16829 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
16830 return priority
+ 10;
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  /* The operands must be of the same size.  */
  gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
			GET_MODE_SIZE (GET_MODE (mem_2))));

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  /* We should only be trying this for fixed-sized modes.  There is no
     SVE LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode).to_constant ();
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  /* One of the memory accesses must be a mempair operand.
     If it is not the first one, they need to be swapped by the
     peephole.  */
  if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
      && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
/* Given OPERANDS of consecutive load/store that can be merged,
   swap them if they are not in ascending order.  */
void
aarch64_swap_ldrstr_operands (rtx *operands, bool load)
{
  rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
  HOST_WIDE_INT offval_1, offval_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
    }

  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);

  if (offval_1 > offval_2)
    {
      /* Irrespective of whether this is a load or a store,
	 we do the same swap.  */
      std::swap (operands[0], operands[2]);
      std::swap (operands[1], operands[3]);
    }
}
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}
/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs.

   Return:

	1 iff offset (X) > offset (Y)
	0 iff offset (X) == offset (Y)
	-1 iff offset (X) < offset (Y)  */

static int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx * operands_1 = (const rtx *) x;
  const rtx * operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
17017 /* Given OPERANDS of consecutive load/store, check if we can merge
17018 them into ldp/stp by adjusting the offset. LOAD is true if they
17019 are load instructions. MODE is the mode of memory operands.
17021 Given below consecutive stores:
17023 str w1, [xb, 0x100]
17024 str w1, [xb, 0x104]
17025 str w1, [xb, 0x108]
17026 str w1, [xb, 0x10c]
17028 Though the offsets are out of the range supported by stp, we can
17029 still pair them after adjusting the offset, like:
17031 add scratch, xb, 0x100
17032 stp w1, w1, [scratch]
17033 stp w1, w1, [scratch, 0x8]
17035 The peephole patterns detecting this opportunity should guarantee
17036 the scratch register is avaliable. */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       scalar_mode mode)
{
  const int num_insns = 4;
  enum reg_class rclass;
  HOST_WIDE_INT offvals[num_insns], msize;
  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];

  if (load)
    {
      for (int i = 0; i < num_insns; i++)
	{
	  reg[i] = operands[2 * i];
	  mem[i] = operands[2 * i + 1];

	  gcc_assert (REG_P (reg[i]));
	}

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
	for (int j = i + 2; j < 8; j += 2)
	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
	    return false;
    }
  else
    for (int i = 0; i < num_insns; i++)
      {
	mem[i] = operands[2 * i];
	reg[i] = operands[2 * i + 1];
      }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
    return false;

  for (int i = 0; i < num_insns; i++)
    {
      /* The mems cannot be volatile.  */
      if (MEM_VOLATILE_P (mem[i]))
	return false;

      /* Check if the addresses are in the form of [base+offset].  */
      extract_base_offset_in_addr (mem[i], base + i, offset + i);
      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
	return false;
    }

  /* Check if addresses are clobbered by load.  */
  if (load)
    for (int i = 0; i < num_insns; i++)
      if (reg_mentioned_p (reg[i], mem[i]))
	return false;

  /* Check if the bases are same.  */
  for (int i = 0; i < num_insns - 1; i++)
    if (!rtx_equal_p (base[i], base[i + 1]))
      return false;

  for (int i = 0; i < num_insns; i++)
    offvals[i] = INTVAL (offset[i]);

  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
	 aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
	&& offvals[3] == offvals[2] + msize))
    return false;

  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;
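
  /* For illustration: for SImode accesses msize is 4, so the sorted
     offsets must all lie within 4 * 0x80 == 512 bytes of the lowest
     one; the four stores at xb + 0x100 .. xb + 0x10c above easily
     satisfy this.  */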

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the registers are of same class.  */
  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
	   ? FP_REGS : GENERAL_REGS;

  for (int i = 1; i < num_insns; i++)
    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
      {
	if (rclass != FP_REGS)
	  return false;
      }
    else
      {
	if (rclass != GENERAL_REGS)
	  return false;
      }

  return true;
}

/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands; it is SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
		stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);

  if (load)
    {
      mem_1 = temp_operands[1];
      mem_2 = temp_operands[3];
      mem_3 = temp_operands[5];
      mem_4 = temp_operands[7];
    }
  else
    {
      mem_1 = temp_operands[0];
      mem_2 = temp_operands[2];
      mem_3 = temp_operands[4];
      mem_4 = temp_operands[6];
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
	      && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;
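
  /* For illustration: with msize == 8 (DImode) these limits are
     8 * 0x3f == 504 and -8 * 0x40 == -512, matching the scaled 7-bit
     signed immediate field of LDP/STP.  */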

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes (DF, DI and vector modes), we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;
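
  /* For illustration: with msize == 4, off_val_1 == 0x100 and
     off_val_3 == 0x104, base_off starts as 0x102; the adjustment above
     adds ((2 - 0) + 4) % 4 == 2, giving 0x104, which is again congruent
     to off_val_1 modulo msize.  Because the earlier checks in
     aarch64_operands_adjust_ok_for_ldpstp guarantee that
     off_val_3 - off_val_1 is a multiple of msize, the only misalignment
     the averaging can introduce is exactly msize / 2.  */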

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
						  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
						  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
						  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
						  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}

/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */
static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */
static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}

/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */
int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
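
/* For illustration: a CONST_DOUBLE of 8.0 yields exact_log2 (8) == 3,
   while -8.0, NaN, infinities, non-integral values such as 6.5, and
   integers that are not powers of 2 such as 6.0 all yield -1.  */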

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}

/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */
static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
					int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */
static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */
static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}

/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */
17485 aarch64_excess_precision (enum excess_precision_type type
)
17489 case EXCESS_PRECISION_TYPE_FAST
:
17490 case EXCESS_PRECISION_TYPE_STANDARD
:
17491 /* We can calculate either in 16-bit range and precision or
17492 32-bit range and precision. Make that decision based on whether
17493 we have native support for the ARMv8.2-A 16-bit floating-point
17494 instructions or not. */
17495 return (TARGET_FP_F16INST
17496 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17497 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
17498 case EXCESS_PRECISION_TYPE_IMPLICIT
:
17499 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
17501 gcc_unreachable ();
17503 return FLT_EVAL_METHOD_UNPREDICTABLE
;
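
/* For illustration: without the ARMv8.2-A FP16 instructions, given
   _Float16 a, b, the product a * b is evaluated in float and only
   rounded back to _Float16 on assignment or explicit conversion;
   with TARGET_FP_F16INST it is evaluated directly in 16-bit
   precision.  */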

/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */
static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
    case TYPE_SDIV:
    case TYPE_UDIV:
    case TYPE_FDIVS:
    case TYPE_FDIVD:
    case TYPE_FSQRTS:
    case TYPE_FSQRTD:
    case TYPE_NEON_FP_SQRT_S:
    case TYPE_NEON_FP_SQRT_D:
    case TYPE_NEON_FP_SQRT_S_Q:
    case TYPE_NEON_FP_SQRT_D_Q:
    case TYPE_NEON_FP_DIV_S:
    case TYPE_NEON_FP_DIV_D:
    case TYPE_NEON_FP_DIV_S_Q:
    case TYPE_NEON_FP_DIV_D_Q:
      return false;
    default:
      return true;
    }
}

/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  if (BYTES_BIG_ENDIAN)
    {
      bool from_sve_p = aarch64_sve_data_mode_p (from);
      bool to_sve_p = aarch64_sve_data_mode_p (to);

      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
	return false;
    }
  return true;
}
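
/* For illustration: on a big-endian target this rejects, say, viewing
   an SVE vector of 32-bit elements as a vector of 16-bit elements via
   a subreg, since lane 0 of the two views would occupy different bytes
   of the register.  */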

/* Implement TARGET_EARLY_REMAT_MODES.  */
static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    {
      machine_mode mode = (machine_mode) i;
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_ANY_SVE)
	bitmap_set_bit (modes, i);
    }
}

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"