1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
77 #include "function-abi.h"
79 /* This file should be included last. */
80 #include "target-def.h"
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
88 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
89 enum modifier_type
{ LSL
, MSL
};
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode
, rtx
);
93 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
94 insn_type
= MOV
, modifier_type
= LSL
,
96 simd_immediate_info (scalar_mode
, rtx
, rtx
);
97 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
99 /* The mode of the elements. */
100 scalar_mode elt_mode
;
102 /* The instruction to use to move the immediate into a vector. */
107 /* For MOV and MVN. */
110 /* The value of each element. */
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier
;
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
128 aarch64_svpattern pattern
;
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
136 : elt_mode (elt_mode_in
), insn (MOV
)
138 u
.mov
.value
= value_in
;
139 u
.mov
.modifier
= LSL
;
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
148 unsigned HOST_WIDE_INT value_in
,
149 insn_type insn_in
, modifier_type modifier_in
,
150 unsigned int shift_in
)
151 : elt_mode (elt_mode_in
), insn (insn_in
)
153 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
154 u
.mov
.modifier
= modifier_in
;
155 u
.mov
.shift
= shift_in
;
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
162 : elt_mode (elt_mode_in
), insn (INDEX
)
164 u
.index
.base
= base_in
;
165 u
.index
.step
= step_in
;
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
172 aarch64_svpattern pattern_in
)
173 : elt_mode (elt_mode_in
), insn (PTRUE
)
175 u
.pattern
= pattern_in
;
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel
;
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg
;
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
189 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
192 machine_mode
*, int *,
194 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
195 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode
);
198 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
203 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
204 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
205 aarch64_addr_query_type
);
206 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version
;
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune
= cortexa53
;
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags
= 0;
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads
;
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer
;
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string
= NULL
;
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  /* Flag name as spelled on the command line.  */
  const char* name;
  /* Bitmask value the name maps to.  */
  unsigned int flag;
};
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
241 { "none", AARCH64_FUSE_NOTHING
},
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL
},
244 { NULL
, AARCH64_FUSE_NOTHING
}
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
251 { "none", AARCH64_EXTRA_TUNE_NONE
},
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL
},
254 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
257 /* Tuning parameters. */
259 static const struct cpu_addrcost_table generic_addrcost_table
=
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
275 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
291 static const struct cpu_addrcost_table xgene1_addrcost_table
=
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
323 static const struct cpu_addrcost_table tsv110_addrcost_table
=
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
355 static const struct cpu_regmove_cost generic_regmove_cost
=
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
365 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
375 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
385 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (actual, 4 and 9). */
395 static const struct cpu_regmove_cost thunderx_regmove_cost
=
403 static const struct cpu_regmove_cost xgene1_regmove_cost
=
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
416 /* Avoid the use of int<->fp moves for spilling. */
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
425 /* Avoid the use of int<->fp moves for spilling. */
431 static const struct cpu_regmove_cost tsv110_regmove_cost
=
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost
=
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost
=
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost
=
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
501 static const struct cpu_vector_cost tsv110_vector_cost
=
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
520 /* Generic costs for vector insn classes. */
521 static const struct cpu_vector_cost cortexa57_vector_cost
=
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
540 static const struct cpu_vector_cost exynosm1_vector_cost
=
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
559 /* Generic costs for vector insn classes. */
560 static const struct cpu_vector_cost xgene1_vector_cost
=
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
579 /* Costs for vector insn classes for Vulcan. */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 3, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost
=
602 1, /* Predictable. */
603 3 /* Unpredictable. */
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes
=
609 AARCH64_APPROX_NONE
, /* division */
610 AARCH64_APPROX_NONE
, /* sqrt */
611 AARCH64_APPROX_NONE
/* recip_sqrt */
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes
=
617 AARCH64_APPROX_NONE
, /* division */
618 AARCH64_APPROX_ALL
, /* sqrt */
619 AARCH64_APPROX_ALL
/* recip_sqrt */
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes
=
625 AARCH64_APPROX_NONE
, /* division */
626 AARCH64_APPROX_NONE
, /* sqrt */
627 AARCH64_APPROX_ALL
/* recip_sqrt */
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune
=
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
642 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
675 static const cpu_prefetch_tune thunderx_prefetch_tune
=
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
697 static const cpu_prefetch_tune tsv110_prefetch_tune
=
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
708 static const cpu_prefetch_tune xgene1_prefetch_tune
=
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
719 static const struct tune_params generic_tunings
=
721 &cortexa57_extra_costs
,
722 &generic_addrcost_table
,
723 &generic_regmove_cost
,
724 &generic_vector_cost
,
725 &generic_branch_cost
,
726 &generic_approx_modes
,
727 SVE_NOT_IMPLEMENTED
, /* sve_width */
730 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
742 &generic_prefetch_tune
745 static const struct tune_params cortexa35_tunings
=
747 &cortexa53_extra_costs
,
748 &generic_addrcost_table
,
749 &cortexa53_regmove_cost
,
750 &generic_vector_cost
,
751 &generic_branch_cost
,
752 &generic_approx_modes
,
753 SVE_NOT_IMPLEMENTED
, /* sve_width */
756 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
769 &generic_prefetch_tune
772 static const struct tune_params cortexa53_tunings
=
774 &cortexa53_extra_costs
,
775 &generic_addrcost_table
,
776 &cortexa53_regmove_cost
,
777 &generic_vector_cost
,
778 &generic_branch_cost
,
779 &generic_approx_modes
,
780 SVE_NOT_IMPLEMENTED
, /* sve_width */
783 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params cortexa57_tunings
=
801 &cortexa57_extra_costs
,
802 &generic_addrcost_table
,
803 &cortexa57_regmove_cost
,
804 &cortexa57_vector_cost
,
805 &generic_branch_cost
,
806 &generic_approx_modes
,
807 SVE_NOT_IMPLEMENTED
, /* sve_width */
810 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
823 &generic_prefetch_tune
826 static const struct tune_params cortexa72_tunings
=
828 &cortexa57_extra_costs
,
829 &generic_addrcost_table
,
830 &cortexa57_regmove_cost
,
831 &cortexa57_vector_cost
,
832 &generic_branch_cost
,
833 &generic_approx_modes
,
834 SVE_NOT_IMPLEMENTED
, /* sve_width */
837 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
850 &generic_prefetch_tune
853 static const struct tune_params cortexa73_tunings
=
855 &cortexa57_extra_costs
,
856 &generic_addrcost_table
,
857 &cortexa57_regmove_cost
,
858 &cortexa57_vector_cost
,
859 &generic_branch_cost
,
860 &generic_approx_modes
,
861 SVE_NOT_IMPLEMENTED
, /* sve_width */
862 4, /* memmov_cost. */
864 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
877 &generic_prefetch_tune
882 static const struct tune_params exynosm1_tunings
=
884 &exynosm1_extra_costs
,
885 &exynosm1_addrcost_table
,
886 &exynosm1_regmove_cost
,
887 &exynosm1_vector_cost
,
888 &generic_branch_cost
,
889 &exynosm1_approx_modes
,
890 SVE_NOT_IMPLEMENTED
, /* sve_width */
893 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
905 &exynosm1_prefetch_tune
908 static const struct tune_params thunderxt88_tunings
=
910 &thunderx_extra_costs
,
911 &generic_addrcost_table
,
912 &thunderx_regmove_cost
,
913 &thunderx_vector_cost
,
914 &generic_branch_cost
,
915 &generic_approx_modes
,
916 SVE_NOT_IMPLEMENTED
, /* sve_width */
919 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
931 &thunderxt88_prefetch_tune
934 static const struct tune_params thunderx_tunings
=
936 &thunderx_extra_costs
,
937 &generic_addrcost_table
,
938 &thunderx_regmove_cost
,
939 &thunderx_vector_cost
,
940 &generic_branch_cost
,
941 &generic_approx_modes
,
942 SVE_NOT_IMPLEMENTED
, /* sve_width */
945 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
958 &thunderx_prefetch_tune
961 static const struct tune_params tsv110_tunings
=
964 &tsv110_addrcost_table
,
965 &tsv110_regmove_cost
,
967 &generic_branch_cost
,
968 &generic_approx_modes
,
969 SVE_NOT_IMPLEMENTED
, /* sve_width */
972 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
985 &tsv110_prefetch_tune
988 static const struct tune_params xgene1_tunings
=
991 &xgene1_addrcost_table
,
992 &xgene1_regmove_cost
,
994 &generic_branch_cost
,
995 &xgene1_approx_modes
,
996 SVE_NOT_IMPLEMENTED
, /* sve_width */
999 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1011 &xgene1_prefetch_tune
1014 static const struct tune_params emag_tunings
=
1016 &xgene1_extra_costs
,
1017 &xgene1_addrcost_table
,
1018 &xgene1_regmove_cost
,
1019 &xgene1_vector_cost
,
1020 &generic_branch_cost
,
1021 &xgene1_approx_modes
,
1022 SVE_NOT_IMPLEMENTED
,
1023 6, /* memmov_cost */
1025 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1037 &xgene1_prefetch_tune
1040 static const struct tune_params qdf24xx_tunings
=
1042 &qdf24xx_extra_costs
,
1043 &qdf24xx_addrcost_table
,
1044 &qdf24xx_regmove_cost
,
1045 &qdf24xx_vector_cost
,
1046 &generic_branch_cost
,
1047 &generic_approx_modes
,
1048 SVE_NOT_IMPLEMENTED
, /* sve_width */
1049 4, /* memmov_cost */
1051 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1067 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1069 static const struct tune_params saphira_tunings
=
1071 &generic_extra_costs
,
1072 &generic_addrcost_table
,
1073 &generic_regmove_cost
,
1074 &generic_vector_cost
,
1075 &generic_branch_cost
,
1076 &generic_approx_modes
,
1077 SVE_NOT_IMPLEMENTED
, /* sve_width */
1078 4, /* memmov_cost */
1080 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1093 &generic_prefetch_tune
1096 static const struct tune_params thunderx2t99_tunings
=
1098 &thunderx2t99_extra_costs
,
1099 &thunderx2t99_addrcost_table
,
1100 &thunderx2t99_regmove_cost
,
1101 &thunderx2t99_vector_cost
,
1102 &generic_branch_cost
,
1103 &generic_approx_modes
,
1104 SVE_NOT_IMPLEMENTED
, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1123 static const struct tune_params neoversen1_tunings
=
1125 &cortexa57_extra_costs
,
1126 &generic_addrcost_table
,
1127 &generic_regmove_cost
,
1128 &cortexa57_vector_cost
,
1129 &generic_branch_cost
,
1130 &generic_approx_modes
,
1131 SVE_NOT_IMPLEMENTED
, /* sve_width */
1132 4, /* memmov_cost */
1134 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1146 &generic_prefetch_tune
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  /* Name of the option token handled by this entry (e.g. "fuse").  */
  const char* name;
  /* Parser for the option's value string; updates the tuning structure.  */
  void (*parse_override)(const char*, struct tune_params*);
};
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions
[] =
1163 { "fuse", aarch64_parse_fuse_string
},
1164 { "tune", aarch64_parse_tune_string
},
1165 { "sve_width", aarch64_parse_sve_width_string
},
1169 /* A processor implementing AArch64. */
1172 const char *const name
;
1173 enum aarch64_processor ident
;
1174 enum aarch64_processor sched_core
;
1175 enum aarch64_arch arch
;
1176 unsigned architecture_version
;
1177 const uint64_t flags
;
1178 const struct tune_params
*const tune
;
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures
[] =
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores
[] =
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1199 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1200 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor
*selected_arch
;
1207 static const struct processor
*selected_cpu
;
1208 static const struct processor
*selected_tune
;
1210 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params
= generic_tunings
;
1215 /* Table of machine attributes. */
1216 static const struct attribute_spec aarch64_attribute_table
[] =
1218 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1219 affects_type_identity, handler, exclude } */
1220 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL
, NULL
},
1221 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1224 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;		/* Extension name, e.g. "sve".  */
  const unsigned long flags_on;		/* Flags set when the extension is on.  */
  const unsigned long flags_off;	/* Flags cleared when it is off.  */
};
/* Condition codes, in the same order as the architectural encoding so
   that AARCH64_INVERSE_CONDITION_CODE can flip the low bit.  */
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;
1242 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1244 struct aarch64_branch_protect_type
1246 /* The type's name that the user passes to the branch-protection option
1249 /* Function to handle the protection type and set global variables.
1250 First argument is the string token corresponding with this type and the
1251 second argument is the next token in the option string.
1253 * AARCH64_PARSE_OK: Handling was sucessful.
1254 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1255 should print an error.
1256 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1258 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1259 /* A list of types that can follow this type in the option string. */
1260 const aarch64_branch_protect_type
* subtypes
;
1261 unsigned int num_subtypes
;
1264 static enum aarch64_parse_opt_result
1265 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1267 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1268 aarch64_enable_bti
= 0;
1271 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1272 return AARCH64_PARSE_INVALID_FEATURE
;
1274 return AARCH64_PARSE_OK
;
1277 static enum aarch64_parse_opt_result
1278 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1280 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1281 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1282 aarch64_enable_bti
= 1;
1285 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1286 return AARCH64_PARSE_INVALID_FEATURE
;
1288 return AARCH64_PARSE_OK
;
1291 static enum aarch64_parse_opt_result
1292 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1293 char* rest ATTRIBUTE_UNUSED
)
1295 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1296 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1297 return AARCH64_PARSE_OK
;
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1302 char* rest ATTRIBUTE_UNUSED
)
1304 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1305 return AARCH64_PARSE_OK
;
1308 static enum aarch64_parse_opt_result
1309 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1310 char* rest ATTRIBUTE_UNUSED
)
1312 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1313 return AARCH64_PARSE_OK
;
1316 static enum aarch64_parse_opt_result
1317 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1318 char* rest ATTRIBUTE_UNUSED
)
1320 aarch64_enable_bti
= 1;
1321 return AARCH64_PARSE_OK
;
1324 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1325 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1326 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1327 { NULL
, NULL
, NULL
, 0 }
1330 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1331 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1332 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1333 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1334 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1335 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1336 { NULL
, NULL
, NULL
, 0 }
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
1353 /* Return the assembly token for svpattern value VALUE. */
1356 svpattern_token (enum aarch64_svpattern pattern
)
1360 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1361 AARCH64_FOR_SVPATTERN (CASE
)
1363 case AARCH64_NUM_SVPATTERNS
:
1369 /* Return the descriptor of the SIMD ABI. */
1371 static const predefined_function_abi
&
1372 aarch64_simd_abi (void)
1374 predefined_function_abi
&simd_abi
= function_abis
[ARM_PCS_SIMD
];
1375 if (!simd_abi
.initialized_p ())
1377 HARD_REG_SET full_reg_clobbers
1378 = default_function_abi
.full_reg_clobbers ();
1379 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1380 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1381 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1382 simd_abi
.initialize (ARM_PCS_SIMD
, full_reg_clobbers
);
1387 /* Generate code to enable conditional branches in functions over 1 MiB. */
1389 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1390 const char * branch_format
)
1392 rtx_code_label
* tmp_label
= gen_label_rtx ();
1393 char label_buf
[256];
1395 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1396 CODE_LABEL_NUMBER (tmp_label
));
1397 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1398 rtx dest_label
= operands
[pos_label
];
1399 operands
[pos_label
] = tmp_label
;
1401 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1402 output_asm_insn (buffer
, operands
);
1404 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1405 operands
[pos_label
] = dest_label
;
1406 output_asm_insn (buffer
, operands
);
1411 aarch64_err_no_fpadvsimd (machine_mode mode
)
1413 if (TARGET_GENERAL_REGS_ONLY
)
1414 if (FLOAT_MODE_P (mode
))
1415 error ("%qs is incompatible with the use of floating-point types",
1416 "-mgeneral-regs-only");
1418 error ("%qs is incompatible with the use of vector types",
1419 "-mgeneral-regs-only");
1421 if (FLOAT_MODE_P (mode
))
1422 error ("%qs feature modifier is incompatible with the use of"
1423 " floating-point types", "+nofp");
1425 error ("%qs feature modifier is incompatible with the use of"
1426 " vector types", "+nofp");
1429 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1430 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1431 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1432 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1433 and GENERAL_REGS is lower than the memory cost (in this case the best class
1434 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1435 cost results in bad allocations with many redundant int<->FP moves which
1436 are expensive on various cores.
1437 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1438 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1439 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1440 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1441 The result of this is that it is no longer inefficient to have a higher
1442 memory move cost than the register move cost.
1446 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1447 reg_class_t best_class
)
1451 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1452 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1453 return allocno_class
;
1455 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1456 || !reg_class_subset_p (FP_REGS
, best_class
))
1459 mode
= PSEUDO_REGNO_MODE (regno
);
1460 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1464 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1466 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1467 return aarch64_tune_params
.min_div_recip_mul_sf
;
1468 return aarch64_tune_params
.min_div_recip_mul_df
;
1471 /* Return the reassociation width of treeop OPC with mode MODE. */
1473 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1475 if (VECTOR_MODE_P (mode
))
1476 return aarch64_tune_params
.vec_reassoc_width
;
1477 if (INTEGRAL_MODE_P (mode
))
1478 return aarch64_tune_params
.int_reassoc_width
;
1479 /* Avoid reassociating floating point addition so we emit more FMAs. */
1480 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1481 return aarch64_tune_params
.fp_reassoc_width
;
1485 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1487 aarch64_dbx_register_number (unsigned regno
)
1489 if (GP_REGNUM_P (regno
))
1490 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1491 else if (regno
== SP_REGNUM
)
1492 return AARCH64_DWARF_SP
;
1493 else if (FP_REGNUM_P (regno
))
1494 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1495 else if (PR_REGNUM_P (regno
))
1496 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1497 else if (regno
== VG_REGNUM
)
1498 return AARCH64_DWARF_VG
;
1500 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1501 equivalent DWARF register. */
1502 return DWARF_FRAME_REGISTERS
;
1505 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1506 integer, otherwise return X unmodified. */
1508 aarch64_bit_representation (rtx x
)
1510 if (CONST_DOUBLE_P (x
))
1511 x
= gen_lowpart (int_mode_for_mode (GET_MODE (x
)).require (), x
);
1515 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1517 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1520 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1523 /* Return true if MODE is an SVE predicate mode. */
1525 aarch64_sve_pred_mode_p (machine_mode mode
)
1528 && (mode
== VNx16BImode
1529 || mode
== VNx8BImode
1530 || mode
== VNx4BImode
1531 || mode
== VNx2BImode
));
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1545 /* Return a set of flags describing the vector properties of mode MODE.
1546 Ignore modes that are not supported by the current target. */
1548 aarch64_classify_vector_mode (machine_mode mode
)
1550 if (aarch64_advsimd_struct_mode_p (mode
))
1551 return VEC_ADVSIMD
| VEC_STRUCT
;
1553 if (aarch64_sve_pred_mode_p (mode
))
1554 return VEC_SVE_PRED
;
1556 /* Make the decision based on the mode's enum value rather than its
1557 properties, so that we keep the correct classification regardless
1558 of -msve-vector-bits. */
1561 /* Single SVE vectors. */
1569 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1571 /* x2 SVE vectors. */
1579 /* x3 SVE vectors. */
1587 /* x4 SVE vectors. */
1595 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1597 /* 64-bit Advanced SIMD vectors. */
1601 /* ...E_V1DImode doesn't exist. */
1605 /* 128-bit Advanced SIMD vectors. */
1613 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1620 /* Return true if MODE is any of the data vector modes, including
1623 aarch64_vector_data_mode_p (machine_mode mode
)
1625 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1628 /* Return true if MODE is any form of SVE mode, including predicates,
1629 vectors and structures. */
1631 aarch64_sve_mode_p (machine_mode mode
)
1633 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
1636 /* Return true if MODE is an SVE data vector mode; either a single vector
1637 or a structure of vectors. */
1639 aarch64_sve_data_mode_p (machine_mode mode
)
1641 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1644 /* Implement target hook TARGET_ARRAY_MODE. */
1645 static opt_machine_mode
1646 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1648 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1649 && IN_RANGE (nelems
, 2, 4))
1650 return mode_for_vector (GET_MODE_INNER (mode
),
1651 GET_MODE_NUNITS (mode
) * nelems
);
1653 return opt_machine_mode ();
1656 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1658 aarch64_array_mode_supported_p (machine_mode mode
,
1659 unsigned HOST_WIDE_INT nelems
)
1662 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1663 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1664 && (nelems
>= 2 && nelems
<= 4))
1670 /* Return the SVE predicate mode to use for elements that have
1671 ELEM_NBYTES bytes, if such a mode exists. */
1674 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1678 if (elem_nbytes
== 1)
1680 if (elem_nbytes
== 2)
1682 if (elem_nbytes
== 4)
1684 if (elem_nbytes
== 8)
1687 return opt_machine_mode ();
1690 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1692 static opt_machine_mode
1693 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1695 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1697 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1698 machine_mode pred_mode
;
1699 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1703 return default_get_mask_mode (nunits
, nbytes
);
1706 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1708 static opt_machine_mode
1709 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1711 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
1712 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
1714 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1715 if (inner_mode
== GET_MODE_INNER (mode
)
1716 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1717 && aarch64_sve_data_mode_p (mode
))
1719 return opt_machine_mode ();
1722 /* Return the integer element mode associated with SVE mode MODE. */
1724 static scalar_int_mode
1725 aarch64_sve_element_int_mode (machine_mode mode
)
1727 unsigned int elt_bits
= vector_element_size (BITS_PER_SVE_VECTOR
,
1728 GET_MODE_NUNITS (mode
));
1729 return int_mode_for_size (elt_bits
, 0).require ();
1732 /* Return the integer vector mode associated with SVE mode MODE.
1733 Unlike mode_for_int_vector, this can handle the case in which
1734 MODE is a predicate (and thus has a different total size). */
1737 aarch64_sve_int_mode (machine_mode mode
)
1739 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
1740 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
1743 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1744 prefer to use the first arithmetic operand as the else value if
1745 the else value doesn't matter, since that exactly matches the SVE
1746 destructive merging form. For ternary operations we could either
1747 pick the first operand and use FMAD-like instructions or the last
1748 operand and use FMLA-like instructions; the latter seems more
1752 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1754 return nops
== 3 ? ops
[2] : ops
[0];
1757 /* Implement TARGET_HARD_REGNO_NREGS. */
1760 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1762 /* ??? Logically we should only need to provide a value when
1763 HARD_REGNO_MODE_OK says that the combination is valid,
1764 but at the moment we need to handle all modes. Just ignore
1765 any runtime parts for registers that can't store them. */
1766 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1767 switch (aarch64_regno_regclass (regno
))
1772 if (aarch64_sve_data_mode_p (mode
))
1773 return exact_div (GET_MODE_SIZE (mode
),
1774 BYTES_PER_SVE_VECTOR
).to_constant ();
1775 return CEIL (lowest_size
, UNITS_PER_VREG
);
1781 return CEIL (lowest_size
, UNITS_PER_WORD
);
1786 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1789 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1791 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1792 return regno
== CC_REGNUM
;
1794 if (regno
== VG_REGNUM
)
1795 /* This must have the same size as _Unwind_Word. */
1796 return mode
== DImode
;
1798 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1799 if (vec_flags
& VEC_SVE_PRED
)
1800 return PR_REGNUM_P (regno
);
1802 if (PR_REGNUM_P (regno
))
1805 if (regno
== SP_REGNUM
)
1806 /* The purpose of comparing with ptr_mode is to support the
1807 global register variable associated with the stack pointer
1808 register via the syntax of asm ("wsp") in ILP32. */
1809 return mode
== Pmode
|| mode
== ptr_mode
;
1811 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1812 return mode
== Pmode
;
1814 if (GP_REGNUM_P (regno
))
1816 if (known_le (GET_MODE_SIZE (mode
), 8))
1818 else if (known_le (GET_MODE_SIZE (mode
), 16))
1819 return (regno
& 1) == 0;
1821 else if (FP_REGNUM_P (regno
))
1823 if (vec_flags
& VEC_STRUCT
)
1824 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1826 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1832 /* Implement TARGET_FNTYPE_ABI. */
1834 static const predefined_function_abi
&
1835 aarch64_fntype_abi (const_tree fntype
)
1837 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)))
1838 return aarch64_simd_abi ();
1839 return default_function_abi
;
1842 /* Return true if this is a definition of a vectorized simd function. */
1845 aarch64_simd_decl_p (tree fndecl
)
1851 fntype
= TREE_TYPE (fndecl
);
1855 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1856 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1862 /* Return the mode a register save/restore should use. DImode for integer
1863 registers, DFmode for FP registers in non-SIMD functions (they only save
1864 the bottom half of a 128 bit register), or TFmode for FP registers in
/* NOTE(review): the extraction dropped interior lines of this function,
   including the return type and the first arm of the conditional
   (presumably E_DImode for GP registers) -- confirm against the original
   file before editing.  */
1868 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1870 return GP_REGNUM_P (regno
)
1872 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1875 /* Implement TARGET_INSN_CALLEE_ABI. */
1877 const predefined_function_abi
&
1878 aarch64_insn_callee_abi (const rtx_insn
*insn
)
1880 rtx pat
= PATTERN (insn
);
1881 gcc_assert (GET_CODE (pat
) == PARALLEL
);
1882 rtx unspec
= XVECEXP (pat
, 0, 1);
1883 gcc_assert (GET_CODE (unspec
) == UNSPEC
1884 && XINT (unspec
, 1) == UNSPEC_CALLEE_ABI
);
1885 return function_abis
[INTVAL (XVECEXP (unspec
, 0, 0))];
1888 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1889 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1890 clobbers the top 64 bits when restoring the bottom 64 bits. */
1893 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id
,
1897 if (FP_REGNUM_P (regno
))
1899 poly_int64 per_register_size
= GET_MODE_SIZE (mode
);
1900 unsigned int nregs
= hard_regno_nregs (regno
, mode
);
1902 per_register_size
= exact_div (per_register_size
, nregs
);
1903 if (abi_id
== ARM_PCS_SIMD
|| abi_id
== ARM_PCS_TLSDESC
)
1904 return maybe_gt (per_register_size
, 16);
1905 return maybe_gt (per_register_size
, 8);
1910 /* Implement REGMODE_NATURAL_SIZE. */
1912 aarch64_regmode_natural_size (machine_mode mode
)
1914 /* The natural size for SVE data modes is one SVE data vector,
1915 and similarly for predicates. We can't independently modify
1916 anything smaller than that. */
1917 /* ??? For now, only do this for variable-width SVE registers.
1918 Doing it for constant-sized registers breaks lower-subreg.c. */
1919 /* ??? And once that's fixed, we should probably have similar
1920 code for Advanced SIMD. */
1921 if (!aarch64_sve_vg
.is_constant ())
1923 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1924 if (vec_flags
& VEC_SVE_PRED
)
1925 return BYTES_PER_SVE_PRED
;
1926 if (vec_flags
& VEC_SVE_DATA
)
1927 return BYTES_PER_SVE_VECTOR
;
1929 return UNITS_PER_WORD
;
1932 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1934 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1937 /* The predicate mode determines which bits are significant and
1938 which are "don't care". Decreasing the number of lanes would
1939 lose data while increasing the number of lanes would make bits
1940 unnecessarily significant. */
1941 if (PR_REGNUM_P (regno
))
1943 if (known_ge (GET_MODE_SIZE (mode
), 4))
1949 /* Return true if I's bits are consecutive ones from the MSB. */
1951 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1953 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1956 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1957 that strcpy from constants will be faster. */
1959 static HOST_WIDE_INT
1960 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1962 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1963 return MAX (align
, BITS_PER_WORD
);
1967 /* Return true if calls to DECL should be treated as
1968 long-calls (ie called via a register). */
1970 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1975 /* Return true if calls to symbol-ref SYM should be treated as
1976 long-calls (ie called via a register). */
1978 aarch64_is_long_call_p (rtx sym
)
1980 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1983 /* Return true if calls to symbol-ref SYM should not go through
1987 aarch64_is_noplt_call_p (rtx sym
)
1989 const_tree decl
= SYMBOL_REF_DECL (sym
);
1994 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1995 && !targetm
.binds_local_p (decl
))
2001 /* Return true if the offsets to a zero/sign-extract operation
2002 represent an expression that matches an extend operation. The
2003 operands represent the paramters from
2005 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2007 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
2010 HOST_WIDE_INT mult_val
, extract_val
;
2012 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
2015 mult_val
= INTVAL (mult_imm
);
2016 extract_val
= INTVAL (extract_imm
);
2019 && extract_val
< GET_MODE_BITSIZE (mode
)
2020 && exact_log2 (extract_val
& ~7) > 0
2021 && (extract_val
& 7) <= 4
2022 && mult_val
== (1 << (extract_val
& 7)))
2028 /* Emit an insn that's a simple single-set. Both the operands must be
2029 known to be valid. */
2030 inline static rtx_insn
*
2031 emit_set_insn (rtx x
, rtx y
)
2033 return emit_insn (gen_rtx_SET (x
, y
));
2036 /* X and Y are two things to compare using CODE. Emit the compare insn and
2037 return the rtx for register 0 in the proper mode. */
2039 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
2041 machine_mode cmp_mode
= GET_MODE (x
);
2042 machine_mode cc_mode
;
2045 if (cmp_mode
== TImode
)
2047 gcc_assert (code
== NE
);
2050 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2052 rtx x_lo
= operand_subword (x
, 0, 0, TImode
);
2053 rtx y_lo
= operand_subword (y
, 0, 0, TImode
);
2054 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x_lo
, y_lo
));
2056 rtx x_hi
= operand_subword (x
, 1, 0, TImode
);
2057 rtx y_hi
= operand_subword (y
, 1, 0, TImode
);
2058 emit_insn (gen_ccmpdi (cc_reg
, cc_reg
, x_hi
, y_hi
,
2059 gen_rtx_EQ (cc_mode
, cc_reg
, const0_rtx
),
2060 GEN_INT (AARCH64_EQ
)));
2064 cc_mode
= SELECT_CC_MODE (code
, x
, y
);
2065 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2066 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x
, y
));
2071 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2074 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2075 machine_mode y_mode
)
2077 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2079 if (CONST_INT_P (y
))
2080 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2084 machine_mode cc_mode
;
2086 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2087 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2088 cc_mode
= CC_SWPmode
;
2089 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2090 emit_set_insn (cc_reg
, t
);
2095 if (!aarch64_plus_operand (y
, y_mode
))
2096 y
= force_reg (y_mode
, y
);
2098 return aarch64_gen_compare_reg (code
, x
, y
);
2101 /* Build the SYMBOL_REF for __tls_get_addr. */
2103 static GTY(()) rtx tls_get_addr_libfunc
;
2106 aarch64_tls_get_addr (void)
2108 if (!tls_get_addr_libfunc
)
2109 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2110 return tls_get_addr_libfunc
;
2113 /* Return the TLS model to use for ADDR. */
2115 static enum tls_model
2116 tls_symbolic_operand_type (rtx addr
)
2118 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2119 if (GET_CODE (addr
) == CONST
)
2122 rtx sym
= strip_offset (addr
, &addend
);
2123 if (GET_CODE (sym
) == SYMBOL_REF
)
2124 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2126 else if (GET_CODE (addr
) == SYMBOL_REF
)
2127 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2132 /* We'll allow lo_sum's in addresses in our legitimate addresses
2133 so that combine would take care of combining addresses where
2134 necessary, but for generation purposes, we'll generate the address
2137 tmp = hi (symbol_ref); adrp x1, foo
2138 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2142 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2143 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2147 Load TLS symbol, depending on TLS mechanism and TLS access model.
2149 Global Dynamic - Traditional TLS:
2150 adrp tmp, :tlsgd:imm
2151 add dest, tmp, #:tlsgd_lo12:imm
2154 Global Dynamic - TLS Descriptors:
2155 adrp dest, :tlsdesc:imm
2156 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2157 add dest, dest, #:tlsdesc_lo12:imm
2164 adrp tmp, :gottprel:imm
2165 ldr dest, [tmp, #:gottprel_lo12:imm]
2170 add t0, tp, #:tprel_hi12:imm, lsl #12
2171 add t0, t0, #:tprel_lo12_nc:imm
2175 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2176 enum aarch64_symbol_type type
)
2180 case SYMBOL_SMALL_ABSOLUTE
:
2182 /* In ILP32, the mode of dest can be either SImode or DImode. */
2184 machine_mode mode
= GET_MODE (dest
);
2186 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2188 if (can_create_pseudo_p ())
2189 tmp_reg
= gen_reg_rtx (mode
);
2191 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2192 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2196 case SYMBOL_TINY_ABSOLUTE
:
2197 emit_insn (gen_rtx_SET (dest
, imm
));
2200 case SYMBOL_SMALL_GOT_28K
:
2202 machine_mode mode
= GET_MODE (dest
);
2203 rtx gp_rtx
= pic_offset_table_rtx
;
2207 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2208 here before rtl expand. Tree IVOPT will generate rtl pattern to
2209 decide rtx costs, in which case pic_offset_table_rtx is not
2210 initialized. For that case no need to generate the first adrp
2211 instruction as the final cost for global variable access is
2215 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2216 using the page base as GOT base, the first page may be wasted,
2217 in the worst scenario, there is only 28K space for GOT).
2219 The generate instruction sequence for accessing global variable
2222 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2224 Only one instruction needed. But we must initialize
2225 pic_offset_table_rtx properly. We generate initialize insn for
2226 every global access, and allow CSE to remove all redundant.
2228 The final instruction sequences will look like the following
2229 for multiply global variables access.
2231 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2233 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2234 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2235 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2238 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2239 crtl
->uses_pic_offset_table
= 1;
2240 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2242 if (mode
!= GET_MODE (gp_rtx
))
2243 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2247 if (mode
== ptr_mode
)
2250 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2252 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2254 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2258 gcc_assert (mode
== Pmode
);
2260 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2261 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2264 /* The operand is expected to be MEM. Whenever the related insn
2265 pattern changed, above code which calculate mem should be
2267 gcc_assert (GET_CODE (mem
) == MEM
);
2268 MEM_READONLY_P (mem
) = 1;
2269 MEM_NOTRAP_P (mem
) = 1;
2274 case SYMBOL_SMALL_GOT_4G
:
2276 /* In ILP32, the mode of dest can be either SImode or DImode,
2277 while the got entry is always of SImode size. The mode of
2278 dest depends on how dest is used: if dest is assigned to a
2279 pointer (e.g. in the memory), it has SImode; it may have
2280 DImode if dest is dereferenced to access the memeory.
2281 This is why we have to handle three different ldr_got_small
2282 patterns here (two patterns for ILP32). */
2287 machine_mode mode
= GET_MODE (dest
);
2289 if (can_create_pseudo_p ())
2290 tmp_reg
= gen_reg_rtx (mode
);
2292 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2293 if (mode
== ptr_mode
)
2296 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2298 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2300 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2304 gcc_assert (mode
== Pmode
);
2306 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2307 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2310 gcc_assert (GET_CODE (mem
) == MEM
);
2311 MEM_READONLY_P (mem
) = 1;
2312 MEM_NOTRAP_P (mem
) = 1;
2317 case SYMBOL_SMALL_TLSGD
:
2320 machine_mode mode
= GET_MODE (dest
);
2321 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2325 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2327 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2328 insns
= get_insns ();
2331 RTL_CONST_CALL_P (insns
) = 1;
2332 emit_libcall_block (insns
, dest
, result
, imm
);
2336 case SYMBOL_SMALL_TLSDESC
:
2338 machine_mode mode
= GET_MODE (dest
);
2339 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2342 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2344 /* In ILP32, the got entry is always of SImode size. Unlike
2345 small GOT, the dest is fixed at reg 0. */
2347 emit_insn (gen_tlsdesc_small_si (imm
));
2349 emit_insn (gen_tlsdesc_small_di (imm
));
2350 tp
= aarch64_load_tp (NULL
);
2353 tp
= gen_lowpart (mode
, tp
);
2355 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2357 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2361 case SYMBOL_SMALL_TLSIE
:
2363 /* In ILP32, the mode of dest can be either SImode or DImode,
2364 while the got entry is always of SImode size. The mode of
2365 dest depends on how dest is used: if dest is assigned to a
2366 pointer (e.g. in the memory), it has SImode; it may have
2367 DImode if dest is dereferenced to access the memeory.
2368 This is why we have to handle three different tlsie_small
2369 patterns here (two patterns for ILP32). */
2370 machine_mode mode
= GET_MODE (dest
);
2371 rtx tmp_reg
= gen_reg_rtx (mode
);
2372 rtx tp
= aarch64_load_tp (NULL
);
2374 if (mode
== ptr_mode
)
2377 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2380 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2381 tp
= gen_lowpart (mode
, tp
);
2386 gcc_assert (mode
== Pmode
);
2387 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2390 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2392 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2396 case SYMBOL_TLSLE12
:
2397 case SYMBOL_TLSLE24
:
2398 case SYMBOL_TLSLE32
:
2399 case SYMBOL_TLSLE48
:
2401 machine_mode mode
= GET_MODE (dest
);
2402 rtx tp
= aarch64_load_tp (NULL
);
2405 tp
= gen_lowpart (mode
, tp
);
2409 case SYMBOL_TLSLE12
:
2410 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2413 case SYMBOL_TLSLE24
:
2414 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2417 case SYMBOL_TLSLE32
:
2418 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2420 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2423 case SYMBOL_TLSLE48
:
2424 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2426 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2434 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2438 case SYMBOL_TINY_GOT
:
2439 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2442 case SYMBOL_TINY_TLSIE
:
2444 machine_mode mode
= GET_MODE (dest
);
2445 rtx tp
= aarch64_load_tp (NULL
);
2447 if (mode
== ptr_mode
)
2450 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2453 tp
= gen_lowpart (mode
, tp
);
2454 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2459 gcc_assert (mode
== Pmode
);
2460 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2464 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2473 /* Emit a move from SRC to DEST. Assume that the move expanders can
2474 handle all moves if !can_create_pseudo_p (). The distinction is
2475 important because, unlike emit_move_insn, the move expanders know
2476 how to force Pmode objects into the constant pool even when the
2477 constant pool address is not itself legitimate. */
2479 aarch64_emit_move (rtx dest
, rtx src
)
2481 return (can_create_pseudo_p ()
2482 ? emit_move_insn (dest
, src
)
2483 : emit_move_insn_1 (dest
, src
));
2486 /* Apply UNOPTAB to OP and store the result in DEST. */
2489 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
2491 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2493 emit_move_insn (dest
, tmp
);
2496 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2499 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
2501 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2504 emit_move_insn (dest
, tmp
);
2507 /* Split a 128-bit move operation into two 64-bit move operations,
2508 taking care to handle partial overlap of register to register
2509 copies. Special cases are needed when moving between GP regs and
2510 FP regs. SRC can be a register, constant or memory; DST a register
2511 or memory. If either operand is memory it must not have any side
2514 aarch64_split_128bit_move (rtx dst
, rtx src
)
2519 machine_mode mode
= GET_MODE (dst
);
2521 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2522 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2523 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2525 if (REG_P (dst
) && REG_P (src
))
2527 int src_regno
= REGNO (src
);
2528 int dst_regno
= REGNO (dst
);
2530 /* Handle FP <-> GP regs. */
2531 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2533 src_lo
= gen_lowpart (word_mode
, src
);
2534 src_hi
= gen_highpart (word_mode
, src
);
2536 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2537 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
2540 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2542 dst_lo
= gen_lowpart (word_mode
, dst
);
2543 dst_hi
= gen_highpart (word_mode
, dst
);
2545 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2546 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
2551 dst_lo
= gen_lowpart (word_mode
, dst
);
2552 dst_hi
= gen_highpart (word_mode
, dst
);
2553 src_lo
= gen_lowpart (word_mode
, src
);
2554 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2556 /* At most one pairing may overlap. */
2557 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2559 aarch64_emit_move (dst_hi
, src_hi
);
2560 aarch64_emit_move (dst_lo
, src_lo
);
2564 aarch64_emit_move (dst_lo
, src_lo
);
2565 aarch64_emit_move (dst_hi
, src_hi
);
2570 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2572 return (! REG_P (src
)
2573 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2576 /* Split a complex SIMD combine. */
2579 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2581 machine_mode src_mode
= GET_MODE (src1
);
2582 machine_mode dst_mode
= GET_MODE (dst
);
2584 gcc_assert (VECTOR_MODE_P (dst_mode
));
2585 gcc_assert (register_operand (dst
, dst_mode
)
2586 && register_operand (src1
, src_mode
)
2587 && register_operand (src2
, src_mode
));
2589 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2593 /* Split a complex SIMD move. */
2596 aarch64_split_simd_move (rtx dst
, rtx src
)
2598 machine_mode src_mode
= GET_MODE (src
);
2599 machine_mode dst_mode
= GET_MODE (dst
);
2601 gcc_assert (VECTOR_MODE_P (dst_mode
));
2603 if (REG_P (dst
) && REG_P (src
))
2605 gcc_assert (VECTOR_MODE_P (src_mode
));
2606 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2611 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2612 machine_mode ymode
, rtx y
)
2614 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2615 gcc_assert (r
!= NULL
);
2616 return rtx_equal_p (x
, r
);
2619 /* Return TARGET if it is nonnull and a register of mode MODE.
2620 Otherwise, return a fresh register of mode MODE if we can,
2621 or TARGET reinterpreted as MODE if we can't. */
2624 aarch64_target_reg (rtx target
, machine_mode mode
)
2626 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
2628 if (!can_create_pseudo_p ())
2630 gcc_assert (target
);
2631 return gen_lowpart (mode
, target
);
2633 return gen_reg_rtx (mode
);
2636 /* Return a register that contains the constant in BUILDER, given that
2637 the constant is a legitimate move operand. Use TARGET as the register
2638 if it is nonnull and convenient. */
2641 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
2643 rtx src
= builder
.build ();
2644 target
= aarch64_target_reg (target
, GET_MODE (src
));
2645 emit_insn (gen_rtx_SET (target
, src
));
2650 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2652 if (can_create_pseudo_p ())
2653 return force_reg (mode
, value
);
2657 aarch64_emit_move (x
, value
);
2662 /* Return true if predicate value X is a constant in which every element
2663 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2664 value, i.e. as a predicate in which all bits are significant. */
2667 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
2669 if (GET_CODE (x
) != CONST_VECTOR
)
2672 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
2673 GET_MODE_NUNITS (GET_MODE (x
)));
2674 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
2675 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
2676 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
2678 unsigned int nelts
= const_vector_encoded_nelts (x
);
2679 for (unsigned int i
= 0; i
< nelts
; ++i
)
2681 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
2682 if (!CONST_INT_P (elt
))
2685 builder
.quick_push (elt
);
2686 for (unsigned int j
= 1; j
< factor
; ++j
)
2687 builder
.quick_push (const0_rtx
);
2689 builder
.finalize ();
2693 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2694 widest predicate element size it can have (that is, the largest size
2695 for which each element would still be 0 or 1). */
2698 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
2700 /* Start with the most optimistic assumption: that we only need
2701 one bit per pattern. This is what we will use if only the first
2702 bit in each pattern is ever set. */
2703 unsigned int mask
= GET_MODE_SIZE (DImode
);
2704 mask
|= builder
.npatterns ();
2706 /* Look for set bits. */
2707 unsigned int nelts
= builder
.encoded_nelts ();
2708 for (unsigned int i
= 1; i
< nelts
; ++i
)
2709 if (INTVAL (builder
.elt (i
)) != 0)
2715 return mask
& -mask
;
2718 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2719 that the constant would have with predicate element size ELT_SIZE
2720 (ignoring the upper bits in each element) and return:
2722 * -1 if all bits are set
2723 * N if the predicate has N leading set bits followed by all clear bits
2724 * 0 if the predicate does not have any of these forms. */
2727 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
2728 unsigned int elt_size
)
2730 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2731 followed by set bits. */
2732 if (builder
.nelts_per_pattern () == 3)
2735 /* Skip over leading set bits. */
2736 unsigned int nelts
= builder
.encoded_nelts ();
2738 for (; i
< nelts
; i
+= elt_size
)
2739 if (INTVAL (builder
.elt (i
)) == 0)
2741 unsigned int vl
= i
/ elt_size
;
2743 /* Check for the all-true case. */
2747 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2748 repeating pattern of set bits followed by clear bits. */
2749 if (builder
.nelts_per_pattern () != 2)
2752 /* We have a "foreground" value and a duplicated "background" value.
2753 If the background might repeat and the last set bit belongs to it,
2754 we might have set bits followed by clear bits followed by set bits. */
2755 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
2758 /* Make sure that the rest are all clear. */
2759 for (; i
< nelts
; i
+= elt_size
)
2760 if (INTVAL (builder
.elt (i
)) != 0)
2766 /* See if there is an svpattern that encodes an SVE predicate of mode
2767 PRED_MODE in which the first VL bits are set and the rest are clear.
2768 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2769 A VL of -1 indicates an all-true vector. */
2772 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
2775 return AARCH64_SV_ALL
;
2777 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
2778 return AARCH64_NUM_SVPATTERNS
;
2780 if (vl
>= 1 && vl
<= 8)
2781 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
2783 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
2784 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
2787 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
2789 if (vl
== (max_vl
/ 3) * 3)
2790 return AARCH64_SV_MUL3
;
2791 /* These would only trigger for non-power-of-2 lengths. */
2792 if (vl
== (max_vl
& -4))
2793 return AARCH64_SV_MUL4
;
2794 if (vl
== (1 << floor_log2 (max_vl
)))
2795 return AARCH64_SV_POW2
;
2797 return AARCH64_SV_ALL
;
2799 return AARCH64_NUM_SVPATTERNS
;
2802 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2803 bits has the lowest bit set and the upper bits clear. This is the
2804 VNx16BImode equivalent of a PTRUE for controlling elements of
2805 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2806 all bits are significant, even the upper zeros. */
2809 aarch64_ptrue_all (unsigned int elt_size
)
2811 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
2812 builder
.quick_push (const1_rtx
);
2813 for (unsigned int i
= 1; i
< elt_size
; ++i
)
2814 builder
.quick_push (const0_rtx
);
2815 return builder
.build ();
2818 /* Return an all-true predicate register of mode MODE. */
2821 aarch64_ptrue_reg (machine_mode mode
)
2823 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2824 rtx reg
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
2825 return gen_lowpart (mode
, reg
);
2828 /* Return an all-false predicate register of mode MODE. */
2831 aarch64_pfalse_reg (machine_mode mode
)
2833 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2834 rtx reg
= force_reg (VNx16BImode
, CONST0_RTX (VNx16BImode
));
2835 return gen_lowpart (mode
, reg
);
2838 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2839 true, or alternatively if we know that the operation predicated by
2840 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2841 aarch64_sve_gp_strictness operand that describes the operation
2842 predicated by PRED1[0]. */
2845 aarch64_sve_pred_dominates_p (rtx
*pred1
, rtx pred2
)
2847 machine_mode mode
= GET_MODE (pred2
);
2848 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2849 && mode
== GET_MODE (pred1
[0])
2850 && aarch64_sve_gp_strictness (pred1
[1], SImode
));
2851 return (pred1
[0] == CONSTM1_RTX (mode
)
2852 || INTVAL (pred1
[1]) == SVE_RELAXED_GP
2853 || rtx_equal_p (pred1
[0], pred2
));
2856 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2857 for it. PRED2[0] is the predicate for the instruction whose result
2858 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2859 for it. Return true if we can prove that the two predicates are
2860 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2861 with PRED1[0] without changing behavior. */
2864 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
2866 machine_mode mode
= GET_MODE (pred1
[0]);
2867 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2868 && mode
== GET_MODE (pred2
[0])
2869 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
2870 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
2872 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
2873 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
2874 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
2875 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
2876 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
2879 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2880 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2881 Use TARGET as the target register if nonnull and convenient. */
2884 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
2885 machine_mode data_mode
, rtx op1
, rtx op2
)
2887 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
2888 expand_operand ops
[5];
2889 create_output_operand (&ops
[0], target
, pred_mode
);
2890 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
2891 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
2892 create_input_operand (&ops
[3], op1
, data_mode
);
2893 create_input_operand (&ops
[4], op2
, data_mode
);
2894 expand_insn (icode
, 5, ops
);
2895 return ops
[0].value
;
2898 /* Use a comparison to convert integer vector SRC into MODE, which is
2899 the corresponding SVE predicate mode. Use TARGET for the result
2900 if it's nonnull and convenient. */
2903 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
2905 machine_mode src_mode
= GET_MODE (src
);
2906 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
2907 src
, CONST0_RTX (src_mode
));
2910 /* Return true if we can move VALUE into a register using a single
2911 CNT[BHWD] instruction. */
2914 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2916 HOST_WIDE_INT factor
= value
.coeffs
[0];
2917 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2918 return (value
.coeffs
[1] == factor
2919 && IN_RANGE (factor
, 2, 16 * 16)
2920 && (factor
& 1) == 0
2921 && factor
<= 16 * (factor
& -factor
));
2924 /* Likewise for rtx X. */
2927 aarch64_sve_cnt_immediate_p (rtx x
)
2930 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2933 /* Return the asm string for an instruction with a CNT-like vector size
2934 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2935 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2936 first part of the operands template (the part that comes before the
2937 vector size itself). PATTERN is the pattern to use. FACTOR is the
2938 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2939 in each quadword. If it is zero, we can use any element size. */
2942 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2943 aarch64_svpattern pattern
,
2944 unsigned int factor
,
2945 unsigned int nelts_per_vq
)
2947 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2949 if (nelts_per_vq
== 0)
2950 /* There is some overlap in the ranges of the four CNT instructions.
2951 Here we always use the smallest possible element size, so that the
2952 multiplier is 1 whereever possible. */
2953 nelts_per_vq
= factor
& -factor
;
2954 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2955 gcc_assert (IN_RANGE (shift
, 1, 4));
2956 char suffix
= "dwhb"[shift
- 1];
2959 unsigned int written
;
2960 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
2961 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2962 prefix
, suffix
, operands
);
2963 else if (factor
== 1)
2964 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
2965 prefix
, suffix
, operands
, svpattern_token (pattern
));
2967 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
2968 prefix
, suffix
, operands
, svpattern_token (pattern
),
2970 gcc_assert (written
< sizeof (buffer
));
2974 /* Return the asm string for an instruction with a CNT-like vector size
2975 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2976 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2977 first part of the operands template (the part that comes before the
2978 vector size itself). X is the value of the vector size operand,
2979 as a polynomial integer rtx; we need to convert this into an "all"
2980 pattern with a multiplier. */
2983 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2986 poly_int64 value
= rtx_to_poly_int64 (x
);
2987 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2988 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
2989 value
.coeffs
[1], 0);
2992 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2995 aarch64_sve_scalar_inc_dec_immediate_p (rtx x
)
2998 return (poly_int_rtx_p (x
, &value
)
2999 && (aarch64_sve_cnt_immediate_p (value
)
3000 || aarch64_sve_cnt_immediate_p (-value
)));
3003 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3007 aarch64_output_sve_scalar_inc_dec (rtx offset
)
3009 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3010 gcc_assert (offset_value
.coeffs
[0] == offset_value
.coeffs
[1]);
3011 if (offset_value
.coeffs
[1] > 0)
3012 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL
,
3013 offset_value
.coeffs
[1], 0);
3015 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL
,
3016 -offset_value
.coeffs
[1], 0);
3019 /* Return true if we can add VALUE to a register using a single ADDVL
3020 or ADDPL instruction. */
3023 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
3025 HOST_WIDE_INT factor
= value
.coeffs
[0];
3026 if (factor
== 0 || value
.coeffs
[1] != factor
)
3028 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3029 and a value of 16 is one vector width. */
3030 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
3031 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
3034 /* Likewise for rtx X. */
3037 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
3040 return (poly_int_rtx_p (x
, &value
)
3041 && aarch64_sve_addvl_addpl_immediate_p (value
));
3044 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3045 to operand 1 and storing the result in operand 0. */
3048 aarch64_output_sve_addvl_addpl (rtx offset
)
3050 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3051 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3052 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
3054 int factor
= offset_value
.coeffs
[1];
3055 if ((factor
& 15) == 0)
3056 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
3058 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
3062 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3063 instruction. If it is, store the number of elements in each vector
3064 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3065 factor in *FACTOR_OUT (if nonnull). */
3068 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
3069 unsigned int *nelts_per_vq_out
)
3074 if (!const_vec_duplicate_p (x
, &elt
)
3075 || !poly_int_rtx_p (elt
, &value
))
3078 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
3079 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
3080 /* There's no vector INCB. */
3083 HOST_WIDE_INT factor
= value
.coeffs
[0];
3084 if (value
.coeffs
[1] != factor
)
3087 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3088 if ((factor
% nelts_per_vq
) != 0
3089 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
3093 *factor_out
= factor
;
3094 if (nelts_per_vq_out
)
3095 *nelts_per_vq_out
= nelts_per_vq
;
3099 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3103 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3105 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3108 /* Return the asm template for an SVE vector INC or DEC instruction.
3109 OPERANDS gives the operands before the vector count and X is the
3110 value of the vector count operand itself. */
3113 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
3116 unsigned int nelts_per_vq
;
3117 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3120 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3121 -factor
, nelts_per_vq
);
3123 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3124 factor
, nelts_per_vq
);
3128 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3129 scalar_int_mode mode
)
3132 unsigned HOST_WIDE_INT val
, val2
, mask
;
3133 int one_match
, zero_match
;
3138 if (aarch64_move_imm (val
, mode
))
3141 emit_insn (gen_rtx_SET (dest
, imm
));
3145 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3146 (with XXXX non-zero). In that case check to see if the move can be done in
3148 val2
= val
& 0xffffffff;
3150 && aarch64_move_imm (val2
, SImode
)
3151 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3154 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3156 /* Check if we have to emit a second instruction by checking to see
3157 if any of the upper 32 bits of the original DI mode value is set. */
3161 i
= (val
>> 48) ? 48 : 32;
3164 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3165 GEN_INT ((val
>> i
) & 0xffff)));
3170 if ((val
>> 32) == 0 || mode
== SImode
)
3174 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3176 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3177 GEN_INT ((val
>> 16) & 0xffff)));
3179 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3180 GEN_INT ((val
>> 16) & 0xffff)));
3185 /* Remaining cases are all for DImode. */
3188 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3189 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3190 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3191 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3193 if (zero_match
!= 2 && one_match
!= 2)
3195 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3196 For a 64-bit bitmask try whether changing 16 bits to all ones or
3197 zeroes creates a valid bitmask. To check any repeated bitmask,
3198 try using 16 bits from the other 32-bit half of val. */
3200 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3203 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3206 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3208 val2
= val2
& ~mask
;
3209 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3210 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3217 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3218 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3219 GEN_INT ((val
>> i
) & 0xffff)));
3225 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3226 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3227 otherwise skip zero bits. */
3231 val2
= one_match
> zero_match
? ~val
: val
;
3232 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3235 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3236 ? (val
| ~(mask
<< i
))
3237 : (val
& (mask
<< i
)))));
3238 for (i
+= 16; i
< 64; i
+= 16)
3240 if ((val2
& (mask
<< i
)) == 0)
3243 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3244 GEN_INT ((val
>> i
) & 0xffff)));
3251 /* Return whether imm is a 128-bit immediate which is simple enough to
3254 aarch64_mov128_immediate (rtx imm
)
3256 if (GET_CODE (imm
) == CONST_INT
)
3259 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
3261 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
3262 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
3264 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
3265 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
3269 /* Return the number of temporary registers that aarch64_add_offset_1
3270 would need to add OFFSET to a register. */
3273 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
3275 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
3278 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3279 a non-polynomial OFFSET. MODE is the mode of the addition.
3280 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3281 be set and CFA adjustments added to the generated instructions.
3283 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3284 temporary if register allocation is already complete. This temporary
3285 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3286 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3287 the immediate again.
3289 Since this function may be used to adjust the stack pointer, we must
3290 ensure that it cannot cause transient stack deallocation (for example
3291 by first incrementing SP and then decrementing when adjusting by a
3292 large immediate). */
3295 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3296 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3297 bool frame_related_p
, bool emit_move_imm
)
3299 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3300 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3302 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3307 if (!rtx_equal_p (dest
, src
))
3309 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3310 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3315 /* Single instruction adjustment. */
3316 if (aarch64_uimm12_shift (moffset
))
3318 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3319 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3323 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3326 a) the offset cannot be loaded by a 16-bit move or
3327 b) there is no spare register into which we can move it. */
3328 if (moffset
< 0x1000000
3329 && ((!temp1
&& !can_create_pseudo_p ())
3330 || !aarch64_move_imm (moffset
, mode
)))
3332 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3334 low_off
= offset
< 0 ? -low_off
: low_off
;
3335 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3336 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3337 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3338 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3342 /* Emit a move immediate if required and an addition/subtraction. */
3345 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3346 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3348 insn
= emit_insn (offset
< 0
3349 ? gen_sub3_insn (dest
, src
, temp1
)
3350 : gen_add3_insn (dest
, src
, temp1
));
3351 if (frame_related_p
)
3353 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3354 rtx adj
= plus_constant (mode
, src
, offset
);
3355 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3359 /* Return the number of temporary registers that aarch64_add_offset
3360 would need to move OFFSET into a register or add OFFSET to a register;
3361 ADD_P is true if we want the latter rather than the former. */
3364 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3366 /* This follows the same structure as aarch64_add_offset. */
3367 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3370 unsigned int count
= 0;
3371 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3372 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3373 poly_int64
poly_offset (factor
, factor
);
3374 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3375 /* Need one register for the ADDVL/ADDPL result. */
3377 else if (factor
!= 0)
3379 factor
= abs (factor
);
3380 if (factor
> 16 * (factor
& -factor
))
3381 /* Need one register for the CNT result and one for the multiplication
3382 factor. If necessary, the second temporary can be reused for the
3383 constant part of the offset. */
3385 /* Need one register for the CNT result (which might then
3389 return count
+ aarch64_add_offset_1_temporaries (constant
);
3392 /* If X can be represented as a poly_int64, return the number
3393 of temporaries that are required to add it to a register.
3394 Return -1 otherwise. */
3397 aarch64_add_offset_temporaries (rtx x
)
3400 if (!poly_int_rtx_p (x
, &offset
))
3402 return aarch64_offset_temporaries (true, offset
);
3405 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3406 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3407 be set and CFA adjustments added to the generated instructions.
3409 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3410 temporary if register allocation is already complete. This temporary
3411 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3412 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3413 false to avoid emitting the immediate again.
3415 TEMP2, if nonnull, is a second temporary register that doesn't
3416 overlap either DEST or REG.
3418 Since this function may be used to adjust the stack pointer, we must
3419 ensure that it cannot cause transient stack deallocation (for example
3420 by first incrementing SP and then decrementing when adjusting by a
3421 large immediate). */
3424 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3425 poly_int64 offset
, rtx temp1
, rtx temp2
,
3426 bool frame_related_p
, bool emit_move_imm
= true)
3428 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3429 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3430 gcc_assert (temp1
== NULL_RTX
3432 || !reg_overlap_mentioned_p (temp1
, dest
));
3433 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3435 /* Try using ADDVL or ADDPL to add the whole value. */
3436 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3438 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3439 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3440 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3444 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3445 SVE vector register, over and above the minimum size of 128 bits.
3446 This is equivalent to half the value returned by CNTD with a
3447 vector shape of ALL. */
3448 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3449 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3451 /* Try using ADDVL or ADDPL to add the VG-based part. */
3452 poly_int64
poly_offset (factor
, factor
);
3453 if (src
!= const0_rtx
3454 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3456 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3457 if (frame_related_p
)
3459 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3460 RTX_FRAME_RELATED_P (insn
) = true;
3465 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3466 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3471 /* Otherwise use a CNT-based sequence. */
3472 else if (factor
!= 0)
3474 /* Use a subtraction if we have a negative factor. */
3475 rtx_code code
= PLUS
;
3482 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3483 into the multiplication. */
3487 /* Use a right shift by 1. */
3491 HOST_WIDE_INT low_bit
= factor
& -factor
;
3492 if (factor
<= 16 * low_bit
)
3494 if (factor
> 16 * 8)
3496 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3497 the value with the minimum multiplier and shift it into
3499 int extra_shift
= exact_log2 (low_bit
);
3500 shift
+= extra_shift
;
3501 factor
>>= extra_shift
;
3503 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3507 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3508 directly, since that should increase the chances of being
3509 able to use a shift and add sequence. If LOW_BIT itself
3510 is out of range, just use CNTD. */
3511 if (low_bit
<= 16 * 8)
3516 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3517 val
= aarch64_force_temporary (mode
, temp1
, val
);
3519 if (can_create_pseudo_p ())
3521 rtx coeff1
= gen_int_mode (factor
, mode
);
3522 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3526 /* Go back to using a negative multiplication factor if we have
3527 no register from which to subtract. */
3528 if (code
== MINUS
&& src
== const0_rtx
)
3533 rtx coeff1
= gen_int_mode (factor
, mode
);
3534 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3535 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3541 /* Multiply by 1 << SHIFT. */
3542 val
= aarch64_force_temporary (mode
, temp1
, val
);
3543 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3545 else if (shift
== -1)
3548 val
= aarch64_force_temporary (mode
, temp1
, val
);
3549 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3552 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3553 if (src
!= const0_rtx
)
3555 val
= aarch64_force_temporary (mode
, temp1
, val
);
3556 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3558 else if (code
== MINUS
)
3560 val
= aarch64_force_temporary (mode
, temp1
, val
);
3561 val
= gen_rtx_NEG (mode
, val
);
3564 if (constant
== 0 || frame_related_p
)
3566 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3567 if (frame_related_p
)
3569 RTX_FRAME_RELATED_P (insn
) = true;
3570 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3571 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3580 src
= aarch64_force_temporary (mode
, temp1
, val
);
3585 emit_move_imm
= true;
3588 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3589 frame_related_p
, emit_move_imm
);
3592 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3593 than a poly_int64. */
3596 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3597 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3599 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3600 temp1
, temp2
, false);
3603 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3604 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3605 if TEMP1 already contains abs (DELTA). */
3608 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3610 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3611 temp1
, temp2
, true, emit_move_imm
);
3614 /* Subtract DELTA from the stack pointer, marking the instructions
3615 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3619 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3620 bool emit_move_imm
= true)
3622 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3623 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3626 /* Set DEST to (vec_series BASE STEP). */
3629 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3631 machine_mode mode
= GET_MODE (dest
);
3632 scalar_mode inner
= GET_MODE_INNER (mode
);
3634 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3635 if (!aarch64_sve_index_immediate_p (base
))
3636 base
= force_reg (inner
, base
);
3637 if (!aarch64_sve_index_immediate_p (step
))
3638 step
= force_reg (inner
, step
);
3640 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3643 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3644 register of mode MODE. Use TARGET for the result if it's nonnull
3647 The two vector modes must have the same element mode. The behavior
3648 is to duplicate architectural lane N of SRC into architectural lanes
3649 N + I * STEP of the result. On big-endian targets, architectural
3650 lane 0 of an Advanced SIMD vector is the last element of the vector
3651 in memory layout, so for big-endian targets this operation has the
3652 effect of reversing SRC before duplicating it. Callers need to
3653 account for this. */
3656 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
3658 machine_mode src_mode
= GET_MODE (src
);
3659 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
3660 insn_code icode
= (BYTES_BIG_ENDIAN
3661 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
3662 : code_for_aarch64_vec_duplicate_vq_le (mode
));
3665 expand_operand ops
[3];
3666 create_output_operand (&ops
[i
++], target
, mode
);
3667 create_output_operand (&ops
[i
++], src
, src_mode
);
3668 if (BYTES_BIG_ENDIAN
)
3670 /* Create a PARALLEL describing the reversal of SRC. */
3671 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
3672 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
3673 nelts_per_vq
- 1, -1);
3674 create_fixed_operand (&ops
[i
++], sel
);
3676 expand_insn (icode
, i
, ops
);
3677 return ops
[0].value
;
3680 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3681 the memory image into DEST. Return true on success. */
3684 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
3686 src
= force_const_mem (GET_MODE (src
), src
);
3690 /* Make sure that the address is legitimate. */
3691 if (!aarch64_sve_ld1rq_operand_p (src
))
3693 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3694 src
= replace_equiv_address (src
, addr
);
3697 machine_mode mode
= GET_MODE (dest
);
3698 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3699 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3700 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
3701 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
3705 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3706 SVE data mode and isn't a legitimate constant. Use TARGET for the
3707 result if convenient.
3709 The returned register can have whatever mode seems most natural
3710 given the contents of SRC. */
3713 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
3715 machine_mode mode
= GET_MODE (src
);
3716 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3717 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3718 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
3719 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
3720 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* elt_bits
;
3722 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
3724 /* The constant is a duplicated quadword but can't be narrowed
3725 beyond a quadword. Get the memory image of the first quadword
3726 as a 128-bit vector and try using LD1RQ to load it from memory.
3728 The effect for both endiannesses is to load memory lane N into
3729 architectural lanes N + I * STEP of the result. On big-endian
3730 targets, the layout of the 128-bit vector in an Advanced SIMD
3731 register would be different from its layout in an SVE register,
3732 but this 128-bit vector is a memory value only. */
3733 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3734 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
3735 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
3739 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
3741 /* The vector is a repeating sequence of 64 bits or fewer.
3742 See if we can load them using an Advanced SIMD move and then
3743 duplicate it to fill a vector. This is better than using a GPR
3744 move because it keeps everything in the same register file. */
3745 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3746 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
3747 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3749 /* We want memory lane N to go into architectural lane N,
3750 so reverse for big-endian targets. The DUP .Q pattern
3751 has a compensating reverse built-in. */
3752 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
3753 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
3755 rtx vq_src
= builder
.build ();
3756 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
3758 vq_src
= force_reg (vq_mode
, vq_src
);
3759 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
3762 /* Get an integer representation of the repeating part of Advanced
3763 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3764 which for big-endian targets is lane-swapped wrt a normal
3765 Advanced SIMD vector. This means that for both endiannesses,
3766 memory lane N of SVE vector SRC corresponds to architectural
3767 lane N of a register holding VQ_SRC. This in turn means that
3768 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3769 as a single 128-bit value) and thus that memory lane 0 of SRC is
3770 in the lsb of the integer. Duplicating the integer therefore
3771 ensures that memory lane N of SRC goes into architectural lane
3772 N + I * INDEX of the SVE register. */
3773 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
3774 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
3777 /* Pretend that we had a vector of INT_MODE to start with. */
3778 elt_mode
= int_mode
;
3779 mode
= aarch64_full_sve_mode (int_mode
).require ();
3781 /* If the integer can be moved into a general register by a
3782 single instruction, do that and duplicate the result. */
3783 if (CONST_INT_P (elt_value
)
3784 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
3786 elt_value
= force_reg (elt_mode
, elt_value
);
3787 return expand_vector_broadcast (mode
, elt_value
);
3790 else if (npatterns
== 1)
3791 /* We're duplicating a single value, but can't do better than
3792 force it to memory and load from there. This handles things
3793 like symbolic constants. */
3794 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
3798 /* Load the element from memory if we can, otherwise move it into
3799 a register and use a DUP. */
3800 rtx op
= force_const_mem (elt_mode
, elt_value
);
3802 op
= force_reg (elt_mode
, elt_value
);
3803 return expand_vector_broadcast (mode
, op
);
3807 /* Try using INDEX. */
3809 if (const_vec_series_p (src
, &base
, &step
))
3811 aarch64_expand_vec_series (target
, base
, step
);
3815 /* From here on, it's better to force the whole constant to memory
3817 if (GET_MODE_NUNITS (mode
).is_constant ())
3820 /* Expand each pattern individually. */
3821 gcc_assert (npatterns
> 1);
3822 rtx_vector_builder builder
;
3823 auto_vec
<rtx
, 16> vectors (npatterns
);
3824 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3826 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3827 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3828 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3829 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3832 /* Use permutes to interleave the separate vectors. */
3833 while (npatterns
> 1)
3836 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3838 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
3839 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3840 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3844 gcc_assert (vectors
[0] == target
);
3848 /* Use WHILE to set a predicate register of mode MODE in which the first
3849 VL bits are set and the rest are clear. Use TARGET for the register
3850 if it's nonnull and convenient. */
3853 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
3856 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
3857 target
= aarch64_target_reg (target
, mode
);
3858 emit_insn (gen_while_ult (DImode
, mode
, target
, const0_rtx
, limit
));
3863 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
3865 /* BUILDER is a constant predicate in which the index of every set bit
3866 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3867 by inverting every element at a multiple of ELT_SIZE and EORing the
3868 result with an ELT_SIZE PTRUE.
3870 Return a register that contains the constant on success, otherwise
3871 return null. Use TARGET as the register if it is nonnull and
3875 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
3876 unsigned int elt_size
)
3878 /* Invert every element at a multiple of ELT_SIZE, keeping the
3880 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
3881 builder
.nelts_per_pattern ());
3882 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
3883 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
3884 inv_builder
.quick_push (const1_rtx
);
3886 inv_builder
.quick_push (const0_rtx
);
3887 inv_builder
.finalize ();
3889 /* See if we can load the constant cheaply. */
3890 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
3894 /* EOR the result with an ELT_SIZE PTRUE. */
3895 rtx mask
= aarch64_ptrue_all (elt_size
);
3896 mask
= force_reg (VNx16BImode
, mask
);
3897 target
= aarch64_target_reg (target
, VNx16BImode
);
3898 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
3902 /* BUILDER is a constant predicate in which the index of every set bit
3903 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3904 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3905 register on success, otherwise return null. Use TARGET as the register
3906 if nonnull and convenient. */
3909 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
3910 unsigned int elt_size
,
3911 unsigned int permute_size
)
3913 /* We're going to split the constant into two new constants A and B,
3914 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3915 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3917 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3918 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3920 where _ indicates elements that will be discarded by the permute.
3922 First calculate the ELT_SIZEs for A and B. */
3923 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
3924 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
3925 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
3926 if (INTVAL (builder
.elt (i
)) != 0)
3928 if (i
& permute_size
)
3929 b_elt_size
|= i
- permute_size
;
3933 a_elt_size
&= -a_elt_size
;
3934 b_elt_size
&= -b_elt_size
;
3936 /* Now construct the vectors themselves. */
3937 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
3938 builder
.nelts_per_pattern ());
3939 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
3940 builder
.nelts_per_pattern ());
3941 unsigned int nelts
= builder
.encoded_nelts ();
3942 for (unsigned int i
= 0; i
< nelts
; ++i
)
3943 if (i
& (elt_size
- 1))
3945 a_builder
.quick_push (const0_rtx
);
3946 b_builder
.quick_push (const0_rtx
);
3948 else if ((i
& permute_size
) == 0)
3950 /* The A and B elements are significant. */
3951 a_builder
.quick_push (builder
.elt (i
));
3952 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
3956 /* The A and B elements are going to be discarded, so pick whatever
3957 is likely to give a nice constant. We are targeting element
3958 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3959 with the aim of each being a sequence of ones followed by
3960 a sequence of zeros. So:
3962 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3963 duplicate the last X_ELT_SIZE element, to extend the
3964 current sequence of ones or zeros.
3966 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3967 zero, so that the constant really does have X_ELT_SIZE and
3968 not a smaller size. */
3969 if (a_elt_size
> permute_size
)
3970 a_builder
.quick_push (const0_rtx
);
3972 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
3973 if (b_elt_size
> permute_size
)
3974 b_builder
.quick_push (const0_rtx
);
3976 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
3978 a_builder
.finalize ();
3979 b_builder
.finalize ();
3981 /* Try loading A into a register. */
3982 rtx_insn
*last
= get_last_insn ();
3983 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
3987 /* Try loading B into a register. */
3989 if (a_builder
!= b_builder
)
3991 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
3994 delete_insns_since (last
);
3999 /* Emit the TRN1 itself. */
4000 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
4001 target
= aarch64_target_reg (target
, mode
);
4002 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
4003 gen_lowpart (mode
, a
),
4004 gen_lowpart (mode
, b
)));
4008 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4009 constant in BUILDER into an SVE predicate register. Return the register
4010 on success, otherwise return null. Use TARGET for the register if
4011 nonnull and convenient.
4013 ALLOW_RECURSE_P is true if we can use methods that would call this
4014 function recursively. */
4017 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
4018 bool allow_recurse_p
)
4020 if (builder
.encoded_nelts () == 1)
4021 /* A PFALSE or a PTRUE .B ALL. */
4022 return aarch64_emit_set_immediate (target
, builder
);
4024 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
4025 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
4027 /* If we can load the constant using PTRUE, use it as-is. */
4028 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
4029 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
4030 return aarch64_emit_set_immediate (target
, builder
);
4032 /* Otherwise use WHILE to set the first VL bits. */
4033 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
4036 if (!allow_recurse_p
)
4039 /* Try inverting the vector in element size ELT_SIZE and then EORing
4040 the result with an ELT_SIZE PTRUE. */
4041 if (INTVAL (builder
.elt (0)) == 0)
4042 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
4046 /* Try using TRN1 to permute two simpler constants. */
4047 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
4048 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
4055 /* Return an SVE predicate register that contains the VNx16BImode
4056 constant in BUILDER, without going through the move expanders.
4058 The returned register can have whatever mode seems most natural
4059 given the contents of BUILDER. Use TARGET for the result if
4063 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
4065 /* Try loading the constant using pure predicate operations. */
4066 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
4069 /* Try forcing the constant to memory. */
4070 if (builder
.full_nelts ().is_constant ())
4071 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
4073 target
= aarch64_target_reg (target
, VNx16BImode
);
4074 emit_move_insn (target
, mem
);
4078 /* The last resort is to load the constant as an integer and then
4079 compare it against zero. Use -1 for set bits in order to increase
4080 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
4081 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
4082 builder
.nelts_per_pattern ());
4083 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
4084 int_builder
.quick_push (INTVAL (builder
.elt (i
))
4085 ? constm1_rtx
: const0_rtx
);
4086 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
4087 int_builder
.build ());
4090 /* Set DEST to immediate IMM. */
4093 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
4095 machine_mode mode
= GET_MODE (dest
);
4097 /* Check on what type of symbol it is. */
4098 scalar_int_mode int_mode
;
4099 if ((GET_CODE (imm
) == SYMBOL_REF
4100 || GET_CODE (imm
) == LABEL_REF
4101 || GET_CODE (imm
) == CONST
4102 || GET_CODE (imm
) == CONST_POLY_INT
)
4103 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
4107 HOST_WIDE_INT const_offset
;
4108 enum aarch64_symbol_type sty
;
4110 /* If we have (const (plus symbol offset)), separate out the offset
4111 before we start classifying the symbol. */
4112 rtx base
= strip_offset (imm
, &offset
);
4114 /* We must always add an offset involving VL separately, rather than
4115 folding it into the relocation. */
4116 if (!offset
.is_constant (&const_offset
))
4118 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
4119 emit_insn (gen_rtx_SET (dest
, imm
));
4122 /* Do arithmetic on 32-bit values if the result is smaller
4124 if (partial_subreg_p (int_mode
, SImode
))
4126 /* It is invalid to do symbol calculations in modes
4127 narrower than SImode. */
4128 gcc_assert (base
== const0_rtx
);
4129 dest
= gen_lowpart (SImode
, dest
);
4132 if (base
!= const0_rtx
)
4134 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4135 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4136 NULL_RTX
, NULL_RTX
, false);
4139 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4140 dest
, NULL_RTX
, false);
4145 sty
= aarch64_classify_symbol (base
, const_offset
);
4148 case SYMBOL_FORCE_TO_MEM
:
4149 if (const_offset
!= 0
4150 && targetm
.cannot_force_const_mem (int_mode
, imm
))
4152 gcc_assert (can_create_pseudo_p ());
4153 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4154 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4155 NULL_RTX
, NULL_RTX
, false);
4159 mem
= force_const_mem (ptr_mode
, imm
);
4162 /* If we aren't generating PC relative literals, then
4163 we need to expand the literal pool access carefully.
4164 This is something that needs to be done in a number
4165 of places, so could well live as a separate function. */
4166 if (!aarch64_pcrelative_literal_loads
)
4168 gcc_assert (can_create_pseudo_p ());
4169 base
= gen_reg_rtx (ptr_mode
);
4170 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
4171 if (ptr_mode
!= Pmode
)
4172 base
= convert_memory_address (Pmode
, base
);
4173 mem
= gen_rtx_MEM (ptr_mode
, base
);
4176 if (int_mode
!= ptr_mode
)
4177 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
4179 emit_insn (gen_rtx_SET (dest
, mem
));
4183 case SYMBOL_SMALL_TLSGD
:
4184 case SYMBOL_SMALL_TLSDESC
:
4185 case SYMBOL_SMALL_TLSIE
:
4186 case SYMBOL_SMALL_GOT_28K
:
4187 case SYMBOL_SMALL_GOT_4G
:
4188 case SYMBOL_TINY_GOT
:
4189 case SYMBOL_TINY_TLSIE
:
4190 if (const_offset
!= 0)
4192 gcc_assert(can_create_pseudo_p ());
4193 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4194 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4195 NULL_RTX
, NULL_RTX
, false);
4200 case SYMBOL_SMALL_ABSOLUTE
:
4201 case SYMBOL_TINY_ABSOLUTE
:
4202 case SYMBOL_TLSLE12
:
4203 case SYMBOL_TLSLE24
:
4204 case SYMBOL_TLSLE32
:
4205 case SYMBOL_TLSLE48
:
4206 aarch64_load_symref_appropriately (dest
, imm
, sty
);
4214 if (!CONST_INT_P (imm
))
4216 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
4218 /* Only the low bit of each .H, .S and .D element is defined,
4219 so we can set the upper bits to whatever we like. If the
4220 predicate is all-true in MODE, prefer to set all the undefined
4221 bits as well, so that we can share a single .B predicate for
4223 if (imm
== CONSTM1_RTX (mode
))
4224 imm
= CONSTM1_RTX (VNx16BImode
);
4226 /* All methods for constructing predicate modes wider than VNx16BI
4227 will set the upper bits of each element to zero. Expose this
4228 by moving such constants as a VNx16BI, so that all bits are
4229 significant and so that constants for different modes can be
4230 shared. The wider constant will still be available as a
4232 rtx_vector_builder builder
;
4233 if (aarch64_get_sve_pred_bits (builder
, imm
))
4235 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
4237 emit_move_insn (dest
, gen_lowpart (mode
, res
));
4242 if (GET_CODE (imm
) == HIGH
4243 || aarch64_simd_valid_immediate (imm
, NULL
))
4245 emit_insn (gen_rtx_SET (dest
, imm
));
4249 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
4250 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
4253 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
4257 rtx mem
= force_const_mem (mode
, imm
);
4259 emit_move_insn (dest
, mem
);
4263 aarch64_internal_mov_immediate (dest
, imm
, true,
4264 as_a
<scalar_int_mode
> (mode
));
4267 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4268 that is known to contain PTRUE. */
4271 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
4273 expand_operand ops
[3];
4274 machine_mode mode
= GET_MODE (dest
);
4275 create_output_operand (&ops
[0], dest
, mode
);
4276 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
4277 create_input_operand (&ops
[2], src
, mode
);
4278 temporary_volatile_ok
v (true);
4279 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
4282 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4283 operand is in memory. In this case we need to use the predicated LD1
4284 and ST1 instead of LDR and STR, both for correctness on big-endian
4285 targets and because LD1 and ST1 support a wider range of addressing modes.
4286 PRED_MODE is the mode of the predicate.
4288 See the comment at the head of aarch64-sve.md for details about the
4289 big-endian handling. */
4292 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
4294 machine_mode mode
= GET_MODE (dest
);
4295 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4296 if (!register_operand (src
, mode
)
4297 && !register_operand (dest
, mode
))
4299 rtx tmp
= gen_reg_rtx (mode
);
4301 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
4303 emit_move_insn (tmp
, src
);
4306 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
4309 /* Called only on big-endian targets. See whether an SVE vector move
4310 from SRC to DEST is effectively a REV[BHW] instruction, because at
4311 least one operand is a subreg of an SVE vector that has wider or
4312 narrower elements. Return true and emit the instruction if so.
4316 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4318 represents a VIEW_CONVERT between the following vectors, viewed
4321 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4322 R1: { [0], [1], [2], [3], ... }
4324 The high part of lane X in R2 should therefore correspond to lane X*2
4325 of R1, but the register representations are:
4328 R2: ...... [1].high [1].low [0].high [0].low
4329 R1: ...... [3] [2] [1] [0]
4331 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4332 We therefore need a reverse operation to swap the high and low values
4335 This is purely an optimization. Without it we would spill the
4336 subreg operand to the stack in one mode and reload it in the
4337 other mode, which has the same effect as the REV. */
4340 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
4342 gcc_assert (BYTES_BIG_ENDIAN
);
4343 if (GET_CODE (dest
) == SUBREG
)
4344 dest
= SUBREG_REG (dest
);
4345 if (GET_CODE (src
) == SUBREG
)
4346 src
= SUBREG_REG (src
);
4348 /* The optimization handles two single SVE REGs with different element
4352 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
4353 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
4354 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
4355 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
4358 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4359 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
4360 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
4362 emit_insn (gen_rtx_SET (dest
, unspec
));
4366 /* Return a copy of X with mode MODE, without changing its other
4367 attributes. Unlike gen_lowpart, this doesn't care whether the
4368 mode change is valid. */
4371 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
4373 if (GET_MODE (x
) == mode
)
4376 x
= shallow_copy_rtx (x
);
4377 set_mode_and_regno (x
, mode
, REGNO (x
));
4381 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
4382 stored in wider integer containers. */
4385 aarch64_sve_rev_unspec (machine_mode mode
)
4387 switch (GET_MODE_UNIT_SIZE (mode
))
4389 case 1: return UNSPEC_REVB
;
4390 case 2: return UNSPEC_REVH
;
4391 case 4: return UNSPEC_REVW
;
4396 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4400 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
4402 /* Decide which REV operation we need. The mode with wider elements
4403 determines the mode of the operands and the mode with the narrower
4404 elements determines the reverse width. */
4405 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
4406 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
4407 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
4408 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
4409 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
4411 unsigned int unspec
= aarch64_sve_rev_unspec (mode_with_narrower_elts
);
4412 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
4413 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
4415 /* Get the operands in the appropriate modes and emit the instruction. */
4416 ptrue
= gen_lowpart (pred_mode
, ptrue
);
4417 dest
= aarch64_replace_reg_mode (dest
, mode_with_wider_elts
);
4418 src
= aarch64_replace_reg_mode (src
, mode_with_wider_elts
);
4419 emit_insn (gen_aarch64_pred (unspec
, mode_with_wider_elts
,
4424 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
4425 tree exp ATTRIBUTE_UNUSED
)
4427 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
4433 /* Implement TARGET_PASS_BY_REFERENCE. */
4436 aarch64_pass_by_reference (cumulative_args_t
, const function_arg_info
&arg
)
4439 machine_mode dummymode
;
4442 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4443 if (arg
.mode
== BLKmode
&& arg
.type
)
4444 size
= int_size_in_bytes (arg
.type
);
4446 /* No frontends can create types with variable-sized modes, so we
4447 shouldn't be asked to pass or return them. */
4448 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
4450 /* Aggregates are passed by reference based on their size. */
4451 if (arg
.aggregate_type_p ())
4452 size
= int_size_in_bytes (arg
.type
);
4454 /* Variable sized arguments are always returned by reference. */
4458 /* Can this be a candidate to be passed in fp/simd register(s)? */
4459 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
4464 /* Arguments which are variable sized or larger than 2 registers are
4465 passed by reference unless they are a homogenous floating point
4467 return size
> 2 * UNITS_PER_WORD
;
4470 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4472 aarch64_return_in_msb (const_tree valtype
)
4474 machine_mode dummy_mode
;
4477 /* Never happens in little-endian mode. */
4478 if (!BYTES_BIG_ENDIAN
)
4481 /* Only composite types smaller than or equal to 16 bytes can
4482 be potentially returned in registers. */
4483 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4484 || int_size_in_bytes (valtype
) <= 0
4485 || int_size_in_bytes (valtype
) > 16)
4488 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4489 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4490 is always passed/returned in the least significant bits of fp/simd
4492 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4493 &dummy_mode
, &dummy_int
, NULL
))
4499 /* Implement TARGET_FUNCTION_VALUE.
4500 Define how to find the value returned by a function. */
4503 aarch64_function_value (const_tree type
, const_tree func
,
4504 bool outgoing ATTRIBUTE_UNUSED
)
4509 machine_mode ag_mode
;
4511 mode
= TYPE_MODE (type
);
4512 if (INTEGRAL_TYPE_P (type
))
4513 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
4515 if (aarch64_return_in_msb (type
))
4517 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4519 if (size
% UNITS_PER_WORD
!= 0)
4521 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4522 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4526 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4527 &ag_mode
, &count
, NULL
))
4529 if (!aarch64_composite_type_p (type
, mode
))
4531 gcc_assert (count
== 1 && mode
== ag_mode
);
4532 return gen_rtx_REG (mode
, V0_REGNUM
);
4539 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
4540 for (i
= 0; i
< count
; i
++)
4542 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
4543 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
4544 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4545 XVECEXP (par
, 0, i
) = tmp
;
4551 return gen_rtx_REG (mode
, R0_REGNUM
);
4554 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4555 Return true if REGNO is the number of a hard register in which the values
4556 of called function may come back. */
4559 aarch64_function_value_regno_p (const unsigned int regno
)
4561 /* Maximum of 16 bytes can be returned in the general registers. Examples
4562 of 16-byte return values are: 128-bit integers and 16-byte small
4563 structures (excluding homogeneous floating-point aggregates). */
4564 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
4567 /* Up to four fp/simd registers can return a function value, e.g. a
4568 homogeneous floating-point aggregate having four members. */
4569 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
4570 return TARGET_FLOAT
;
4575 /* Implement TARGET_RETURN_IN_MEMORY.
4577 If the type T of the result of a function is such that
4579 would require that arg be passed as a value in a register (or set of
4580 registers) according to the parameter passing rules, then the result
4581 is returned in the same registers as would be used for such an
4585 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
4588 machine_mode ag_mode
;
4591 if (!AGGREGATE_TYPE_P (type
)
4592 && TREE_CODE (type
) != COMPLEX_TYPE
4593 && TREE_CODE (type
) != VECTOR_TYPE
)
4594 /* Simple scalar types always returned in registers. */
4597 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
4604 /* Types larger than 2 registers returned in memory. */
4605 size
= int_size_in_bytes (type
);
4606 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
4610 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
4611 const_tree type
, int *nregs
)
4613 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4614 return aarch64_vfp_is_call_or_return_candidate (mode
,
4616 &pcum
->aapcs_vfp_rmode
,
4621 /* Given MODE and TYPE of a function argument, return the alignment in
4622 bits. The idea is to suppress any stronger alignment requested by
4623 the user and opt for the natural alignment (specified in AAPCS64 \S
4624 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4625 calculated in versions of GCC prior to GCC-9. This is a helper
4626 function for local use only. */
4629 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
4634 return GET_MODE_ALIGNMENT (mode
);
4636 if (integer_zerop (TYPE_SIZE (type
)))
4639 gcc_assert (TYPE_MODE (type
) == mode
);
4641 if (!AGGREGATE_TYPE_P (type
))
4642 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
4644 if (TREE_CODE (type
) == ARRAY_TYPE
)
4645 return TYPE_ALIGN (TREE_TYPE (type
));
4647 unsigned int alignment
= 0;
4648 unsigned int bitfield_alignment
= 0;
4649 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
4650 if (TREE_CODE (field
) == FIELD_DECL
)
4652 alignment
= std::max (alignment
, DECL_ALIGN (field
));
4653 if (DECL_BIT_FIELD_TYPE (field
))
4655 = std::max (bitfield_alignment
,
4656 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
4659 if (bitfield_alignment
> alignment
)
4662 return bitfield_alignment
;
4668 /* Layout a function argument according to the AAPCS64 rules. The rule
4669 numbers refer to the rule numbers in the AAPCS64. */
4672 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4674 bool named ATTRIBUTE_UNUSED
)
4676 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4677 int ncrn
, nvrn
, nregs
;
4678 bool allocate_ncrn
, allocate_nvrn
;
4682 /* We need to do this once per argument. */
4683 if (pcum
->aapcs_arg_processed
)
4686 pcum
->aapcs_arg_processed
= true;
4688 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4690 size
= int_size_in_bytes (type
);
4692 /* No frontends can create types with variable-sized modes, so we
4693 shouldn't be asked to pass or return them. */
4694 size
= GET_MODE_SIZE (mode
).to_constant ();
4695 size
= ROUND_UP (size
, UNITS_PER_WORD
);
4697 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
4698 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
4703 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4704 The following code thus handles passing by SIMD/FP registers first. */
4706 nvrn
= pcum
->aapcs_nvrn
;
4708 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
4709 and homogenous short-vector aggregates (HVA). */
4713 aarch64_err_no_fpadvsimd (mode
);
4715 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
4717 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
4718 if (!aarch64_composite_type_p (type
, mode
))
4720 gcc_assert (nregs
== 1);
4721 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
4727 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4728 for (i
= 0; i
< nregs
; i
++)
4730 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
4731 V0_REGNUM
+ nvrn
+ i
);
4732 rtx offset
= gen_int_mode
4733 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
4734 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4735 XVECEXP (par
, 0, i
) = tmp
;
4737 pcum
->aapcs_reg
= par
;
4743 /* C.3 NSRN is set to 8. */
4744 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
4749 ncrn
= pcum
->aapcs_ncrn
;
4750 nregs
= size
/ UNITS_PER_WORD
;
4752 /* C6 - C9. though the sign and zero extension semantics are
4753 handled elsewhere. This is the case where the argument fits
4754 entirely general registers. */
4755 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
4757 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
4759 /* C.8 if the argument has an alignment of 16 then the NGRN is
4760 rounded up to the next even number. */
4763 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4764 comparison is there because for > 16 * BITS_PER_UNIT
4765 alignment nregs should be > 2 and therefore it should be
4766 passed by reference rather than value. */
4767 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4768 == 16 * BITS_PER_UNIT
))
4770 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4771 inform (input_location
, "parameter passing for argument of type "
4772 "%qT changed in GCC 9.1", type
);
4774 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
4777 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4778 A reg is still generated for it, but the caller should be smart
4779 enough not to use it. */
4780 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
4781 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
4787 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4788 for (i
= 0; i
< nregs
; i
++)
4790 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
4791 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
4792 GEN_INT (i
* UNITS_PER_WORD
));
4793 XVECEXP (par
, 0, i
) = tmp
;
4795 pcum
->aapcs_reg
= par
;
4798 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
4803 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
4805 /* The argument is passed on stack; record the needed number of words for
4806 this argument and align the total size if necessary. */
4808 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
4810 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4811 == 16 * BITS_PER_UNIT
)
4813 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
4814 if (pcum
->aapcs_stack_size
!= new_size
)
4816 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4817 inform (input_location
, "parameter passing for argument of type "
4818 "%qT changed in GCC 9.1", type
);
4819 pcum
->aapcs_stack_size
= new_size
;
4825 /* Implement TARGET_FUNCTION_ARG. */
4828 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
4830 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4831 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
4832 || pcum
->pcs_variant
== ARM_PCS_SIMD
);
4834 if (arg
.end_marker_p ())
4835 return gen_int_mode (pcum
->pcs_variant
, DImode
);
4837 aarch64_layout_arg (pcum_v
, arg
.mode
, arg
.type
, arg
.named
);
4838 return pcum
->aapcs_reg
;
4842 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
4844 rtx libname ATTRIBUTE_UNUSED
,
4845 const_tree fndecl ATTRIBUTE_UNUSED
,
4846 unsigned n_named ATTRIBUTE_UNUSED
)
4848 pcum
->aapcs_ncrn
= 0;
4849 pcum
->aapcs_nvrn
= 0;
4850 pcum
->aapcs_nextncrn
= 0;
4851 pcum
->aapcs_nextnvrn
= 0;
4853 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
4855 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
4856 pcum
->aapcs_reg
= NULL_RTX
;
4857 pcum
->aapcs_arg_processed
= false;
4858 pcum
->aapcs_stack_words
= 0;
4859 pcum
->aapcs_stack_size
= 0;
4862 && fndecl
&& TREE_PUBLIC (fndecl
)
4863 && fntype
&& fntype
!= error_mark_node
)
4865 const_tree type
= TREE_TYPE (fntype
);
4866 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
4867 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
4868 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
4869 &mode
, &nregs
, NULL
))
4870 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
4876 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
4877 const function_arg_info
&arg
)
4879 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4880 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
4881 || pcum
->pcs_variant
== ARM_PCS_SIMD
)
4883 aarch64_layout_arg (pcum_v
, arg
.mode
, arg
.type
, arg
.named
);
4884 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
4885 != (pcum
->aapcs_stack_words
!= 0));
4886 pcum
->aapcs_arg_processed
= false;
4887 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
4888 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
4889 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
4890 pcum
->aapcs_stack_words
= 0;
4891 pcum
->aapcs_reg
= NULL_RTX
;
4896 aarch64_function_arg_regno_p (unsigned regno
)
4898 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
4899 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
4902 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4903 PARM_BOUNDARY bits of alignment, but will be given anything up
4904 to STACK_BOUNDARY bits if the type requires it. This makes sure
4905 that both before and after the layout of each argument, the Next
4906 Stacked Argument Address (NSAA) will have a minimum alignment of
4910 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
4913 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
4915 if (abi_break
& warn_psabi
)
4916 inform (input_location
, "parameter passing for argument of type "
4917 "%qT changed in GCC 9.1", type
);
4919 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
4922 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4924 static fixed_size_mode
4925 aarch64_get_reg_raw_mode (int regno
)
4927 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
4928 /* Don't use the SVE part of the register for __builtin_apply and
4929 __builtin_return. The SVE registers aren't used by the normal PCS,
4930 so using them there would be a waste of time. The PCS extensions
4931 for SVE types are fundamentally incompatible with the
4932 __builtin_return/__builtin_apply interface. */
4933 return as_a
<fixed_size_mode
> (V16QImode
);
4934 return default_get_reg_raw_mode (regno
);
4937 /* Implement TARGET_FUNCTION_ARG_PADDING.
4939 Small aggregate types are placed in the lowest memory address.
4941 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4943 static pad_direction
4944 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4946 /* On little-endian targets, the least significant byte of every stack
4947 argument is passed at the lowest byte address of the stack slot. */
4948 if (!BYTES_BIG_ENDIAN
)
4951 /* Otherwise, integral, floating-point and pointer types are padded downward:
4952 the least significant byte of a stack argument is passed at the highest
4953 byte address of the stack slot. */
4955 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4956 || POINTER_TYPE_P (type
))
4957 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4958 return PAD_DOWNWARD
;
4960 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4964 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4966 It specifies padding for the last (may also be the only)
4967 element of a block move between registers and memory. If
4968 assuming the block is in the memory, padding upward means that
4969 the last element is padded after its highest significant byte,
4970 while in downward padding, the last element is padded at the
4971 its least significant byte side.
4973 Small aggregates and small complex types are always padded
4976 We don't need to worry about homogeneous floating-point or
4977 short-vector aggregates; their move is not affected by the
4978 padding direction determined here. Regardless of endianness,
4979 each element of such an aggregate is put in the least
4980 significant bits of a fp/simd register.
4982 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4983 register has useful data, and return the opposite if the most
4984 significant byte does. */
4987 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4988 bool first ATTRIBUTE_UNUSED
)
4991 /* Small composite types are always padded upward. */
4992 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4996 size
= int_size_in_bytes (type
);
4998 /* No frontends can create types with variable-sized modes, so we
4999 shouldn't be asked to pass or return them. */
5000 size
= GET_MODE_SIZE (mode
).to_constant ();
5001 if (size
< 2 * UNITS_PER_WORD
)
5005 /* Otherwise, use the default padding. */
5006 return !BYTES_BIG_ENDIAN
;
5009 static scalar_int_mode
5010 aarch64_libgcc_cmp_return_mode (void)
5015 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5017 /* We use the 12-bit shifted immediate arithmetic instructions so values
5018 must be multiple of (1 << 12), i.e. 4096. */
5019 #define ARITH_FACTOR 4096
5021 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5022 #error Cannot use simple address calculation for stack probing
5025 /* The pair of scratch registers used for stack probing. */
5026 #define PROBE_STACK_FIRST_REG R9_REGNUM
5027 #define PROBE_STACK_SECOND_REG R10_REGNUM
5029 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5030 inclusive. These are offsets from the current stack pointer. */
5033 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
5036 if (!poly_size
.is_constant (&size
))
5038 sorry ("stack probes for SVE frames");
5042 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
5044 /* See the same assertion on PROBE_INTERVAL above. */
5045 gcc_assert ((first
% ARITH_FACTOR
) == 0);
5047 /* See if we have a constant small number of probes to generate. If so,
5048 that's the easy case. */
5049 if (size
<= PROBE_INTERVAL
)
5051 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
5053 emit_set_insn (reg1
,
5054 plus_constant (Pmode
,
5055 stack_pointer_rtx
, -(first
+ base
)));
5056 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
5059 /* The run-time loop is made up of 8 insns in the generic case while the
5060 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5061 else if (size
<= 4 * PROBE_INTERVAL
)
5063 HOST_WIDE_INT i
, rem
;
5065 emit_set_insn (reg1
,
5066 plus_constant (Pmode
,
5068 -(first
+ PROBE_INTERVAL
)));
5069 emit_stack_probe (reg1
);
5071 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5072 it exceeds SIZE. If only two probes are needed, this will not
5073 generate any code. Then probe at FIRST + SIZE. */
5074 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
5076 emit_set_insn (reg1
,
5077 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
5078 emit_stack_probe (reg1
);
5081 rem
= size
- (i
- PROBE_INTERVAL
);
5084 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5086 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
5087 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
5090 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
5093 /* Otherwise, do the same as above, but in a loop. Note that we must be
5094 extra careful with variables wrapping around because we might be at
5095 the very top (or the very bottom) of the address space and we have
5096 to be able to handle this case properly; in particular, we use an
5097 equality test for the loop condition. */
5100 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
5102 /* Step 1: round SIZE to the previous multiple of the interval. */
5104 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
5107 /* Step 2: compute initial and final value of the loop counter. */
5109 /* TEST_ADDR = SP + FIRST. */
5110 emit_set_insn (reg1
,
5111 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
5113 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5114 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
5115 if (! aarch64_uimm12_shift (adjustment
))
5117 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
5119 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
5122 emit_set_insn (reg2
,
5123 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
5129 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5132 while (TEST_ADDR != LAST_ADDR)
5134 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5135 until it is equal to ROUNDED_SIZE. */
5137 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
5140 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5141 that SIZE is equal to ROUNDED_SIZE. */
5143 if (size
!= rounded_size
)
5145 HOST_WIDE_INT rem
= size
- rounded_size
;
5149 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5151 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
5152 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
5155 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
5159 /* Make sure nothing is scheduled before we are done. */
5160 emit_insn (gen_blockage ());
5163 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5164 absolute addresses. */
5167 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
5169 static int labelno
= 0;
5173 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
5176 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
5178 HOST_WIDE_INT stack_clash_probe_interval
5179 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5181 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5183 HOST_WIDE_INT interval
;
5184 if (flag_stack_clash_protection
)
5185 interval
= stack_clash_probe_interval
;
5187 interval
= PROBE_INTERVAL
;
5189 gcc_assert (aarch64_uimm12_shift (interval
));
5190 xops
[1] = GEN_INT (interval
);
5192 output_asm_insn ("sub\t%0, %0, %1", xops
);
5194 /* If doing stack clash protection then we probe up by the ABI specified
5195 amount. We do this because we're dropping full pages at a time in the
5196 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5197 if (flag_stack_clash_protection
)
5198 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
5200 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
5202 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5203 by this amount for each iteration. */
5204 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5206 /* Test if TEST_ADDR == LAST_ADDR. */
5208 output_asm_insn ("cmp\t%0, %1", xops
);
5211 fputs ("\tb.ne\t", asm_out_file
);
5212 assemble_name_raw (asm_out_file
, loop_lab
);
5213 fputc ('\n', asm_out_file
);
5218 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5219 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5220 of GUARD_SIZE. When a probe is emitted it is done at most
5221 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5222 at most MIN_PROBE_THRESHOLD. By the end of this function
5223 BASE = BASE - ADJUSTMENT. */
5226 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
5227 rtx min_probe_threshold
, rtx guard_size
)
5229 /* This function is not allowed to use any instruction generation function
5230 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5231 so instead emit the code you want using output_asm_insn. */
5232 gcc_assert (flag_stack_clash_protection
);
5233 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
5234 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
5236 /* The minimum required allocation before the residual requires probing. */
5237 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
5239 /* Clamp the value down to the nearest value that can be used with a cmp. */
5240 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
5241 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
5243 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
5244 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
5246 static int labelno
= 0;
5247 char loop_start_lab
[32];
5248 char loop_end_lab
[32];
5251 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
5252 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
5254 /* Emit loop start label. */
5255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
5257 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5258 xops
[0] = adjustment
;
5259 xops
[1] = probe_offset_value_rtx
;
5260 output_asm_insn ("cmp\t%0, %1", xops
);
5262 /* Branch to end if not enough adjustment to probe. */
5263 fputs ("\tb.lt\t", asm_out_file
);
5264 assemble_name_raw (asm_out_file
, loop_end_lab
);
5265 fputc ('\n', asm_out_file
);
5267 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5269 xops
[1] = probe_offset_value_rtx
;
5270 output_asm_insn ("sub\t%0, %0, %1", xops
);
5272 /* Probe at BASE. */
5273 xops
[1] = const0_rtx
;
5274 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5276 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5277 xops
[0] = adjustment
;
5278 xops
[1] = probe_offset_value_rtx
;
5279 output_asm_insn ("sub\t%0, %0, %1", xops
);
5281 /* Branch to start if still more bytes to allocate. */
5282 fputs ("\tb\t", asm_out_file
);
5283 assemble_name_raw (asm_out_file
, loop_start_lab
);
5284 fputc ('\n', asm_out_file
);
5286 /* No probe leave. */
5287 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
5289 /* BASE = BASE - ADJUSTMENT. */
5291 xops
[1] = adjustment
;
5292 output_asm_insn ("sub\t%0, %0, %1", xops
);
5296 /* Determine whether a frame chain needs to be generated. */
5298 aarch64_needs_frame_chain (void)
5300 /* Force a frame chain for EH returns so the return address is at FP+8. */
5301 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
5304 /* A leaf function cannot have calls or write LR. */
5305 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
5307 /* Don't use a frame chain in leaf functions if leaf frame pointers
5309 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
5312 return aarch64_use_frame_pointer
;
5315 /* Mark the registers that need to be saved by the callee and calculate
5316 the size of the callee-saved registers area and frame record (both FP
5317 and LR may be omitted). */
5319 aarch64_layout_frame (void)
5321 HOST_WIDE_INT offset
= 0;
5322 int regno
, last_fp_reg
= INVALID_REGNUM
;
5323 bool simd_function
= (crtl
->abi
->id () == ARM_PCS_SIMD
);
5325 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
5327 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5328 the mid-end is doing. */
5329 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
5331 #define SLOT_NOT_REQUIRED (-2)
5332 #define SLOT_REQUIRED (-1)
5334 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
5335 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
5337 /* First mark all the registers that really need to be saved... */
5338 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5339 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5341 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5342 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5344 /* ... that includes the eh data registers (if needed)... */
5345 if (crtl
->calls_eh_return
)
5346 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
5347 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
5350 /* ... and any callee saved register that dataflow says is live. */
5351 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5352 if (df_regs_ever_live_p (regno
)
5353 && !fixed_regs
[regno
]
5354 && (regno
== R30_REGNUM
5355 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
5356 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5358 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5359 if (df_regs_ever_live_p (regno
)
5360 && !fixed_regs
[regno
]
5361 && !crtl
->abi
->clobbers_full_reg_p (regno
))
5363 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5364 last_fp_reg
= regno
;
5367 if (cfun
->machine
->frame
.emit_frame_chain
)
5369 /* FP and LR are placed in the linkage record. */
5370 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
5371 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
5372 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
5373 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
5374 offset
= 2 * UNITS_PER_WORD
;
5377 /* With stack-clash, LR must be saved in non-leaf functions. */
5378 gcc_assert (crtl
->is_leaf
5379 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
5380 != SLOT_NOT_REQUIRED
));
5382 /* Now assign stack slots for them. */
5383 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5384 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5386 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5387 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5388 cfun
->machine
->frame
.wb_candidate1
= regno
;
5389 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
5390 cfun
->machine
->frame
.wb_candidate2
= regno
;
5391 offset
+= UNITS_PER_WORD
;
5394 HOST_WIDE_INT max_int_offset
= offset
;
5395 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5396 bool has_align_gap
= offset
!= max_int_offset
;
5398 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5399 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5401 /* If there is an alignment gap between integer and fp callee-saves,
5402 allocate the last fp register to it if possible. */
5403 if (regno
== last_fp_reg
5406 && (offset
& 8) == 0)
5408 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
5412 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5413 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5414 cfun
->machine
->frame
.wb_candidate1
= regno
;
5415 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
5416 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
5417 cfun
->machine
->frame
.wb_candidate2
= regno
;
5418 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
5421 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5423 cfun
->machine
->frame
.saved_regs_size
= offset
;
5425 HOST_WIDE_INT varargs_and_saved_regs_size
5426 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
5428 cfun
->machine
->frame
.hard_fp_offset
5429 = aligned_upper_bound (varargs_and_saved_regs_size
5430 + get_frame_size (),
5431 STACK_BOUNDARY
/ BITS_PER_UNIT
);
5433 /* Both these values are already aligned. */
5434 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
5435 STACK_BOUNDARY
/ BITS_PER_UNIT
));
5436 cfun
->machine
->frame
.frame_size
5437 = (cfun
->machine
->frame
.hard_fp_offset
5438 + crtl
->outgoing_args_size
);
5440 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
5442 cfun
->machine
->frame
.initial_adjust
= 0;
5443 cfun
->machine
->frame
.final_adjust
= 0;
5444 cfun
->machine
->frame
.callee_adjust
= 0;
5445 cfun
->machine
->frame
.callee_offset
= 0;
5447 HOST_WIDE_INT max_push_offset
= 0;
5448 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
5449 max_push_offset
= 512;
5450 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
5451 max_push_offset
= 256;
5453 HOST_WIDE_INT const_size
, const_fp_offset
;
5454 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
5455 && const_size
< max_push_offset
5456 && known_eq (crtl
->outgoing_args_size
, 0))
5458 /* Simple, small frame with no outgoing arguments:
5459 stp reg1, reg2, [sp, -frame_size]!
5460 stp reg3, reg4, [sp, 16] */
5461 cfun
->machine
->frame
.callee_adjust
= const_size
;
5463 else if (known_lt (crtl
->outgoing_args_size
5464 + cfun
->machine
->frame
.saved_regs_size
, 512)
5465 && !(cfun
->calls_alloca
5466 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
5469 /* Frame with small outgoing arguments:
5470 sub sp, sp, frame_size
5471 stp reg1, reg2, [sp, outgoing_args_size]
5472 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5473 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
5474 cfun
->machine
->frame
.callee_offset
5475 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
5477 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
5478 && const_fp_offset
< max_push_offset
)
5480 /* Frame with large outgoing arguments but a small local area:
5481 stp reg1, reg2, [sp, -hard_fp_offset]!
5482 stp reg3, reg4, [sp, 16]
5483 sub sp, sp, outgoing_args_size */
5484 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
5485 cfun
->machine
->frame
.final_adjust
5486 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
5490 /* Frame with large local area and outgoing arguments using frame pointer:
5491 sub sp, sp, hard_fp_offset
5492 stp x29, x30, [sp, 0]
5494 stp reg3, reg4, [sp, 16]
5495 sub sp, sp, outgoing_args_size */
5496 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
5497 cfun
->machine
->frame
.final_adjust
5498 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
5501 cfun
->machine
->frame
.laid_out
= true;
5504 /* Return true if the register REGNO is saved on entry to
5505 the current function. */
5508 aarch64_register_saved_on_entry (int regno
)
5510 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
5513 /* Return the next register up from REGNO up to LIMIT for the callee
5517 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
5519 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
5524 /* Push the register number REGNO of mode MODE to the stack with write-back
5525 adjusting the stack by ADJUSTMENT. */
5528 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
5529 HOST_WIDE_INT adjustment
)
5531 rtx base_rtx
= stack_pointer_rtx
;
5534 reg
= gen_rtx_REG (mode
, regno
);
5535 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
5536 plus_constant (Pmode
, base_rtx
, -adjustment
));
5537 mem
= gen_frame_mem (mode
, mem
);
5539 insn
= emit_move_insn (mem
, reg
);
5540 RTX_FRAME_RELATED_P (insn
) = 1;
5543 /* Generate and return an instruction to store the pair of registers
5544 REG and REG2 of mode MODE to location BASE with write-back adjusting
5545 the stack location BASE by ADJUSTMENT. */
5548 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5549 HOST_WIDE_INT adjustment
)
5554 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
5555 GEN_INT (-adjustment
),
5556 GEN_INT (UNITS_PER_WORD
- adjustment
));
5558 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
5559 GEN_INT (-adjustment
),
5560 GEN_INT (UNITS_PER_WORD
- adjustment
));
5562 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
5563 GEN_INT (-adjustment
),
5564 GEN_INT (UNITS_PER_VREG
- adjustment
));
5570 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5571 stack pointer by ADJUSTMENT. */
5574 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
5577 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5579 if (regno2
== INVALID_REGNUM
)
5580 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
5582 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5583 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5585 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
5587 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
5588 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5589 RTX_FRAME_RELATED_P (insn
) = 1;
5592 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5593 adjusting it by ADJUSTMENT afterwards. */
5596 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5597 HOST_WIDE_INT adjustment
)
5602 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5603 GEN_INT (UNITS_PER_WORD
));
5605 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5606 GEN_INT (UNITS_PER_WORD
));
5608 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5609 GEN_INT (UNITS_PER_VREG
));
5615 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5616 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5620 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
5623 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5624 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5626 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
5628 if (regno2
== INVALID_REGNUM
)
5630 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
5631 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
5632 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
5636 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5637 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5638 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
5643 /* Generate and return a store pair instruction of mode MODE to store
5644 register REG1 to MEM1 and register REG2 to MEM2. */
5647 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
5653 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
5656 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
5659 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
5666 /* Generate and regurn a load pair isntruction of mode MODE to load register
5667 REG1 from MEM1 and register REG2 from MEM2. */
5670 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
5676 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
5679 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
5682 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
5689 /* Return TRUE if return address signing should be enabled for the current
5690 function, otherwise return FALSE. */
5693 aarch64_return_address_signing_enabled (void)
5695 /* This function should only be called after frame laid out. */
5696 gcc_assert (cfun
->machine
->frame
.laid_out
);
5698 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5699 if its LR is pushed onto stack. */
5700 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
5701 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
5702 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
5705 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5707 aarch64_bti_enabled (void)
5709 return (aarch64_enable_bti
== 1);
5712 /* Emit code to save the callee-saved registers from register number START
5713 to LIMIT to the stack at the location starting at offset START_OFFSET,
5714 skipping any write-back candidates if SKIP_WB is true. */
5717 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
5718 unsigned start
, unsigned limit
, bool skip_wb
)
5724 for (regno
= aarch64_next_callee_save (start
, limit
);
5726 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5733 && (regno
== cfun
->machine
->frame
.wb_candidate1
5734 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5737 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5740 reg
= gen_rtx_REG (mode
, regno
);
5741 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5742 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5745 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5746 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5747 - cfun
->machine
->frame
.reg_offset
[regno
];
5750 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5751 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5753 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5756 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5757 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5759 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
5762 /* The first part of a frame-related parallel insn is
5763 always assumed to be relevant to the frame
5764 calculations; subsequent parts, are only
5765 frame-related if explicitly marked. */
5766 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5770 insn
= emit_move_insn (mem
, reg
);
5772 RTX_FRAME_RELATED_P (insn
) = 1;
5776 /* Emit code to restore the callee registers of mode MODE from register
5777 number START up to and including LIMIT. Restore from the stack offset
5778 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5779 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5782 aarch64_restore_callee_saves (machine_mode mode
,
5783 poly_int64 start_offset
, unsigned start
,
5784 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
5786 rtx base_rtx
= stack_pointer_rtx
;
5791 for (regno
= aarch64_next_callee_save (start
, limit
);
5793 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5795 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5802 && (regno
== cfun
->machine
->frame
.wb_candidate1
5803 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5806 reg
= gen_rtx_REG (mode
, regno
);
5807 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5808 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5810 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5811 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5812 - cfun
->machine
->frame
.reg_offset
[regno
];
5815 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5816 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5818 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5821 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5822 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5823 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5825 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5829 emit_move_insn (reg
, mem
);
5830 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
5834 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5838 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5840 HOST_WIDE_INT multiple
;
5841 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5842 && IN_RANGE (multiple
, -8, 7));
5845 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5849 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5851 HOST_WIDE_INT multiple
;
5852 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5853 && IN_RANGE (multiple
, 0, 63));
5856 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5860 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5862 HOST_WIDE_INT multiple
;
5863 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5864 && IN_RANGE (multiple
, -64, 63));
5867 /* Return true if OFFSET is a signed 9-bit value. */
5870 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
5873 HOST_WIDE_INT const_offset
;
5874 return (offset
.is_constant (&const_offset
)
5875 && IN_RANGE (const_offset
, -256, 255));
5878 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5882 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5884 HOST_WIDE_INT multiple
;
5885 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5886 && IN_RANGE (multiple
, -256, 255));
5889 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5893 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5895 HOST_WIDE_INT multiple
;
5896 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5897 && IN_RANGE (multiple
, 0, 4095));
5900 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5903 aarch64_get_separate_components (void)
5905 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5906 bitmap_clear (components
);
5908 /* The registers we need saved to the frame. */
5909 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5910 if (aarch64_register_saved_on_entry (regno
))
5912 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5913 if (!frame_pointer_needed
)
5914 offset
+= cfun
->machine
->frame
.frame_size
5915 - cfun
->machine
->frame
.hard_fp_offset
;
5916 /* Check that we can access the stack slot of the register with one
5917 direct load with no adjustments needed. */
5918 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
5919 bitmap_set_bit (components
, regno
);
5922 /* Don't mess with the hard frame pointer. */
5923 if (frame_pointer_needed
)
5924 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
5926 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5927 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5928 /* If registers have been chosen to be stored/restored with
5929 writeback don't interfere with them to avoid having to output explicit
5930 stack adjustment instructions. */
5931 if (reg2
!= INVALID_REGNUM
)
5932 bitmap_clear_bit (components
, reg2
);
5933 if (reg1
!= INVALID_REGNUM
)
5934 bitmap_clear_bit (components
, reg1
);
5936 bitmap_clear_bit (components
, LR_REGNUM
);
5937 bitmap_clear_bit (components
, SP_REGNUM
);
5942 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5945 aarch64_components_for_bb (basic_block bb
)
5947 bitmap in
= DF_LIVE_IN (bb
);
5948 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
5949 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
5951 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5952 bitmap_clear (components
);
5954 /* Clobbered registers don't generate values in any meaningful sense,
5955 since nothing after the clobber can rely on their value. And we can't
5956 say that partially-clobbered registers are unconditionally killed,
5957 because whether they're killed or not depends on the mode of the
5958 value they're holding. Thus partially call-clobbered registers
5959 appear in neither the kill set nor the gen set.
5961 Check manually for any calls that clobber more of a register than the
5962 current function can. */
5963 function_abi_aggregator callee_abis
;
5965 FOR_BB_INSNS (bb
, insn
)
5967 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
5968 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
5970 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5971 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5972 if (!fixed_regs
[regno
]
5973 && !crtl
->abi
->clobbers_full_reg_p (regno
)
5974 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
5975 || bitmap_bit_p (in
, regno
)
5976 || bitmap_bit_p (gen
, regno
)
5977 || bitmap_bit_p (kill
, regno
)))
5979 unsigned regno2
, offset
, offset2
;
5980 bitmap_set_bit (components
, regno
);
5982 /* If there is a callee-save at an adjacent offset, add it too
5983 to increase the use of LDP/STP. */
5984 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5985 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5987 if (regno2
<= LAST_SAVED_REGNUM
)
5989 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5990 if ((offset
& ~8) == (offset2
& ~8))
5991 bitmap_set_bit (components
, regno2
);
5998 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5999 Nothing to do for aarch64. */
6002 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
6006 /* Return the next set bit in BMP from START onwards. Return the total number
6007 of bits in BMP if no set bit is found at or after START. */
6010 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
6012 unsigned int nbits
= SBITMAP_SIZE (bmp
);
6016 gcc_assert (start
< nbits
);
6017 for (unsigned int i
= start
; i
< nbits
; i
++)
6018 if (bitmap_bit_p (bmp
, i
))
6024 /* Do the work for aarch64_emit_prologue_components and
6025 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6026 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6027 for these components or the epilogue sequence. That is, it determines
6028 whether we should emit stores or loads and what kind of CFA notes to attach
6029 to the insns. Otherwise the logic for the two sequences is very
6033 aarch64_process_components (sbitmap components
, bool prologue_p
)
6035 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
6036 ? HARD_FRAME_POINTER_REGNUM
6037 : STACK_POINTER_REGNUM
);
6039 unsigned last_regno
= SBITMAP_SIZE (components
);
6040 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
6041 rtx_insn
*insn
= NULL
;
6043 while (regno
!= last_regno
)
6045 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6046 so DFmode for the vector registers is enough. For simd functions
6047 we want to save the low 128 bits. */
6048 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
6050 rtx reg
= gen_rtx_REG (mode
, regno
);
6051 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
6052 if (!frame_pointer_needed
)
6053 offset
+= cfun
->machine
->frame
.frame_size
6054 - cfun
->machine
->frame
.hard_fp_offset
;
6055 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
6056 rtx mem
= gen_frame_mem (mode
, addr
);
6058 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
6059 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
6060 /* No more registers to handle after REGNO.
6061 Emit a single save/restore and exit. */
6062 if (regno2
== last_regno
)
6064 insn
= emit_insn (set
);
6065 RTX_FRAME_RELATED_P (insn
) = 1;
6067 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6069 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6073 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
6074 /* The next register is not of the same class or its offset is not
6075 mergeable with the current one into a pair. */
6076 if (!satisfies_constraint_Ump (mem
)
6077 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
6078 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
6079 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
6080 GET_MODE_SIZE (mode
)))
6082 insn
= emit_insn (set
);
6083 RTX_FRAME_RELATED_P (insn
) = 1;
6085 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
6087 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6093 /* REGNO2 can be saved/restored in a pair with REGNO. */
6094 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6095 if (!frame_pointer_needed
)
6096 offset2
+= cfun
->machine
->frame
.frame_size
6097 - cfun
->machine
->frame
.hard_fp_offset
;
6098 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
6099 rtx mem2
= gen_frame_mem (mode
, addr2
);
6100 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
6101 : gen_rtx_SET (reg2
, mem2
);
6104 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
6106 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
6108 RTX_FRAME_RELATED_P (insn
) = 1;
6111 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
6112 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
6116 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6117 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
6120 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
6124 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6127 aarch64_emit_prologue_components (sbitmap components
)
6129 aarch64_process_components (components
, true);
6132 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6135 aarch64_emit_epilogue_components (sbitmap components
)
6137 aarch64_process_components (components
, false);
6140 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6143 aarch64_set_handled_components (sbitmap components
)
6145 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6146 if (bitmap_bit_p (components
, regno
))
6147 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
6150 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
6151 determining the probe offset for alloca. */
6153 static HOST_WIDE_INT
6154 aarch64_stack_clash_protection_alloca_probe_range (void)
6156 return STACK_CLASH_CALLER_GUARD
;
6160 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6161 registers. If POLY_SIZE is not large enough to require a probe this function
6162 will only adjust the stack. When allocating the stack space
6163 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6164 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6165 arguments. If we are then we ensure that any allocation larger than the ABI
6166 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6169 We emit barriers after each stack adjustment to prevent optimizations from
6170 breaking the invariant that we never drop the stack more than a page. This
6171 invariant is needed to make it easier to correctly handle asynchronous
6172 events, e.g. if we were to allow the stack to be dropped by more than a page
6173 and then have multiple probes up and we take a signal somewhere in between
6174 then the signal handler doesn't know the state of the stack and can make no
6175 assumptions about which pages have been probed. */
6178 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
6179 poly_int64 poly_size
,
6180 bool frame_related_p
,
6181 bool final_adjustment_p
)
6183 HOST_WIDE_INT guard_size
6184 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6185 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6186 /* When doing the final adjustment for the outgoing argument size we can't
6187 assume that LR was saved at position 0. So subtract it's offset from the
6188 ABI safe buffer so that we don't accidentally allow an adjustment that
6189 would result in an allocation larger than the ABI buffer without
6191 HOST_WIDE_INT min_probe_threshold
6192 = final_adjustment_p
6193 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
6194 : guard_size
- guard_used_by_caller
;
6196 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
6198 /* We should always have a positive probe threshold. */
6199 gcc_assert (min_probe_threshold
> 0);
6201 if (flag_stack_clash_protection
&& !final_adjustment_p
)
6203 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6204 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6206 if (known_eq (frame_size
, 0))
6208 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
6210 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
6211 && known_lt (final_adjust
, guard_used_by_caller
))
6213 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
6217 /* If SIZE is not large enough to require probing, just adjust the stack and
6219 if (known_lt (poly_size
, min_probe_threshold
)
6220 || !flag_stack_clash_protection
)
6222 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
6227 /* Handle the SVE non-constant case first. */
6228 if (!poly_size
.is_constant (&size
))
6232 fprintf (dump_file
, "Stack clash SVE prologue: ");
6233 print_dec (poly_size
, dump_file
);
6234 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
6237 /* First calculate the amount of bytes we're actually spilling. */
6238 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
6239 poly_size
, temp1
, temp2
, false, true);
6241 rtx_insn
*insn
= get_last_insn ();
6243 if (frame_related_p
)
6245 /* This is done to provide unwinding information for the stack
6246 adjustments we're about to do, however to prevent the optimizers
6247 from removing the R11 move and leaving the CFA note (which would be
6248 very wrong) we tie the old and new stack pointer together.
6249 The tie will expand to nothing but the optimizers will not touch
6251 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6252 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
6253 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
6255 /* We want the CFA independent of the stack pointer for the
6256 duration of the loop. */
6257 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
6258 RTX_FRAME_RELATED_P (insn
) = 1;
6261 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
6262 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
6264 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
6265 stack_pointer_rtx
, temp1
,
6266 probe_const
, guard_const
));
6268 /* Now reset the CFA register if needed. */
6269 if (frame_related_p
)
6271 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6272 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
6273 gen_int_mode (poly_size
, Pmode
)));
6274 RTX_FRAME_RELATED_P (insn
) = 1;
6282 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6283 " bytes, probing will be required.\n", size
);
6285 /* Round size to the nearest multiple of guard_size, and calculate the
6286 residual as the difference between the original size and the rounded
6288 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
6289 HOST_WIDE_INT residual
= size
- rounded_size
;
6291 /* We can handle a small number of allocations/probes inline. Otherwise
6293 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
6295 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
6297 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
6298 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6299 guard_used_by_caller
));
6300 emit_insn (gen_blockage ());
6302 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
6306 /* Compute the ending address. */
6307 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
6308 temp1
, NULL
, false, true);
6309 rtx_insn
*insn
= get_last_insn ();
6311 /* For the initial allocation, we don't have a frame pointer
6312 set up, so we always need CFI notes. If we're doing the
6313 final allocation, then we may have a frame pointer, in which
6314 case it is the CFA, otherwise we need CFI notes.
6316 We can determine which allocation we are doing by looking at
6317 the value of FRAME_RELATED_P since the final allocations are not
6319 if (frame_related_p
)
6321 /* We want the CFA independent of the stack pointer for the
6322 duration of the loop. */
6323 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6324 plus_constant (Pmode
, temp1
, rounded_size
));
6325 RTX_FRAME_RELATED_P (insn
) = 1;
6328 /* This allocates and probes the stack. Note that this re-uses some of
6329 the existing Ada stack protection code. However we are guaranteed not
6330 to enter the non loop or residual branches of that code.
6332 The non-loop part won't be entered because if our allocation amount
6333 doesn't require a loop, the case above would handle it.
6335 The residual amount won't be entered because TEMP1 is a mutliple of
6336 the allocation size. The residual will always be 0. As such, the only
6337 part we are actually using from that code is the loop setup. The
6338 actual probing is done in aarch64_output_probe_stack_range. */
6339 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
6340 stack_pointer_rtx
, temp1
));
6342 /* Now reset the CFA register if needed. */
6343 if (frame_related_p
)
6345 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6346 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
6347 RTX_FRAME_RELATED_P (insn
) = 1;
6350 emit_insn (gen_blockage ());
6351 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
6354 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6355 be probed. This maintains the requirement that each page is probed at
6356 least once. For initial probing we probe only if the allocation is
6357 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6358 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6359 GUARD_SIZE. This works that for any allocation that is large enough to
6360 trigger a probe here, we'll have at least one, and if they're not large
6361 enough for this code to emit anything for them, The page would have been
6362 probed by the saving of FP/LR either by this function or any callees. If
6363 we don't have any callees then we won't have more stack adjustments and so
6367 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
6368 /* If we're doing final adjustments, and we've done any full page
6369 allocations then any residual needs to be probed. */
6370 if (final_adjustment_p
&& rounded_size
!= 0)
6371 min_probe_threshold
= 0;
6372 /* If doing a small final adjustment, we always probe at offset 0.
6373 This is done to avoid issues when LR is not at position 0 or when
6374 the final adjustment is smaller than the probing offset. */
6375 else if (final_adjustment_p
&& rounded_size
== 0)
6376 residual_probe_offset
= 0;
6378 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
6379 if (residual
>= min_probe_threshold
)
6383 "Stack clash AArch64 prologue residuals: "
6384 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
6387 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6388 residual_probe_offset
));
6389 emit_insn (gen_blockage ());
6394 /* Return 1 if the register is used by the epilogue. We need to say the
6395 return register is used, but only after epilogue generation is complete.
6396 Note that in the case of sibcalls, the values "used by the epilogue" are
6397 considered live at the start of the called function.
6399 For SIMD functions we need to return 1 for FP registers that are saved and
6400 restored by a function but are not zero in call_used_regs. If we do not do
6401 this optimizations may remove the restore of the register. */
6404 aarch64_epilogue_uses (int regno
)
6406 if (epilogue_completed
)
6408 if (regno
== LR_REGNUM
)
6414 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6415 is saved at BASE + OFFSET. */
6418 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
6419 rtx base
, poly_int64 offset
)
6421 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
6422 add_reg_note (insn
, REG_CFA_EXPRESSION
,
6423 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
6426 /* AArch64 stack frames generated by this compiler look like:
6428 +-------------------------------+
6430 | incoming stack arguments |
6432 +-------------------------------+
6433 | | <-- incoming stack pointer (aligned)
6434 | callee-allocated save area |
6435 | for register varargs |
6437 +-------------------------------+
6438 | local variables | <-- frame_pointer_rtx
6440 +-------------------------------+
6442 +-------------------------------+ |
6443 | callee-saved registers | | frame.saved_regs_size
6444 +-------------------------------+ |
6446 +-------------------------------+ |
6447 | FP' | / <- hard_frame_pointer_rtx (aligned)
6448 +-------------------------------+
6449 | dynamic allocation |
6450 +-------------------------------+
6452 +-------------------------------+
6453 | outgoing stack arguments | <-- arg_pointer
6455 +-------------------------------+
6456 | | <-- stack_pointer_rtx (aligned)
6458 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6459 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6462 By default for stack-clash we assume the guard is at least 64KB, but this
6463 value is configurable to either 4KB or 64KB. We also force the guard size to
6464 be the same as the probing interval and both values are kept in sync.
6466 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6467 on the guard size) of stack space without probing.
6469 When probing is needed, we emit a probe at the start of the prologue
6470 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6472 We have to track how much space has been allocated and the only stores
6473 to the stack we track as implicit probes are the FP/LR stores.
6475 For outgoing arguments we probe if the size is larger than 1KB, such that
6476 the ABI specified buffer is maintained for the next callee.
6478 The following registers are reserved during frame layout and should not be
6479 used for any other purpose:
6481 - r11: Used by stack clash protection when SVE is enabled.
6482 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6483 - r14 and r15: Used for speculation tracking.
6484 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6485 - r30(LR), r29(FP): Used by standard frame layout.
6487 These registers must be avoided in frame layout related code unless the
6488 explicit intention is to interact with one of the features listed above. */
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  /* All frame-layout decisions were made by aarch64_layout_frame;
     this function only emits the RTL that realizes them.  */
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	  case AARCH64_KEY_A:
	    insn = emit_insn (gen_paciasp ());
	    break;
	  case AARCH64_KEY_B:
	    insn = emit_insn (gen_pacibsp ());
	    break;
	  default:
	    gcc_unreachable ();
	}
      /* Tell the unwinder that the return address is mangled from here on.  */
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    /* frame_size may be a runtime poly_int64 with SVE; report the
       guaranteed lower bound for -fstack-usage.  */
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  /* Leaf functions only need probing if the frame is larger than
	     both the probe interval and the caller-protected region.  */
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  /* EP0/EP1 are reserved as scratch registers for stack adjustment
     (see the frame-layout comment above this function).  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  true, false);

  if (callee_adjust != 0)
    /* Combine the first callee-save store with the stack adjustment
       (STP with writeback).  */
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  reg_offset = callee_offset;
	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
	}
      /* Set up the frame pointer: FP = SP + callee_offset.  */
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, callee_offset,
			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       callee_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  reg_offset -= callee_offset;
	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
				      reg_offset + UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
				      reg_offset);
	}
      /* Scheduling barrier: stores through FP must not drift above the
	 frame-pointer setup.  */
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  if (crtl->abi->id () == ARM_PCS_SIMD)
    /* Vector PCS functions must preserve the full 128-bit Q registers.  */
    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			       callee_adjust != 0 || emit_frame_chain);
  else
    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			       callee_adjust != 0 || emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  !frame_pointer_needed, true);
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are need.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  /* Frame layout is only final after reload.  */
  if (!reload_completed)
    return false;

  /* Profiling inserts calls that need a frame.  */
  if (crtl->profile)
    return false;

  /* A completely empty frame needs no epilogue at all.  */
  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prolog sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */

void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;	/* Accumulates delayed CFA restore notes.  */
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;

  /* We can re-use the registers when the allocation amount is smaller than
     guard_size - guard_used_by_caller because we won't be doing any probes
     then.  In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ())
			&& (!flag_stack_clash_protection
			    || known_lt (initial_adjust,
					 guard_size - guard_used_by_caller));

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx, -callee_offset,
			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  if (crtl->abi->id () == ARM_PCS_SIMD)
    /* Vector PCS functions restore the full 128-bit Q registers.  */
    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				  callee_adjust != 0, &cfi_ops);
  else
    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				  callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
     add restriction on emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	   */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      switch (aarch64_ra_sign_key)
	{
	  case AARCH64_KEY_A:
	    insn = emit_insn (gen_autiasp ());
	    break;
	  case AARCH64_KEY_B:
	    insn = emit_insn (gen_autibsp ());
	    break;
	  default:
	    gcc_unreachable ();
	}
      /* Tell the unwinder the return address is no longer mangled.  */
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !for_sibcall)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  /* LR is always saved at FP + UNITS_PER_WORD in the (forced) frame
     record, so the handler address can be stored there.  */
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer maybe bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch64_bti_enabled ())
    /* The thunk can be reached via an indirect branch, so it needs a
       BTI landing pad.  */
    emit_insn (gen_bti_c());

  /* Emit RTL directly as if we were past reload; no prologue is needed.  */
  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    /* Simple thunk: just bump the this pointer by DELTA.  */
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  /* Small deltas can be folded into the vtable load via
	     pre-modify addressing; otherwise adjust this_rtx first.  */
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      /* Load the vtable pointer; zero-extend for ILP32 where pointers
	 are narrower than Pmode.  */
      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      /* Address the vcall slot; large offsets need to be materialized
	 in a register first.  */
      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      /* Apply the loaded virtual-base adjustment to the this pointer.  */
      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  /* Assemble the thunk body we just built.  */
  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
6928 aarch64_tls_referenced_p (rtx x
)
6930 if (!TARGET_HAVE_TLS
)
6932 subrtx_iterator::array_type array
;
6933 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6935 const_rtx x
= *iter
;
6936 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
6938 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6939 TLS offsets, not real symbol references. */
6940 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6941 iter
.skip_subrtxes ();
6947 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6948 a left shift of 0 or 12 bits. */
6950 aarch64_uimm12_shift (HOST_WIDE_INT val
)
6952 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
6953 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
6957 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6958 that can be created with a left shift of 0 or 12. */
6959 static HOST_WIDE_INT
6960 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
6962 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6963 handle correctly. */
6964 gcc_assert ((val
& 0xffffff) == val
);
6966 if (((val
& 0xfff) << 0) == val
)
6969 return val
& (0xfff << 12);
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      /* 64-bit MOVZ can also place the 16-bit pattern at bit 32 or 48.  */
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  /* All modes allow the 16-bit pattern at bit 0 or 16.  */
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
  /* Repeatedly double the pattern width until it fills 64 bits.  */
  while (size < 64)
    {
      /* Mask off bits beyond the current element width before copying.  */
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.
   Multiplying a repeating bit-pattern of the given width by the matching
   entry replicates it across a full 64-bit value.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,	/* width 32 */
    0x0001000100010001ull,	/* width 16 */
    0x0101010101010101ull,	/* width 8 */
    0x1111111111111111ull,	/* width 4 */
    0x5555555555555555ull,	/* width 2 */
  };
/* Return true if val is a valid bitmask immediate (i.e. can be encoded
   as the immediate operand of a logical instruction).  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  /* Adding the lowest set bit clears a single contiguous run of ones.  */
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
7067 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7068 Assumed precondition: VAL_IN Is not zero. */
7070 unsigned HOST_WIDE_INT
7071 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
7073 int lowest_bit_set
= ctz_hwi (val_in
);
7074 int highest_bit_set
= floor_log2 (val_in
);
7075 gcc_assert (val_in
!= 0);
7077 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
7078 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
7081 /* Create constant where bits outside of lowest bit set to highest bit set
7084 unsigned HOST_WIDE_INT
7085 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
7087 return val_in
| ~aarch64_and_split_imm1 (val_in
);
/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  /* A plain bitmask immediate needs no split.  */
  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  /* A single MOV is cheaper than a two-instruction split.  */
  if (aarch64_move_imm (val_in, int_mode))
    return false;

  /* Splitting is only useful if the second half is itself a valid
     bitmask immediate.  */
  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  /* MOVZ handles VAL directly; MOVN handles its complement.  */
  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  /* Otherwise a logical-immediate ORR with XZR can materialize it.  */
  return aarch64_bitmask_imm (val, int_mode);
}
/* Implement TARGET_CANNOT_FORCE_CONST_MEM: return true if X cannot
   (or should not) be spilled to the constant pool.  */
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      /* Symbols that can be addressed directly are better left as
	 immediates than forced into the pool.  */
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */
static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      /* A pseudo is always acceptable in non-strict mode.  */
      if (!strict_p)
	return true;

      /* In strict mode the pseudo must already have a hard register.  */
      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}
/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      /* A pseudo is always acceptable in non-strict mode.  */
      if (!strict_p)
	return true;

      /* In strict mode the pseudo must already have a hard register.  */
      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  /* In non-strict mode look through a SUBREG of something that can
     live in a general register.  */
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* Each branch below recognizes one RTL shape of an index operand and
     extracts the underlying index register, the scaling shift, and the
     extension kind (none / SXTW / UXTW).  */

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      /* An invalid extract width makes the shift check below fail.  */
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      /* The mask must match the scale for this to be a UXTW index.  */
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  /* In non-strict mode look through a SUBREG of a general register.  */
  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      /* SVE only allows an unextended index scaled by the element size.  */
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      /* Otherwise the shift must be zero or exactly log2 of the access
	 size (LSL #1..#3).  */
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
7403 /* Return true if MODE is one of the modes for which we
7404 support LDP/STP operations. */
7407 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
7409 return mode
== SImode
|| mode
== DImode
7410 || mode
== SFmode
|| mode
== DFmode
7411 || (aarch64_vector_mode_supported_p (mode
)
7412 && (known_eq (GET_MODE_SIZE (mode
), 8)
7413 || (known_eq (GET_MODE_SIZE (mode
), 16)
7414 && (aarch64_tune_params
.extra_tuning_flags
7415 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
7418 /* Return true if REGNO is a virtual pointer register, or an eliminable
7419 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7420 include stack_pointer or hard_frame_pointer. */
7422 virt_or_elim_regno_p (unsigned regno
)
7424 return ((regno
>= FIRST_VIRTUAL_REGISTER
7425 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
7426 || regno
== FRAME_POINTER_REGNUM
7427 || regno
== ARG_POINTER_REGNUM
);
7430 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7431 If it is, fill in INFO appropriately. STRICT_P is true if
7432 REG_OK_STRICT is in effect. */
7435 aarch64_classify_address (struct aarch64_address_info
*info
,
7436 rtx x
, machine_mode mode
, bool strict_p
,
7437 aarch64_addr_query_type type
)
7439 enum rtx_code code
= GET_CODE (x
);
7443 HOST_WIDE_INT const_size
;
7445 /* On BE, we use load/store pair for all large int mode load/stores.
7446 TI/TFmode may also use a load/store pair. */
7447 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7448 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7449 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7450 || type
== ADDR_QUERY_LDP_STP_N
7453 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7455 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7456 corresponds to the actual size of the memory being loaded/stored and the
7457 mode of the corresponding addressing mode is half of that. */
7458 if (type
== ADDR_QUERY_LDP_STP_N
7459 && known_eq (GET_MODE_SIZE (mode
), 16))
7462 bool allow_reg_index_p
= (!load_store_pair_p
7463 && (known_lt (GET_MODE_SIZE (mode
), 16)
7464 || vec_flags
== VEC_ADVSIMD
7465 || vec_flags
& VEC_SVE_DATA
));
7467 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7468 [Rn, #offset, MUL VL]. */
7469 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7470 && (code
!= REG
&& code
!= PLUS
))
7473 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7475 if (advsimd_struct_p
7476 && !BYTES_BIG_ENDIAN
7477 && (code
!= POST_INC
&& code
!= REG
))
7480 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7481 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7487 info
->type
= ADDRESS_REG_IMM
;
7489 info
->offset
= const0_rtx
;
7490 info
->const_offset
= 0;
7491 return aarch64_base_register_rtx_p (x
, strict_p
);
7499 && virt_or_elim_regno_p (REGNO (op0
))
7500 && poly_int_rtx_p (op1
, &offset
))
7502 info
->type
= ADDRESS_REG_IMM
;
7505 info
->const_offset
= offset
;
7510 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7511 && aarch64_base_register_rtx_p (op0
, strict_p
)
7512 && poly_int_rtx_p (op1
, &offset
))
7514 info
->type
= ADDRESS_REG_IMM
;
7517 info
->const_offset
= offset
;
7519 /* TImode and TFmode values are allowed in both pairs of X
7520 registers and individual Q registers. The available
7522 X,X: 7-bit signed scaled offset
7523 Q: 9-bit signed offset
7524 We conservatively require an offset representable in either mode.
7525 When performing the check for pairs of X registers i.e. LDP/STP
7526 pass down DImode since that is the natural size of the LDP/STP
7527 instruction memory accesses. */
7528 if (mode
== TImode
|| mode
== TFmode
)
7529 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7530 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7531 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7533 /* A 7bit offset check because OImode will emit a ldp/stp
7534 instruction (only big endian will get here).
7535 For ldp/stp instructions, the offset is scaled for the size of a
7536 single element of the pair. */
7538 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7540 /* Three 9/12 bit offsets checks because CImode will emit three
7541 ldr/str instructions (only big endian will get here). */
7543 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7544 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7546 || offset_12bit_unsigned_scaled_p (V16QImode
,
7549 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7550 instructions (only big endian will get here). */
7552 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7553 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7556 /* Make "m" use the LD1 offset range for SVE data modes, so
7557 that pre-RTL optimizers like ivopts will work to that
7558 instead of the wider LDR/STR range. */
7559 if (vec_flags
== VEC_SVE_DATA
)
7560 return (type
== ADDR_QUERY_M
7561 ? offset_4bit_signed_scaled_p (mode
, offset
)
7562 : offset_9bit_signed_scaled_p (mode
, offset
));
7564 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7566 poly_int64 end_offset
= (offset
7567 + GET_MODE_SIZE (mode
)
7568 - BYTES_PER_SVE_VECTOR
);
7569 return (type
== ADDR_QUERY_M
7570 ? offset_4bit_signed_scaled_p (mode
, offset
)
7571 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7572 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7576 if (vec_flags
== VEC_SVE_PRED
)
7577 return offset_9bit_signed_scaled_p (mode
, offset
);
7579 if (load_store_pair_p
)
7580 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7581 || known_eq (GET_MODE_SIZE (mode
), 8)
7582 || known_eq (GET_MODE_SIZE (mode
), 16))
7583 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7585 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7586 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7589 if (allow_reg_index_p
)
7591 /* Look for base + (scaled/extended) index register. */
7592 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7593 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7598 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7599 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7612 info
->type
= ADDRESS_REG_WB
;
7613 info
->base
= XEXP (x
, 0);
7614 info
->offset
= NULL_RTX
;
7615 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7619 info
->type
= ADDRESS_REG_WB
;
7620 info
->base
= XEXP (x
, 0);
7621 if (GET_CODE (XEXP (x
, 1)) == PLUS
7622 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7623 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7624 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7626 info
->offset
= XEXP (XEXP (x
, 1), 1);
7627 info
->const_offset
= offset
;
7629 /* TImode and TFmode values are allowed in both pairs of X
7630 registers and individual Q registers. The available
7632 X,X: 7-bit signed scaled offset
7633 Q: 9-bit signed offset
7634 We conservatively require an offset representable in either mode.
7636 if (mode
== TImode
|| mode
== TFmode
)
7637 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7638 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7640 if (load_store_pair_p
)
7641 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7642 || known_eq (GET_MODE_SIZE (mode
), 8)
7643 || known_eq (GET_MODE_SIZE (mode
), 16))
7644 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7646 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7653 /* load literal: pc-relative constant pool entry. Only supported
7654 for SI mode or larger. */
7655 info
->type
= ADDRESS_SYMBOLIC
;
7657 if (!load_store_pair_p
7658 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7663 split_const (x
, &sym
, &addend
);
7664 return ((GET_CODE (sym
) == LABEL_REF
7665 || (GET_CODE (sym
) == SYMBOL_REF
7666 && CONSTANT_POOL_ADDRESS_P (sym
)
7667 && aarch64_pcrelative_literal_loads
)));
7672 info
->type
= ADDRESS_LO_SUM
;
7673 info
->base
= XEXP (x
, 0);
7674 info
->offset
= XEXP (x
, 1);
7675 if (allow_reg_index_p
7676 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7679 split_const (info
->offset
, &sym
, &offs
);
7680 if (GET_CODE (sym
) == SYMBOL_REF
7681 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
7682 == SYMBOL_SMALL_ABSOLUTE
))
7684 /* The symbol and offset must be aligned to the access size. */
7687 if (CONSTANT_POOL_ADDRESS_P (sym
))
7688 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
7689 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
7691 tree exp
= SYMBOL_REF_DECL (sym
);
7692 align
= TYPE_ALIGN (TREE_TYPE (exp
));
7693 align
= aarch64_constant_alignment (exp
, align
);
7695 else if (SYMBOL_REF_DECL (sym
))
7696 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
7697 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
7698 && SYMBOL_REF_BLOCK (sym
) != NULL
)
7699 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
7701 align
= BITS_PER_UNIT
;
7703 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
7704 if (known_eq (ref_size
, 0))
7705 ref_size
= GET_MODE_SIZE (DImode
);
7707 return (multiple_p (INTVAL (offs
), ref_size
)
7708 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
7718 /* Return true if the address X is valid for a PRFM instruction.
7719 STRICT_P is true if we should do strict checking with
7720 aarch64_classify_address. */
7723 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
7725 struct aarch64_address_info addr
;
7727 /* PRFM accepts the same addresses as DImode... */
7728 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
7732 /* ... except writeback forms. */
7733 return addr
.type
!= ADDRESS_REG_WB
;
7737 aarch64_symbolic_address_p (rtx x
)
7741 split_const (x
, &x
, &offset
);
7742 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
7745 /* Classify the base of symbolic expression X. */
7747 enum aarch64_symbol_type
7748 aarch64_classify_symbolic_expression (rtx x
)
7752 split_const (x
, &x
, &offset
);
7753 return aarch64_classify_symbol (x
, INTVAL (offset
));
7757 /* Return TRUE if X is a legitimate address for accessing memory in
7760 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
7762 struct aarch64_address_info addr
;
7764 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
7767 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7768 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7770 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
7771 aarch64_addr_query_type type
)
7773 struct aarch64_address_info addr
;
7775 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
7778 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7781 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
7782 poly_int64 orig_offset
,
7786 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7788 HOST_WIDE_INT const_offset
, second_offset
;
7790 /* A general SVE offset is A * VQ + B. Remove the A component from
7791 coefficient 0 in order to get the constant B. */
7792 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
7794 /* Split an out-of-range address displacement into a base and
7795 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7796 range otherwise to increase opportunities for sharing the base
7797 address of different sizes. Unaligned accesses use the signed
7798 9-bit range, TImode/TFmode use the intersection of signed
7799 scaled 7-bit and signed 9-bit offset. */
7800 if (mode
== TImode
|| mode
== TFmode
)
7801 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
7802 else if ((const_offset
& (size
- 1)) != 0)
7803 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
7805 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
7807 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
7810 /* Split the offset into second_offset and the rest. */
7811 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7812 *offset2
= gen_int_mode (second_offset
, Pmode
);
7817 /* Get the mode we should use as the basis of the range. For structure
7818 modes this is the mode of one vector. */
7819 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7820 machine_mode step_mode
7821 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
7823 /* Get the "mul vl" multiplier we'd like to use. */
7824 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
7825 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
7826 if (vec_flags
& VEC_SVE_DATA
)
7827 /* LDR supports a 9-bit range, but the move patterns for
7828 structure modes require all vectors to be in range of the
7829 same base. The simplest way of accomodating that while still
7830 promoting reuse of anchor points between different modes is
7831 to use an 8-bit range unconditionally. */
7832 vnum
= ((vnum
+ 128) & 255) - 128;
7834 /* Predicates are only handled singly, so we might as well use
7836 vnum
= ((vnum
+ 256) & 511) - 256;
7840 /* Convert the "mul vl" multiplier into a byte offset. */
7841 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7842 if (known_eq (second_offset
, orig_offset
))
7845 /* Split the offset into second_offset and the rest. */
7846 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7847 *offset2
= gen_int_mode (second_offset
, Pmode
);
7852 /* Return the binary representation of floating point constant VALUE in INTVAL.
7853 If the value cannot be converted, return false without setting INTVAL.
7854 The conversion is done in the given MODE. */
7856 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7859 /* We make a general exception for 0. */
7860 if (aarch64_float_const_zero_rtx_p (value
))
7866 scalar_float_mode mode
;
7867 if (GET_CODE (value
) != CONST_DOUBLE
7868 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7869 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7870 /* Only support up to DF mode. */
7871 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7874 unsigned HOST_WIDE_INT ival
= 0;
7877 real_to_target (res
,
7878 CONST_DOUBLE_REAL_VALUE (value
),
7879 REAL_MODE_FORMAT (mode
));
7883 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7884 ival
= zext_hwi (res
[order
], 32);
7885 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7888 ival
= zext_hwi (res
[0], 32);
7894 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7895 single MOV(+MOVK) followed by an FMOV. */
7897 aarch64_float_const_rtx_p (rtx x
)
7899 machine_mode mode
= GET_MODE (x
);
7900 if (mode
== VOIDmode
)
7903 /* Determine whether it's cheaper to write float constants as
7904 mov/movk pairs over ldr/adrp pairs. */
7905 unsigned HOST_WIDE_INT ival
;
7907 if (GET_CODE (x
) == CONST_DOUBLE
7908 && SCALAR_FLOAT_MODE_P (mode
)
7909 && aarch64_reinterpret_float_as_int (x
, &ival
))
7911 scalar_int_mode imode
= (mode
== HFmode
7913 : int_mode_for_mode (mode
).require ());
7914 int num_instr
= aarch64_internal_mov_immediate
7915 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7916 return num_instr
< 3;
7922 /* Return TRUE if rtx X is immediate constant 0.0 */
7924 aarch64_float_const_zero_rtx_p (rtx x
)
7926 if (GET_MODE (x
) == VOIDmode
)
7929 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
7930 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
7931 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
7934 /* Return TRUE if rtx X is immediate constant that fits in a single
7935 MOVI immediate operation. */
7937 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7943 scalar_int_mode imode
;
7944 unsigned HOST_WIDE_INT ival
;
7946 if (GET_CODE (x
) == CONST_DOUBLE
7947 && SCALAR_FLOAT_MODE_P (mode
))
7949 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7952 /* We make a general exception for 0. */
7953 if (aarch64_float_const_zero_rtx_p (x
))
7956 imode
= int_mode_for_mode (mode
).require ();
7958 else if (GET_CODE (x
) == CONST_INT
7959 && is_a
<scalar_int_mode
> (mode
, &imode
))
7964 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7965 a 128 bit vector mode. */
7966 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7968 vmode
= aarch64_simd_container_mode (imode
, width
);
7969 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7971 return aarch64_simd_valid_immediate (v_op
, NULL
);
7975 /* Return the fixed registers used for condition codes. */
7978 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7981 *p2
= INVALID_REGNUM
;
7985 /* This function is used by the call expanders of the machine description.
7986 RESULT is the register in which the result is returned. It's NULL for
7987 "call" and "sibcall".
7988 MEM is the location of the function call.
7989 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
7990 SIBCALL indicates whether this function call is normal call or sibling call.
7991 It will generate different pattern accordingly. */
7994 aarch64_expand_call (rtx result
, rtx mem
, rtx callee_abi
, bool sibcall
)
7996 rtx call
, callee
, tmp
;
8000 gcc_assert (MEM_P (mem
));
8001 callee
= XEXP (mem
, 0);
8002 mode
= GET_MODE (callee
);
8003 gcc_assert (mode
== Pmode
);
8005 /* Decide if we should generate indirect calls by loading the
8006 address of the callee into a register before performing
8007 the branch-and-link. */
8008 if (SYMBOL_REF_P (callee
)
8009 ? (aarch64_is_long_call_p (callee
)
8010 || aarch64_is_noplt_call_p (callee
))
8012 XEXP (mem
, 0) = force_reg (mode
, callee
);
8014 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
8016 if (result
!= NULL_RTX
)
8017 call
= gen_rtx_SET (result
, call
);
8022 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
8024 gcc_assert (CONST_INT_P (callee_abi
));
8025 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
8028 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
8029 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
8031 aarch64_emit_call_insn (call
);
8034 /* Emit call insn with PAT and do aarch64-specific handling. */
8037 aarch64_emit_call_insn (rtx pat
)
8039 rtx insn
= emit_call_insn (pat
);
8041 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
8042 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
8043 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
8047 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
8049 machine_mode mode_x
= GET_MODE (x
);
8050 rtx_code code_x
= GET_CODE (x
);
8052 /* All floating point compares return CCFP if it is an equality
8053 comparison, and CCFPE otherwise. */
8054 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
8081 /* Equality comparisons of short modes against zero can be performed
8082 using the TST instruction with the appropriate bitmask. */
8083 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
8084 && (code
== EQ
|| code
== NE
)
8085 && (mode_x
== HImode
|| mode_x
== QImode
))
8088 /* Similarly, comparisons of zero_extends from shorter modes can
8089 be performed using an ANDS with an immediate mask. */
8090 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
8091 && (mode_x
== SImode
|| mode_x
== DImode
)
8092 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
8093 && (code
== EQ
|| code
== NE
))
8096 if ((mode_x
== SImode
|| mode_x
== DImode
)
8098 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
8099 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
8101 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
8102 && CONST_INT_P (XEXP (x
, 2)))))
8105 /* A compare with a shifted operand. Because of canonicalization,
8106 the comparison will have to be swapped when we emit the assembly
8108 if ((mode_x
== SImode
|| mode_x
== DImode
)
8109 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
8110 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
8111 || code_x
== LSHIFTRT
8112 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
8115 /* Similarly for a negated operand, but we can only do this for
8117 if ((mode_x
== SImode
|| mode_x
== DImode
)
8118 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
8119 && (code
== EQ
|| code
== NE
)
8123 /* A test for unsigned overflow from an addition. */
8124 if ((mode_x
== DImode
|| mode_x
== TImode
)
8125 && (code
== LTU
|| code
== GEU
)
8127 && rtx_equal_p (XEXP (x
, 0), y
))
8130 /* A test for unsigned overflow from an add with carry. */
8131 if ((mode_x
== DImode
|| mode_x
== TImode
)
8132 && (code
== LTU
|| code
== GEU
)
8134 && CONST_SCALAR_INT_P (y
)
8135 && (rtx_mode_t (y
, mode_x
)
8136 == (wi::shwi (1, mode_x
)
8137 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
8140 /* A test for signed overflow. */
8141 if ((mode_x
== DImode
|| mode_x
== TImode
)
8144 && GET_CODE (y
) == SIGN_EXTEND
)
8147 /* For everything else, return CCmode. */
8152 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
8155 aarch64_get_condition_code (rtx x
)
8157 machine_mode mode
= GET_MODE (XEXP (x
, 0));
8158 enum rtx_code comp_code
= GET_CODE (x
);
8160 if (GET_MODE_CLASS (mode
) != MODE_CC
)
8161 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
8162 return aarch64_get_condition_code_1 (mode
, comp_code
);
8166 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
8174 case GE
: return AARCH64_GE
;
8175 case GT
: return AARCH64_GT
;
8176 case LE
: return AARCH64_LS
;
8177 case LT
: return AARCH64_MI
;
8178 case NE
: return AARCH64_NE
;
8179 case EQ
: return AARCH64_EQ
;
8180 case ORDERED
: return AARCH64_VC
;
8181 case UNORDERED
: return AARCH64_VS
;
8182 case UNLT
: return AARCH64_LT
;
8183 case UNLE
: return AARCH64_LE
;
8184 case UNGT
: return AARCH64_HI
;
8185 case UNGE
: return AARCH64_PL
;
8193 case NE
: return AARCH64_NE
;
8194 case EQ
: return AARCH64_EQ
;
8195 case GE
: return AARCH64_GE
;
8196 case GT
: return AARCH64_GT
;
8197 case LE
: return AARCH64_LE
;
8198 case LT
: return AARCH64_LT
;
8199 case GEU
: return AARCH64_CS
;
8200 case GTU
: return AARCH64_HI
;
8201 case LEU
: return AARCH64_LS
;
8202 case LTU
: return AARCH64_CC
;
8210 case NE
: return AARCH64_NE
;
8211 case EQ
: return AARCH64_EQ
;
8212 case GE
: return AARCH64_LE
;
8213 case GT
: return AARCH64_LT
;
8214 case LE
: return AARCH64_GE
;
8215 case LT
: return AARCH64_GT
;
8216 case GEU
: return AARCH64_LS
;
8217 case GTU
: return AARCH64_CC
;
8218 case LEU
: return AARCH64_CS
;
8219 case LTU
: return AARCH64_HI
;
8227 case NE
: return AARCH64_NE
; /* = any */
8228 case EQ
: return AARCH64_EQ
; /* = none */
8229 case GE
: return AARCH64_PL
; /* = nfrst */
8230 case LT
: return AARCH64_MI
; /* = first */
8231 case GEU
: return AARCH64_CS
; /* = nlast */
8232 case GTU
: return AARCH64_HI
; /* = pmore */
8233 case LEU
: return AARCH64_LS
; /* = plast */
8234 case LTU
: return AARCH64_CC
; /* = last */
8242 case NE
: return AARCH64_NE
;
8243 case EQ
: return AARCH64_EQ
;
8244 case GE
: return AARCH64_PL
;
8245 case LT
: return AARCH64_MI
;
8253 case NE
: return AARCH64_NE
;
8254 case EQ
: return AARCH64_EQ
;
8262 case LTU
: return AARCH64_CS
;
8263 case GEU
: return AARCH64_CC
;
8271 case GEU
: return AARCH64_CS
;
8272 case LTU
: return AARCH64_CC
;
8280 case NE
: return AARCH64_VS
;
8281 case EQ
: return AARCH64_VC
;
8294 aarch64_const_vec_all_same_in_range_p (rtx x
,
8295 HOST_WIDE_INT minval
,
8296 HOST_WIDE_INT maxval
)
8299 return (const_vec_duplicate_p (x
, &elt
)
8300 && CONST_INT_P (elt
)
8301 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
8305 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
8307 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
8310 /* Return true if VEC is a constant in which every element is in the range
8311 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8314 aarch64_const_vec_all_in_range_p (rtx vec
,
8315 HOST_WIDE_INT minval
,
8316 HOST_WIDE_INT maxval
)
8318 if (GET_CODE (vec
) != CONST_VECTOR
8319 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
8323 if (!CONST_VECTOR_STEPPED_P (vec
))
8324 nunits
= const_vector_encoded_nelts (vec
);
8325 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
8328 for (int i
= 0; i
< nunits
; i
++)
8330 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
8331 if (!CONST_INT_P (vec_elem
)
8332 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
/* Bit positions of the N/Z/C/V flags in a ccmp immediate.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.
   Each entry is the flag setting under which the *inverse* condition
   holds, as required by the conditional-compare instructions.
   NOTE(review): the trailing AL/NV entries are reconstructed — verify
   against upstream aarch64.c.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
8365 /* Print floating-point vector immediate operand X to F, negating it
8366 first if NEGATE is true. Return true on success, false if it isn't
8367 a constant we can handle. */
8370 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
8374 if (!const_vec_duplicate_p (x
, &elt
))
8377 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
8379 r
= real_value_negate (&r
);
8381 /* Handle the SVE single-bit immediates specially, since they have a
8382 fixed form in the assembly syntax. */
8383 if (real_equal (&r
, &dconst0
))
8384 asm_fprintf (f
, "0.0");
8385 else if (real_equal (&r
, &dconst2
))
8386 asm_fprintf (f
, "2.0");
8387 else if (real_equal (&r
, &dconst1
))
8388 asm_fprintf (f
, "1.0");
8389 else if (real_equal (&r
, &dconsthalf
))
8390 asm_fprintf (f
, "0.5");
8393 const int buf_size
= 20;
8394 char float_buf
[buf_size
] = {'\0'};
8395 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
8397 asm_fprintf (f
, "%s", float_buf
);
/* Return the equivalent FP/SIMD register-suffix letter for an element
   SIZE in bits (64->d, 32->s, 16->h, 8->b).  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
8417 /* Print operand X to file F in a target specific manner according to CODE.
8418 The acceptable formatting commands given by CODE are:
8419 'c': An integer or symbol address without a preceding #
8421 'C': Take the duplicated element in a vector constant
8422 and print it in hex.
8423 'D': Take the duplicated element in a vector constant
8424 and print it as an unsigned integer, in decimal.
8425 'e': Print the sign/zero-extend size as a character 8->b,
8426 16->h, 32->w. Can also be used for masks:
8427 0xff->b, 0xffff->h, 0xffffffff->w.
8428 'I': If the operand is a duplicated vector constant,
8429 replace it with the duplicated scalar. If the
8430 operand is then a floating-point constant, replace
8431 it with the integer bit representation. Print the
8432 transformed constant as a signed decimal number.
8433 'p': Prints N such that 2^N == X (X must be power of 2 and
8435 'P': Print the number of non-zero bits in X (a const_int).
8436 'H': Print the higher numbered register of a pair (TImode)
8438 'm': Print a condition (eq, ne, etc).
8439 'M': Same as 'm', but invert condition.
8440 'N': Take the duplicated element in a vector constant
8441 and print the negative of it in decimal.
8442 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8443 'S/T/U/V': Print a FP/SIMD register name for a register list.
8444 The register printed is the FP/SIMD register name
8445 of X + 0/1/2/3 for S/T/U/V.
8446 'R': Print a scalar Integer/FP/SIMD register name + 1.
8447 'X': Print bottom 16 bits of integer constant in hex.
8448 'w/x': Print a general register name or the zero register
8450 '0': Print a normal operand, if it's a general register,
8451 then we assume DImode.
8452 'k': Print NZCV for conditional compare instructions.
8453 'A': Output address constant representing the first
8454 argument of X, specifying a relocation offset
8456 'L': Output constant address specified by X
8457 with a relocation offset if appropriate.
8458 'G': Prints address of X, specifying a PC relative
8459 relocation mode if appropriate.
8460 'y': Output address of LDP or STP - this is used for
8461 some LDP/STPs which don't use a PARALLEL in their
8462 pattern (so the mode needs to be adjusted).
8463 'z': Output address of a typical LDP or STP. */
8466 aarch64_print_operand (FILE *f
, rtx x
, int code
)
8472 switch (GET_CODE (x
))
8475 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
8479 output_addr_const (f
, x
);
8483 if (GET_CODE (XEXP (x
, 0)) == PLUS
8484 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
8486 output_addr_const (f
, x
);
8492 output_operand_lossage ("unsupported operand for code '%c'", code
);
8498 x
= unwrap_const_vec_duplicate (x
);
8499 if (!CONST_INT_P (x
))
8501 output_operand_lossage ("invalid operand for '%%%c'", code
);
8505 HOST_WIDE_INT val
= INTVAL (x
);
8506 if ((val
& ~7) == 8 || val
== 0xff)
8508 else if ((val
& ~7) == 16 || val
== 0xffff)
8510 else if ((val
& ~7) == 32 || val
== 0xffffffff)
8514 output_operand_lossage ("invalid operand for '%%%c'", code
);
8524 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
8526 output_operand_lossage ("invalid operand for '%%%c'", code
);
8530 asm_fprintf (f
, "%d", n
);
8535 if (!CONST_INT_P (x
))
8537 output_operand_lossage ("invalid operand for '%%%c'", code
);
8541 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
8545 if (x
== const0_rtx
)
8547 asm_fprintf (f
, "xzr");
8551 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
8553 output_operand_lossage ("invalid operand for '%%%c'", code
);
8557 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
8562 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
8563 if (CONST_INT_P (x
))
8564 asm_fprintf (f
, "%wd", INTVAL (x
));
8567 output_operand_lossage ("invalid operand for '%%%c'", code
);
8577 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8578 if (x
== const_true_rtx
)
8585 if (!COMPARISON_P (x
))
8587 output_operand_lossage ("invalid operand for '%%%c'", code
);
8591 cond_code
= aarch64_get_condition_code (x
);
8592 gcc_assert (cond_code
>= 0);
8594 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
8595 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
8596 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
8598 fputs (aarch64_condition_codes
[cond_code
], f
);
8603 if (!const_vec_duplicate_p (x
, &elt
))
8605 output_operand_lossage ("invalid vector constant");
8609 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8610 asm_fprintf (f
, "%wd", -INTVAL (elt
));
8611 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8612 && aarch64_print_vector_float_operand (f
, x
, true))
8616 output_operand_lossage ("invalid vector constant");
8626 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8631 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
8638 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8640 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8643 asm_fprintf (f
, "%c%d",
8644 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
8645 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
8649 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
8650 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
8651 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8652 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
8654 output_operand_lossage ("incompatible register operand for '%%%c'",
8659 if (!CONST_INT_P (x
))
8661 output_operand_lossage ("invalid operand for '%%%c'", code
);
8664 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
8669 /* Print a replicated constant in hex. */
8670 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8672 output_operand_lossage ("invalid operand for '%%%c'", code
);
8675 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8676 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8682 /* Print a replicated constant in decimal, treating it as
8684 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8686 output_operand_lossage ("invalid operand for '%%%c'", code
);
8689 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8690 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8697 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
8699 asm_fprintf (f
, "%czr", code
);
8703 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8705 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
8709 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
8711 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
8720 output_operand_lossage ("missing operand");
8724 switch (GET_CODE (x
))
8727 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
8729 if (REG_NREGS (x
) == 1)
8730 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
8734 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
8735 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
8736 REGNO (x
) - V0_REGNUM
, suffix
,
8737 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
8741 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
8745 output_address (GET_MODE (x
), XEXP (x
, 0));
8750 output_addr_const (asm_out_file
, x
);
8754 asm_fprintf (f
, "%wd", INTVAL (x
));
8758 if (!VECTOR_MODE_P (GET_MODE (x
)))
8760 output_addr_const (asm_out_file
, x
);
8766 if (!const_vec_duplicate_p (x
, &elt
))
8768 output_operand_lossage ("invalid vector constant");
8772 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8773 asm_fprintf (f
, "%wd", INTVAL (elt
));
8774 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8775 && aarch64_print_vector_float_operand (f
, x
, false))
8779 output_operand_lossage ("invalid vector constant");
8785 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8786 be getting CONST_DOUBLEs holding integers. */
8787 gcc_assert (GET_MODE (x
) != VOIDmode
);
8788 if (aarch64_float_const_zero_rtx_p (x
))
8793 else if (aarch64_float_const_representable_p (x
))
8796 char float_buf
[buf_size
] = {'\0'};
8797 real_to_decimal_for_mode (float_buf
,
8798 CONST_DOUBLE_REAL_VALUE (x
),
8801 asm_fprintf (asm_out_file
, "%s", float_buf
);
8805 output_operand_lossage ("invalid constant");
8808 output_operand_lossage ("invalid operand");
8814 if (GET_CODE (x
) == HIGH
)
8817 switch (aarch64_classify_symbolic_expression (x
))
8819 case SYMBOL_SMALL_GOT_4G
:
8820 asm_fprintf (asm_out_file
, ":got:");
8823 case SYMBOL_SMALL_TLSGD
:
8824 asm_fprintf (asm_out_file
, ":tlsgd:");
8827 case SYMBOL_SMALL_TLSDESC
:
8828 asm_fprintf (asm_out_file
, ":tlsdesc:");
8831 case SYMBOL_SMALL_TLSIE
:
8832 asm_fprintf (asm_out_file
, ":gottprel:");
8835 case SYMBOL_TLSLE24
:
8836 asm_fprintf (asm_out_file
, ":tprel:");
8839 case SYMBOL_TINY_GOT
:
8846 output_addr_const (asm_out_file
, x
);
8850 switch (aarch64_classify_symbolic_expression (x
))
8852 case SYMBOL_SMALL_GOT_4G
:
8853 asm_fprintf (asm_out_file
, ":lo12:");
8856 case SYMBOL_SMALL_TLSGD
:
8857 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
8860 case SYMBOL_SMALL_TLSDESC
:
8861 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
8864 case SYMBOL_SMALL_TLSIE
:
8865 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
8868 case SYMBOL_TLSLE12
:
8869 asm_fprintf (asm_out_file
, ":tprel_lo12:");
8872 case SYMBOL_TLSLE24
:
8873 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
8876 case SYMBOL_TINY_GOT
:
8877 asm_fprintf (asm_out_file
, ":got:");
8880 case SYMBOL_TINY_TLSIE
:
8881 asm_fprintf (asm_out_file
, ":gottprel:");
8887 output_addr_const (asm_out_file
, x
);
8891 switch (aarch64_classify_symbolic_expression (x
))
8893 case SYMBOL_TLSLE24
:
8894 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8899 output_addr_const (asm_out_file
, x
);
8904 HOST_WIDE_INT cond_code
;
8906 if (!CONST_INT_P (x
))
8908 output_operand_lossage ("invalid operand for '%%%c'", code
);
8912 cond_code
= INTVAL (x
);
8913 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8914 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8921 machine_mode mode
= GET_MODE (x
);
8923 if (GET_CODE (x
) != MEM
8924 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8926 output_operand_lossage ("invalid operand for '%%%c'", code
);
8930 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8932 ? ADDR_QUERY_LDP_STP_N
8933 : ADDR_QUERY_LDP_STP
))
8934 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8939 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8944 /* Print address 'x' of a memory access with mode 'mode'.
8945 'op' is the context required by aarch64_classify_address. It can either be
8946 MEM for a normal memory access or PARALLEL for LDP/STP. */
8948 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8949 aarch64_addr_query_type type
)
8951 struct aarch64_address_info addr
;
8954 /* Check all addresses are Pmode - including ILP32. */
8955 if (GET_MODE (x
) != Pmode
8956 && (!CONST_INT_P (x
)
8957 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8959 output_operand_lossage ("invalid address mode");
8963 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8966 case ADDRESS_REG_IMM
:
8967 if (known_eq (addr
.const_offset
, 0))
8968 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8969 else if (aarch64_sve_data_mode_p (mode
))
8972 = exact_div (addr
.const_offset
,
8973 BYTES_PER_SVE_VECTOR
).to_constant ();
8974 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8975 reg_names
[REGNO (addr
.base
)], vnum
);
8977 else if (aarch64_sve_pred_mode_p (mode
))
8980 = exact_div (addr
.const_offset
,
8981 BYTES_PER_SVE_PRED
).to_constant ();
8982 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8983 reg_names
[REGNO (addr
.base
)], vnum
);
8986 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
8987 INTVAL (addr
.offset
));
8990 case ADDRESS_REG_REG
:
8991 if (addr
.shift
== 0)
8992 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
8993 reg_names
[REGNO (addr
.offset
)]);
8995 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
8996 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
8999 case ADDRESS_REG_UXTW
:
9000 if (addr
.shift
== 0)
9001 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
9002 REGNO (addr
.offset
) - R0_REGNUM
);
9004 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
9005 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9008 case ADDRESS_REG_SXTW
:
9009 if (addr
.shift
== 0)
9010 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
9011 REGNO (addr
.offset
) - R0_REGNUM
);
9013 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
9014 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9017 case ADDRESS_REG_WB
:
9018 /* Writeback is only supported for fixed-width modes. */
9019 size
= GET_MODE_SIZE (mode
).to_constant ();
9020 switch (GET_CODE (x
))
9023 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
9026 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
9029 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
9032 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
9035 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
9036 INTVAL (addr
.offset
));
9039 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
9040 INTVAL (addr
.offset
));
9047 case ADDRESS_LO_SUM
:
9048 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
9049 output_addr_const (f
, addr
.offset
);
9050 asm_fprintf (f
, "]");
9053 case ADDRESS_SYMBOLIC
:
9054 output_addr_const (f
, x
);
9061 /* Print address 'x' of a memory access with mode 'mode'. */
9063 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
9065 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
9066 output_addr_const (f
, x
);
9070 aarch64_label_mentioned_p (rtx x
)
9075 if (GET_CODE (x
) == LABEL_REF
)
9078 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9079 referencing instruction, but they are constant offsets, not
9081 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
9084 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
9085 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
9091 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
9092 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
9095 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
9102 /* Implement REGNO_REG_CLASS. */
9105 aarch64_regno_regclass (unsigned regno
)
9107 if (GP_REGNUM_P (regno
))
9108 return GENERAL_REGS
;
9110 if (regno
== SP_REGNUM
)
9113 if (regno
== FRAME_POINTER_REGNUM
9114 || regno
== ARG_POINTER_REGNUM
)
9115 return POINTER_REGS
;
9117 if (FP_REGNUM_P (regno
))
9118 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
9119 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
9121 if (PR_REGNUM_P (regno
))
9122 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
9127 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9128 If OFFSET is out of range, return an offset of an anchor point
9129 that is in range. Return 0 otherwise. */
9131 static HOST_WIDE_INT
9132 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
9135 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9137 return (offset
+ 0x400) & ~0x7f0;
9139 /* For offsets that aren't a multiple of the access size, the limit is
9141 if (offset
& (size
- 1))
9143 /* BLKmode typically uses LDP of X-registers. */
9144 if (mode
== BLKmode
)
9145 return (offset
+ 512) & ~0x3ff;
9146 return (offset
+ 0x100) & ~0x1ff;
9149 /* Small negative offsets are supported. */
9150 if (IN_RANGE (offset
, -256, 0))
9153 if (mode
== TImode
|| mode
== TFmode
)
9154 return (offset
+ 0x100) & ~0x1ff;
9156 /* Use 12-bit offset by access size. */
9157 return offset
& (~0xfff * size
);
9161 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
9163 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9164 where mask is selected by alignment and size of the offset.
9165 We try to pick as large a range for the offset as possible to
9166 maximize the chance of a CSE. However, for aligned addresses
9167 we limit the range to 4k so that structures with different sized
9168 elements are likely to use the same base. We need to be careful
9169 not to split a CONST for some forms of address expression, otherwise
9170 it will generate sub-optimal code. */
9172 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
9174 rtx base
= XEXP (x
, 0);
9175 rtx offset_rtx
= XEXP (x
, 1);
9176 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
9178 if (GET_CODE (base
) == PLUS
)
9180 rtx op0
= XEXP (base
, 0);
9181 rtx op1
= XEXP (base
, 1);
9183 /* Force any scaling into a temp for CSE. */
9184 op0
= force_reg (Pmode
, op0
);
9185 op1
= force_reg (Pmode
, op1
);
9187 /* Let the pointer register be in op0. */
9188 if (REG_POINTER (op1
))
9189 std::swap (op0
, op1
);
9191 /* If the pointer is virtual or frame related, then we know that
9192 virtual register instantiation or register elimination is going
9193 to apply a second constant. We want the two constants folded
9194 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9195 if (virt_or_elim_regno_p (REGNO (op0
)))
9197 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
9198 NULL_RTX
, true, OPTAB_DIRECT
);
9199 return gen_rtx_PLUS (Pmode
, base
, op1
);
9202 /* Otherwise, in order to encourage CSE (and thence loop strength
9203 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9204 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
9205 NULL_RTX
, true, OPTAB_DIRECT
);
9206 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
9210 if (GET_MODE_SIZE (mode
).is_constant (&size
))
9212 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
9214 if (base_offset
!= 0)
9216 base
= plus_constant (Pmode
, base
, base_offset
);
9217 base
= force_operand (base
, NULL_RTX
);
9218 return plus_constant (Pmode
, base
, offset
- base_offset
);
9227 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
9230 secondary_reload_info
*sri
)
9232 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9233 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9234 comment at the head of aarch64-sve.md for more details about the
9235 big-endian handling. */
9236 if (BYTES_BIG_ENDIAN
9237 && reg_class_subset_p (rclass
, FP_REGS
)
9238 && !((REG_P (x
) && HARD_REGISTER_P (x
))
9239 || aarch64_simd_valid_immediate (x
, NULL
))
9240 && aarch64_sve_data_mode_p (mode
))
9242 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
9246 /* If we have to disable direct literal pool loads and stores because the
9247 function is too big, then we need a scratch register. */
9248 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
9249 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
9250 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
9251 && !aarch64_pcrelative_literal_loads
)
9253 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
9257 /* Without the TARGET_SIMD instructions we cannot move a Q register
9258 to a Q register directly. We need a scratch. */
9259 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
9260 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
9261 && reg_class_subset_p (rclass
, FP_REGS
))
9263 sri
->icode
= code_for_aarch64_reload_mov (mode
);
9267 /* A TFmode or TImode memory access should be handled via an FP_REGS
9268 because AArch64 has richer addressing modes for LDR/STR instructions
9269 than LDP/STP instructions. */
9270 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
9271 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
9274 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
9275 return GENERAL_REGS
;
9281 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
9283 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
9285 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9286 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9287 if (frame_pointer_needed
)
9288 return to
== HARD_FRAME_POINTER_REGNUM
;
9293 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
9295 if (to
== HARD_FRAME_POINTER_REGNUM
)
9297 if (from
== ARG_POINTER_REGNUM
)
9298 return cfun
->machine
->frame
.hard_fp_offset
;
9300 if (from
== FRAME_POINTER_REGNUM
)
9301 return cfun
->machine
->frame
.hard_fp_offset
9302 - cfun
->machine
->frame
.locals_offset
;
9305 if (to
== STACK_POINTER_REGNUM
)
9307 if (from
== FRAME_POINTER_REGNUM
)
9308 return cfun
->machine
->frame
.frame_size
9309 - cfun
->machine
->frame
.locals_offset
;
9312 return cfun
->machine
->frame
.frame_size
;
9315 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9319 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
9323 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
9328 aarch64_asm_trampoline_template (FILE *f
)
9333 if (aarch64_bti_enabled ())
9335 asm_fprintf (f
, "\thint\t34 // bti c\n");
9342 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
9343 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
9348 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
9349 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
9352 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
9354 /* The trampoline needs an extra padding instruction. In case if BTI is
9355 enabled the padding instruction is replaced by the BTI instruction at
9357 if (!aarch64_bti_enabled ())
9358 assemble_aligned_integer (4, const0_rtx
);
9360 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9361 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
9365 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
9367 rtx fnaddr
, mem
, a_tramp
;
9368 const int tramp_code_sz
= 16;
9370 /* Don't need to copy the trailing D-words, we fill those in below. */
9371 emit_block_move (m_tramp
, assemble_trampoline_template (),
9372 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
9373 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
9374 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
9375 if (GET_MODE (fnaddr
) != ptr_mode
)
9376 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
9377 emit_move_insn (mem
, fnaddr
);
9379 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
9380 emit_move_insn (mem
, chain_value
);
9382 /* XXX We should really define a "clear_cache" pattern and use
9383 gen_clear_cache(). */
9384 a_tramp
= XEXP (m_tramp
, 0);
9385 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
9386 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
9387 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
9391 static unsigned char
9392 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
9394 /* ??? Logically we should only need to provide a value when
9395 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9396 can hold MODE, but at the moment we need to handle all modes.
9397 Just ignore any runtime parts for registers that can't store them. */
9398 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
9402 case TAILCALL_ADDR_REGS
:
9406 case POINTER_AND_FP_REGS
:
9410 if (aarch64_sve_data_mode_p (mode
)
9411 && constant_multiple_p (GET_MODE_SIZE (mode
),
9412 BYTES_PER_SVE_VECTOR
, &nregs
))
9414 return (aarch64_vector_data_mode_p (mode
)
9415 ? CEIL (lowest_size
, UNITS_PER_VREG
)
9416 : CEIL (lowest_size
, UNITS_PER_WORD
));
9433 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
9435 if (regclass
== POINTER_REGS
)
9436 return GENERAL_REGS
;
9438 if (regclass
== STACK_REG
)
9441 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
9447 /* Register eliminiation can result in a request for
9448 SP+constant->FP_REGS. We cannot support such operations which
9449 use SP as source and an FP_REG as destination, so reject out
9451 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
9453 rtx lhs
= XEXP (x
, 0);
9455 /* Look through a possible SUBREG introduced by ILP32. */
9456 if (GET_CODE (lhs
) == SUBREG
)
9457 lhs
= SUBREG_REG (lhs
);
9459 gcc_assert (REG_P (lhs
));
9460 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
9469 aarch64_asm_output_labelref (FILE* f
, const char *name
)
9471 asm_fprintf (f
, "%U%s", name
);
9475 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
9477 if (priority
== DEFAULT_INIT_PRIORITY
)
9478 default_ctor_section_asm_out_constructor (symbol
, priority
);
9482 /* While priority is known to be in range [0, 65535], so 18 bytes
9483 would be enough, the compiler might not know that. To avoid
9484 -Wformat-truncation false positive, use a larger size. */
9486 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
9487 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9488 switch_to_section (s
);
9489 assemble_align (POINTER_SIZE
);
9490 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9495 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
9497 if (priority
== DEFAULT_INIT_PRIORITY
)
9498 default_dtor_section_asm_out_destructor (symbol
, priority
);
9502 /* While priority is known to be in range [0, 65535], so 18 bytes
9503 would be enough, the compiler might not know that. To avoid
9504 -Wformat-truncation false positive, use a larger size. */
9506 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
9507 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9508 switch_to_section (s
);
9509 assemble_align (POINTER_SIZE
);
9510 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9515 aarch64_output_casesi (rtx
*operands
)
9519 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
9521 static const char *const patterns
[4][2] =
9524 "ldrb\t%w3, [%0,%w1,uxtw]",
9525 "add\t%3, %4, %w3, sxtb #2"
9528 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9529 "add\t%3, %4, %w3, sxth #2"
9532 "ldr\t%w3, [%0,%w1,uxtw #2]",
9533 "add\t%3, %4, %w3, sxtw #2"
9535 /* We assume that DImode is only generated when not optimizing and
9536 that we don't really need 64-bit address offsets. That would
9537 imply an object file with 8GB of code in a single function! */
9539 "ldr\t%w3, [%0,%w1,uxtw #2]",
9540 "add\t%3, %4, %w3, sxtw #2"
9544 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
9546 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
9547 index
= exact_log2 (GET_MODE_SIZE (mode
));
9549 gcc_assert (index
>= 0 && index
<= 3);
9551 /* Need to implement table size reduction, by chaning the code below. */
9552 output_asm_insn (patterns
[index
][0], operands
);
9553 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
9554 snprintf (buf
, sizeof (buf
),
9555 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
9556 output_asm_insn (buf
, operands
);
9557 output_asm_insn (patterns
[index
][1], operands
);
9558 output_asm_insn ("br\t%3", operands
);
9559 assemble_label (asm_out_file
, label
);
9564 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9565 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9569 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
9571 if (shift
>= 0 && shift
<= 3)
9574 for (size
= 8; size
<= 32; size
*= 2)
9576 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
9577 if (mask
== bits
<< shift
)
9584 /* Constant pools are per function only when PC relative
9585 literal loads are true or we are in the large memory
9589 aarch64_can_use_per_function_literal_pools_p (void)
9591 return (aarch64_pcrelative_literal_loads
9592 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
9596 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
9598 /* We can't use blocks for constants when we're using a per-function
9600 return !aarch64_can_use_per_function_literal_pools_p ();
9603 /* Select appropriate section for constants depending
9604 on where we place literal pools. */
9607 aarch64_select_rtx_section (machine_mode mode
,
9609 unsigned HOST_WIDE_INT align
)
9611 if (aarch64_can_use_per_function_literal_pools_p ())
9612 return function_section (current_function_decl
);
9614 return default_elf_select_rtx_section (mode
, x
, align
);
9617 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9619 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
9620 HOST_WIDE_INT offset
)
9622 /* When using per-function literal pools, we must ensure that any code
9623 section is aligned to the minimal instruction length, lest we get
9624 errors from the assembler re "unaligned instructions". */
9625 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
9626 ASM_OUTPUT_ALIGN (f
, 2);
9631 /* Helper function for rtx cost calculation. Strip a shift expression
9632 from X. Returns the inner operand if successful, or the original
9633 expression on failure. */
9635 aarch64_strip_shift (rtx x
)
9639 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9640 we can convert both to ROR during final output. */
9641 if ((GET_CODE (op
) == ASHIFT
9642 || GET_CODE (op
) == ASHIFTRT
9643 || GET_CODE (op
) == LSHIFTRT
9644 || GET_CODE (op
) == ROTATERT
9645 || GET_CODE (op
) == ROTATE
)
9646 && CONST_INT_P (XEXP (op
, 1)))
9647 return XEXP (op
, 0);
9649 if (GET_CODE (op
) == MULT
9650 && CONST_INT_P (XEXP (op
, 1))
9651 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
9652 return XEXP (op
, 0);
9657 /* Helper function for rtx cost calculation. Strip an extend
9658 expression from X. Returns the inner operand if successful, or the
9659 original expression on failure. We deal with a number of possible
9660 canonicalization variations here. If STRIP_SHIFT is true, then
9661 we can strip off a shift also. */
9663 aarch64_strip_extend (rtx x
, bool strip_shift
)
9665 scalar_int_mode mode
;
9668 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
9671 /* Zero and sign extraction of a widened value. */
9672 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
9673 && XEXP (op
, 2) == const0_rtx
9674 && GET_CODE (XEXP (op
, 0)) == MULT
9675 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
9677 return XEXP (XEXP (op
, 0), 0);
9679 /* It can also be represented (for zero-extend) as an AND with an
9681 if (GET_CODE (op
) == AND
9682 && GET_CODE (XEXP (op
, 0)) == MULT
9683 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
9684 && CONST_INT_P (XEXP (op
, 1))
9685 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
9686 INTVAL (XEXP (op
, 1))) != 0)
9687 return XEXP (XEXP (op
, 0), 0);
9689 /* Now handle extended register, as this may also have an optional
9690 left shift by 1..4. */
9692 && GET_CODE (op
) == ASHIFT
9693 && CONST_INT_P (XEXP (op
, 1))
9694 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
9697 if (GET_CODE (op
) == ZERO_EXTEND
9698 || GET_CODE (op
) == SIGN_EXTEND
)
9707 /* Return true iff CODE is a shift supported in combination
9708 with arithmetic instructions. */
9711 aarch64_shift_p (enum rtx_code code
)
9713 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
9717 /* Return true iff X is a cheap shift without a sign extend. */
9720 aarch64_cheap_mult_shift_p (rtx x
)
9727 if (!(aarch64_tune_params
.extra_tuning_flags
9728 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
9731 if (GET_CODE (op0
) == SIGN_EXTEND
)
9734 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
9735 && UINTVAL (op1
) <= 4)
9738 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
9741 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
9743 if (l2
> 0 && l2
<= 4)
9749 /* Helper function for rtx cost calculation. Calculate the cost of
9750 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9751 Return the calculated cost of the expression, recursing manually in to
9752 operands where needed. */
9755 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
9758 const struct cpu_cost_table
*extra_cost
9759 = aarch64_tune_params
.insn_extra_cost
;
9761 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
9762 machine_mode mode
= GET_MODE (x
);
9764 gcc_checking_assert (code
== MULT
);
9769 if (VECTOR_MODE_P (mode
))
9770 mode
= GET_MODE_INNER (mode
);
9772 /* Integer multiply/fma. */
9773 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9775 /* The multiply will be canonicalized as a shift, cost it as such. */
9776 if (aarch64_shift_p (GET_CODE (x
))
9777 || (CONST_INT_P (op1
)
9778 && exact_log2 (INTVAL (op1
)) > 0))
9780 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
9781 || GET_CODE (op0
) == SIGN_EXTEND
;
9786 /* If the shift is considered cheap,
9787 then don't add any cost. */
9788 if (aarch64_cheap_mult_shift_p (x
))
9790 else if (REG_P (op1
))
9791 /* ARITH + shift-by-register. */
9792 cost
+= extra_cost
->alu
.arith_shift_reg
;
9794 /* ARITH + extended register. We don't have a cost field
9795 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9796 cost
+= extra_cost
->alu
.extend_arith
;
9798 /* ARITH + shift-by-immediate. */
9799 cost
+= extra_cost
->alu
.arith_shift
;
9802 /* LSL (immediate). */
9803 cost
+= extra_cost
->alu
.shift
;
9806 /* Strip extends as we will have costed them in the case above. */
9808 op0
= aarch64_strip_extend (op0
, true);
9810 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
9815 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9816 compound and let the below cases handle it. After all, MNEG is a
9817 special-case alias of MSUB. */
9818 if (GET_CODE (op0
) == NEG
)
9820 op0
= XEXP (op0
, 0);
9824 /* Integer multiplies or FMAs have zero/sign extending variants. */
9825 if ((GET_CODE (op0
) == ZERO_EXTEND
9826 && GET_CODE (op1
) == ZERO_EXTEND
)
9827 || (GET_CODE (op0
) == SIGN_EXTEND
9828 && GET_CODE (op1
) == SIGN_EXTEND
))
9830 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
9831 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
9836 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9837 cost
+= extra_cost
->mult
[0].extend_add
;
9839 /* MUL/SMULL/UMULL. */
9840 cost
+= extra_cost
->mult
[0].extend
;
9846 /* This is either an integer multiply or a MADD. In both cases
9847 we want to recurse and cost the operands. */
9848 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9849 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9855 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
9858 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
9867 /* Floating-point FMA/FMUL can also support negations of the
9868 operands, unless the rounding mode is upward or downward in
9869 which case FNMUL is different than FMUL with operand negation. */
9870 bool neg0
= GET_CODE (op0
) == NEG
;
9871 bool neg1
= GET_CODE (op1
) == NEG
;
9872 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
9875 op0
= XEXP (op0
, 0);
9877 op1
= XEXP (op1
, 0);
9881 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9882 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9885 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
9888 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9889 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
9895 aarch64_address_cost (rtx x
,
9897 addr_space_t as ATTRIBUTE_UNUSED
,
9900 enum rtx_code c
= GET_CODE (x
);
9901 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
9902 struct aarch64_address_info info
;
9906 if (!aarch64_classify_address (&info
, x
, mode
, false))
9908 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
9910 /* This is a CONST or SYMBOL ref which will be split
9911 in a different way depending on the code model in use.
9912 Cost it through the generic infrastructure. */
9913 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
9914 /* Divide through by the cost of one instruction to
9915 bring it to the same units as the address costs. */
9916 cost_symbol_ref
/= COSTS_N_INSNS (1);
9917 /* The cost is then the cost of preparing the address,
9918 followed by an immediate (possibly 0) offset. */
9919 return cost_symbol_ref
+ addr_cost
->imm_offset
;
9923 /* This is most likely a jump table from a case
9925 return addr_cost
->register_offset
;
9931 case ADDRESS_LO_SUM
:
9932 case ADDRESS_SYMBOLIC
:
9933 case ADDRESS_REG_IMM
:
9934 cost
+= addr_cost
->imm_offset
;
9937 case ADDRESS_REG_WB
:
9938 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
9939 cost
+= addr_cost
->pre_modify
;
9940 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
9941 cost
+= addr_cost
->post_modify
;
9947 case ADDRESS_REG_REG
:
9948 cost
+= addr_cost
->register_offset
;
9951 case ADDRESS_REG_SXTW
:
9952 cost
+= addr_cost
->register_sextend
;
9955 case ADDRESS_REG_UXTW
:
9956 cost
+= addr_cost
->register_zextend
;
9966 /* For the sake of calculating the cost of the shifted register
9967 component, we can treat same sized modes in the same way. */
9968 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
9969 cost
+= addr_cost
->addr_scale_costs
.hi
;
9970 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
9971 cost
+= addr_cost
->addr_scale_costs
.si
;
9972 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
9973 cost
+= addr_cost
->addr_scale_costs
.di
;
9975 /* We can't tell, or this is a 128-bit vector. */
9976 cost
+= addr_cost
->addr_scale_costs
.ti
;
9982 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9983 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9987 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
9989 /* When optimizing for speed, use the cost of unpredictable branches. */
9990 const struct cpu_branch_cost
*branch_costs
=
9991 aarch64_tune_params
.branch_costs
;
9993 if (!speed_p
|| predictable_p
)
9994 return branch_costs
->predictable
;
9996 return branch_costs
->unpredictable
;
9999 /* Return true if the RTX X in mode MODE is a zero or sign extract
10000 usable in an ADD or SUB (extended register) instruction. */
10002 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
10004 /* Catch add with a sign extract.
10005 This is add_<optab><mode>_multp2. */
10006 if (GET_CODE (x
) == SIGN_EXTRACT
10007 || GET_CODE (x
) == ZERO_EXTRACT
)
10009 rtx op0
= XEXP (x
, 0);
10010 rtx op1
= XEXP (x
, 1);
10011 rtx op2
= XEXP (x
, 2);
10013 if (GET_CODE (op0
) == MULT
10014 && CONST_INT_P (op1
)
10015 && op2
== const0_rtx
10016 && CONST_INT_P (XEXP (op0
, 1))
10017 && aarch64_is_extend_from_extract (mode
,
10024 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10026 else if (GET_CODE (x
) == SIGN_EXTEND
10027 || GET_CODE (x
) == ZERO_EXTEND
)
10028 return REG_P (XEXP (x
, 0));
10034 aarch64_frint_unspec_p (unsigned int u
)
10038 case UNSPEC_FRINTZ
:
10039 case UNSPEC_FRINTP
:
10040 case UNSPEC_FRINTM
:
10041 case UNSPEC_FRINTA
:
10042 case UNSPEC_FRINTN
:
10043 case UNSPEC_FRINTX
:
10044 case UNSPEC_FRINTI
:
10052 /* Return true iff X is an rtx that will match an extr instruction
10053 i.e. as described in the *extr<mode>5_insn family of patterns.
10054 OP0 and OP1 will be set to the operands of the shifts involved
10055 on success and will be NULL_RTX otherwise. */
10058 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
10061 scalar_int_mode mode
;
10062 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
10065 *res_op0
= NULL_RTX
;
10066 *res_op1
= NULL_RTX
;
10068 if (GET_CODE (x
) != IOR
)
10074 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
10075 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
10077 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10078 if (GET_CODE (op1
) == ASHIFT
)
10079 std::swap (op0
, op1
);
10081 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
10084 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
10085 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
10087 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
10088 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
10090 *res_op0
= XEXP (op0
, 0);
10091 *res_op1
= XEXP (op1
, 0);
10099 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10100 storing it in *COST. Result is true if the total cost of the operation
10101 has now been calculated. */
10103 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
10107 enum rtx_code cmpcode
;
10109 if (COMPARISON_P (op0
))
10111 inner
= XEXP (op0
, 0);
10112 comparator
= XEXP (op0
, 1);
10113 cmpcode
= GET_CODE (op0
);
10118 comparator
= const0_rtx
;
10122 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
10124 /* Conditional branch. */
10125 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10129 if (cmpcode
== NE
|| cmpcode
== EQ
)
10131 if (comparator
== const0_rtx
)
10133 /* TBZ/TBNZ/CBZ/CBNZ. */
10134 if (GET_CODE (inner
) == ZERO_EXTRACT
)
10136 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
10137 ZERO_EXTRACT
, 0, speed
);
10140 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
10145 else if (cmpcode
== LT
|| cmpcode
== GE
)
10148 if (comparator
== const0_rtx
)
10153 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
10156 if (GET_CODE (op1
) == COMPARE
)
10158 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10159 if (XEXP (op1
, 1) == const0_rtx
)
10163 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
10164 const struct cpu_cost_table
*extra_cost
10165 = aarch64_tune_params
.insn_extra_cost
;
10167 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10168 *cost
+= extra_cost
->alu
.arith
;
10170 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10175 /* It's a conditional operation based on the status flags,
10176 so it must be some flavor of CSEL. */
10178 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10179 if (GET_CODE (op1
) == NEG
10180 || GET_CODE (op1
) == NOT
10181 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
10182 op1
= XEXP (op1
, 0);
10183 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
10185 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10186 op1
= XEXP (op1
, 0);
10187 op2
= XEXP (op2
, 0);
10190 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
10191 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
10195 /* We don't know what this is, cost all operands. */
10199 /* Check whether X is a bitfield operation of the form shift + extend that
10200 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10201 operand to which the bitfield operation is applied. Otherwise return
10205 aarch64_extend_bitfield_pattern_p (rtx x
)
10207 rtx_code outer_code
= GET_CODE (x
);
10208 machine_mode outer_mode
= GET_MODE (x
);
10210 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
10211 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
10214 rtx inner
= XEXP (x
, 0);
10215 rtx_code inner_code
= GET_CODE (inner
);
10216 machine_mode inner_mode
= GET_MODE (inner
);
10219 switch (inner_code
)
10222 if (CONST_INT_P (XEXP (inner
, 1))
10223 && (inner_mode
== QImode
|| inner_mode
== HImode
))
10224 op
= XEXP (inner
, 0);
10227 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
10228 && (inner_mode
== QImode
|| inner_mode
== HImode
))
10229 op
= XEXP (inner
, 0);
10232 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
10233 && (inner_mode
== QImode
|| inner_mode
== HImode
))
10234 op
= XEXP (inner
, 0);
10243 /* Return true if the mask and a shift amount from an RTX of the form
10244 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10245 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10248 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
10251 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
10252 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
10253 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
10255 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
10258 /* Return true if the masks and a shift amount from an RTX of the form
10259 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10260 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
10263 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
10264 unsigned HOST_WIDE_INT mask1
,
10265 unsigned HOST_WIDE_INT shft_amnt
,
10266 unsigned HOST_WIDE_INT mask2
)
10268 unsigned HOST_WIDE_INT t
;
10270 /* Verify that there is no overlap in what bits are set in the two masks. */
10271 if (mask1
!= ~mask2
)
10274 /* Verify that mask2 is not all zeros or ones. */
10275 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
10278 /* The shift amount should always be less than the mode size. */
10279 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
10281 /* Verify that the mask being shifted is contiguous and would be in the
10282 least significant bits after shifting by shft_amnt. */
10283 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
10284 return (t
== (t
& -t
));
10287 /* Calculate the cost of calculating X, storing it in *COST. Result
10288 is true if the total cost of the operation has now been calculated. */
10290 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
10291 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
10294 const struct cpu_cost_table
*extra_cost
10295 = aarch64_tune_params
.insn_extra_cost
;
10296 int code
= GET_CODE (x
);
10297 scalar_int_mode int_mode
;
10299 /* By default, assume that everything has equivalent cost to the
10300 cheapest instruction. Any additional costs are applied as a delta
10301 above this default. */
10302 *cost
= COSTS_N_INSNS (1);
10307 /* The cost depends entirely on the operands to SET. */
10309 op0
= SET_DEST (x
);
10312 switch (GET_CODE (op0
))
10317 rtx address
= XEXP (op0
, 0);
10318 if (VECTOR_MODE_P (mode
))
10319 *cost
+= extra_cost
->ldst
.storev
;
10320 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10321 *cost
+= extra_cost
->ldst
.store
;
10322 else if (mode
== SFmode
)
10323 *cost
+= extra_cost
->ldst
.storef
;
10324 else if (mode
== DFmode
)
10325 *cost
+= extra_cost
->ldst
.stored
;
10328 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10332 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10336 if (! REG_P (SUBREG_REG (op0
)))
10337 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
10339 /* Fall through. */
10341 /* The cost is one per vector-register copied. */
10342 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
10344 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
10345 *cost
= COSTS_N_INSNS (nregs
);
10347 /* const0_rtx is in general free, but we will use an
10348 instruction to set a register to 0. */
10349 else if (REG_P (op1
) || op1
== const0_rtx
)
10351 /* The cost is 1 per register copied. */
10352 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
10353 *cost
= COSTS_N_INSNS (nregs
);
10356 /* Cost is just the cost of the RHS of the set. */
10357 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10362 /* Bit-field insertion. Strip any redundant widening of
10363 the RHS to meet the width of the target. */
10364 if (GET_CODE (op1
) == SUBREG
)
10365 op1
= SUBREG_REG (op1
);
10366 if ((GET_CODE (op1
) == ZERO_EXTEND
10367 || GET_CODE (op1
) == SIGN_EXTEND
)
10368 && CONST_INT_P (XEXP (op0
, 1))
10369 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
10370 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
10371 op1
= XEXP (op1
, 0);
10373 if (CONST_INT_P (op1
))
10375 /* MOV immediate is assumed to always be cheap. */
10376 *cost
= COSTS_N_INSNS (1);
10382 *cost
+= extra_cost
->alu
.bfi
;
10383 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
10389 /* We can't make sense of this, assume default cost. */
10390 *cost
= COSTS_N_INSNS (1);
10396 /* If an instruction can incorporate a constant within the
10397 instruction, the instruction's expression avoids calling
10398 rtx_cost() on the constant. If rtx_cost() is called on a
10399 constant, then it is usually because the constant must be
10400 moved into a register by one or more instructions.
10402 The exception is constant 0, which can be expressed
10403 as XZR/WZR and is therefore free. The exception to this is
10404 if we have (set (reg) (const0_rtx)) in which case we must cost
10405 the move. However, we can catch that when we cost the SET, so
10406 we don't need to consider that here. */
10407 if (x
== const0_rtx
)
10411 /* To an approximation, building any other constant is
10412 proportionally expensive to the number of instructions
10413 required to build that constant. This is true whether we
10414 are compiling for SPEED or otherwise. */
10415 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
10416 int_mode
= word_mode
;
10417 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
10418 (NULL_RTX
, x
, false, int_mode
));
10424 /* First determine number of instructions to do the move
10425 as an integer constant. */
10426 if (!aarch64_float_const_representable_p (x
)
10427 && !aarch64_can_const_movi_rtx_p (x
, mode
)
10428 && aarch64_float_const_rtx_p (x
))
10430 unsigned HOST_WIDE_INT ival
;
10431 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
10432 gcc_assert (succeed
);
10434 scalar_int_mode imode
= (mode
== HFmode
10436 : int_mode_for_mode (mode
).require ());
10437 int ncost
= aarch64_internal_mov_immediate
10438 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10439 *cost
+= COSTS_N_INSNS (ncost
);
10445 /* mov[df,sf]_aarch64. */
10446 if (aarch64_float_const_representable_p (x
))
10447 /* FMOV (scalar immediate). */
10448 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
10449 else if (!aarch64_float_const_zero_rtx_p (x
))
10451 /* This will be a load from memory. */
10452 if (mode
== DFmode
)
10453 *cost
+= extra_cost
->ldst
.loadd
;
10455 *cost
+= extra_cost
->ldst
.loadf
;
10458 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10459 or MOV v0.s[0], wzr - neither of which are modeled by the
10460 cost tables. Just use the default cost. */
10470 /* For loads we want the base cost of a load, plus an
10471 approximation for the additional cost of the addressing
10473 rtx address
= XEXP (x
, 0);
10474 if (VECTOR_MODE_P (mode
))
10475 *cost
+= extra_cost
->ldst
.loadv
;
10476 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10477 *cost
+= extra_cost
->ldst
.load
;
10478 else if (mode
== SFmode
)
10479 *cost
+= extra_cost
->ldst
.loadf
;
10480 else if (mode
== DFmode
)
10481 *cost
+= extra_cost
->ldst
.loadd
;
10484 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10493 if (VECTOR_MODE_P (mode
))
10498 *cost
+= extra_cost
->vect
.alu
;
10503 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10505 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10506 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10509 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10513 /* Cost this as SUB wzr, X. */
10514 op0
= CONST0_RTX (mode
);
10519 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10521 /* Support (neg(fma...)) as a single instruction only if
10522 sign of zeros is unimportant. This matches the decision
10523 making in aarch64.md. */
10524 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10527 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10530 if (GET_CODE (op0
) == MULT
)
10533 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10538 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10548 if (VECTOR_MODE_P (mode
))
10549 *cost
+= extra_cost
->vect
.alu
;
10551 *cost
+= extra_cost
->alu
.clz
;
10560 if (op1
== const0_rtx
10561 && GET_CODE (op0
) == AND
)
10564 mode
= GET_MODE (op0
);
10568 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10570 /* TODO: A write to the CC flags possibly costs extra, this
10571 needs encoding in the cost tables. */
10573 mode
= GET_MODE (op0
);
10575 if (GET_CODE (op0
) == AND
)
10581 if (GET_CODE (op0
) == PLUS
)
10583 /* ADDS (and CMN alias). */
10588 if (GET_CODE (op0
) == MINUS
)
10595 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10596 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10597 && CONST_INT_P (XEXP (op0
, 2)))
10599 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10600 Handle it here directly rather than going to cost_logic
10601 since we know the immediate generated for the TST is valid
10602 so we can avoid creating an intermediate rtx for it only
10603 for costing purposes. */
10605 *cost
+= extra_cost
->alu
.logical
;
10607 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10608 ZERO_EXTRACT
, 0, speed
);
10612 if (GET_CODE (op1
) == NEG
)
10616 *cost
+= extra_cost
->alu
.arith
;
10618 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10619 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10625 Compare can freely swap the order of operands, and
10626 canonicalization puts the more complex operation first.
10627 But the integer MINUS logic expects the shift/extend
10628 operation in op1. */
10630 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10638 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10642 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10644 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10646 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10647 /* FCMP supports constant 0.0 for no extra cost. */
10653 if (VECTOR_MODE_P (mode
))
10655 /* Vector compare. */
10657 *cost
+= extra_cost
->vect
.alu
;
10659 if (aarch64_float_const_zero_rtx_p (op1
))
10661 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10675 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10677 /* Detect valid immediates. */
10678 if ((GET_MODE_CLASS (mode
) == MODE_INT
10679 || (GET_MODE_CLASS (mode
) == MODE_CC
10680 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10681 && CONST_INT_P (op1
)
10682 && aarch64_uimm12_shift (INTVAL (op1
)))
10685 /* SUB(S) (immediate). */
10686 *cost
+= extra_cost
->alu
.arith
;
10690 /* Look for SUB (extended register). */
10691 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10692 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10695 *cost
+= extra_cost
->alu
.extend_arith
;
10697 op1
= aarch64_strip_extend (op1
, true);
10698 *cost
+= rtx_cost (op1
, VOIDmode
,
10699 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10703 rtx new_op1
= aarch64_strip_extend (op1
, false);
10705 /* Cost this as an FMA-alike operation. */
10706 if ((GET_CODE (new_op1
) == MULT
10707 || aarch64_shift_p (GET_CODE (new_op1
)))
10708 && code
!= COMPARE
)
10710 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10711 (enum rtx_code
) code
,
10716 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10720 if (VECTOR_MODE_P (mode
))
10723 *cost
+= extra_cost
->vect
.alu
;
10725 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10728 *cost
+= extra_cost
->alu
.arith
;
10730 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10733 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10747 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10748 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10751 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10752 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10756 if (GET_MODE_CLASS (mode
) == MODE_INT
10757 && (aarch64_plus_immediate (op1
, mode
)
10758 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10760 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10763 /* ADD (immediate). */
10764 *cost
+= extra_cost
->alu
.arith
;
10768 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10770 /* Look for ADD (extended register). */
10771 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10772 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10775 *cost
+= extra_cost
->alu
.extend_arith
;
10777 op0
= aarch64_strip_extend (op0
, true);
10778 *cost
+= rtx_cost (op0
, VOIDmode
,
10779 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10783 /* Strip any extend, leave shifts behind as we will
10784 cost them through mult_cost. */
10785 new_op0
= aarch64_strip_extend (op0
, false);
10787 if (GET_CODE (new_op0
) == MULT
10788 || aarch64_shift_p (GET_CODE (new_op0
)))
10790 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10795 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10799 if (VECTOR_MODE_P (mode
))
10802 *cost
+= extra_cost
->vect
.alu
;
10804 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10807 *cost
+= extra_cost
->alu
.arith
;
10809 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10812 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10819 *cost
= COSTS_N_INSNS (1);
10823 if (VECTOR_MODE_P (mode
))
10824 *cost
+= extra_cost
->vect
.alu
;
10826 *cost
+= extra_cost
->alu
.rev
;
10831 if (aarch_rev16_p (x
))
10833 *cost
= COSTS_N_INSNS (1);
10837 if (VECTOR_MODE_P (mode
))
10838 *cost
+= extra_cost
->vect
.alu
;
10840 *cost
+= extra_cost
->alu
.rev
;
10845 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10847 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10848 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10850 *cost
+= extra_cost
->alu
.shift
;
10854 /* Fall through. */
10861 if (VECTOR_MODE_P (mode
))
10864 *cost
+= extra_cost
->vect
.alu
;
10869 && GET_CODE (op0
) == MULT
10870 && CONST_INT_P (XEXP (op0
, 1))
10871 && CONST_INT_P (op1
)
10872 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10873 INTVAL (op1
)) != 0)
10875 /* This is a UBFM/SBFM. */
10876 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10878 *cost
+= extra_cost
->alu
.bfx
;
10882 if (is_int_mode (mode
, &int_mode
))
10884 if (CONST_INT_P (op1
))
10886 /* We have a mask + shift version of a UBFIZ
10887 i.e. the *andim_ashift<mode>_bfiz pattern. */
10888 if (GET_CODE (op0
) == ASHIFT
10889 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10892 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10893 (enum rtx_code
) code
, 0, speed
);
10895 *cost
+= extra_cost
->alu
.bfx
;
10899 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10901 /* We possibly get the immediate for free, this is not
10903 *cost
+= rtx_cost (op0
, int_mode
,
10904 (enum rtx_code
) code
, 0, speed
);
10906 *cost
+= extra_cost
->alu
.logical
;
10915 /* Handle ORN, EON, or BIC. */
10916 if (GET_CODE (op0
) == NOT
)
10917 op0
= XEXP (op0
, 0);
10919 new_op0
= aarch64_strip_shift (op0
);
10921 /* If we had a shift on op0 then this is a logical-shift-
10922 by-register/immediate operation. Otherwise, this is just
10923 a logical operation. */
10926 if (new_op0
!= op0
)
10928 /* Shift by immediate. */
10929 if (CONST_INT_P (XEXP (op0
, 1)))
10930 *cost
+= extra_cost
->alu
.log_shift
;
10932 *cost
+= extra_cost
->alu
.log_shift_reg
;
10935 *cost
+= extra_cost
->alu
.logical
;
10938 /* In both cases we want to cost both operands. */
10939 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10941 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10951 op0
= aarch64_strip_shift (x
);
10953 if (VECTOR_MODE_P (mode
))
10956 *cost
+= extra_cost
->vect
.alu
;
10960 /* MVN-shifted-reg. */
10963 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10966 *cost
+= extra_cost
->alu
.log_shift
;
10970 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10971 Handle the second form here taking care that 'a' in the above can
10973 else if (GET_CODE (op0
) == XOR
)
10975 rtx newop0
= XEXP (op0
, 0);
10976 rtx newop1
= XEXP (op0
, 1);
10977 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10979 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10980 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10984 if (op0_stripped
!= newop0
)
10985 *cost
+= extra_cost
->alu
.log_shift
;
10987 *cost
+= extra_cost
->alu
.logical
;
10994 *cost
+= extra_cost
->alu
.logical
;
11001 /* If a value is written in SI mode, then zero extended to DI
11002 mode, the operation will in general be free as a write to
11003 a 'w' register implicitly zeroes the upper bits of an 'x'
11004 register. However, if this is
11006 (set (reg) (zero_extend (reg)))
11008 we must cost the explicit register move. */
11010 && GET_MODE (op0
) == SImode
11013 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
11015 /* If OP_COST is non-zero, then the cost of the zero extend
11016 is effectively the cost of the inner operation. Otherwise
11017 we have a MOV instruction and we take the cost from the MOV
11018 itself. This is true independently of whether we are
11019 optimizing for space or time. */
11025 else if (MEM_P (op0
))
11027 /* All loads can zero extend to any size for free. */
11028 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
11032 op0
= aarch64_extend_bitfield_pattern_p (x
);
11035 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
11037 *cost
+= extra_cost
->alu
.bfx
;
11043 if (VECTOR_MODE_P (mode
))
11046 *cost
+= extra_cost
->vect
.alu
;
11050 /* We generate an AND instead of UXTB/UXTH. */
11051 *cost
+= extra_cost
->alu
.logical
;
11057 if (MEM_P (XEXP (x
, 0)))
11062 rtx address
= XEXP (XEXP (x
, 0), 0);
11063 *cost
+= extra_cost
->ldst
.load_sign_extend
;
11066 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11072 op0
= aarch64_extend_bitfield_pattern_p (x
);
11075 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
11077 *cost
+= extra_cost
->alu
.bfx
;
11083 if (VECTOR_MODE_P (mode
))
11084 *cost
+= extra_cost
->vect
.alu
;
11086 *cost
+= extra_cost
->alu
.extend
;
11094 if (CONST_INT_P (op1
))
11098 if (VECTOR_MODE_P (mode
))
11100 /* Vector shift (immediate). */
11101 *cost
+= extra_cost
->vect
.alu
;
11105 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
11107 *cost
+= extra_cost
->alu
.shift
;
11111 /* We can incorporate zero/sign extend for free. */
11112 if (GET_CODE (op0
) == ZERO_EXTEND
11113 || GET_CODE (op0
) == SIGN_EXTEND
)
11114 op0
= XEXP (op0
, 0);
11116 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
11121 if (VECTOR_MODE_P (mode
))
11124 /* Vector shift (register). */
11125 *cost
+= extra_cost
->vect
.alu
;
11131 *cost
+= extra_cost
->alu
.shift_reg
;
11133 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11134 && CONST_INT_P (XEXP (op1
, 1))
11135 && known_eq (INTVAL (XEXP (op1
, 1)),
11136 GET_MODE_BITSIZE (mode
) - 1))
11138 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11139 /* We already demanded XEXP (op1, 0) to be REG_P, so
11140 don't recurse into it. */
11144 return false; /* All arguments need to be in registers. */
11154 if (CONST_INT_P (op1
))
11156 /* ASR (immediate) and friends. */
11159 if (VECTOR_MODE_P (mode
))
11160 *cost
+= extra_cost
->vect
.alu
;
11162 *cost
+= extra_cost
->alu
.shift
;
11165 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11170 if (VECTOR_MODE_P (mode
))
11173 /* Vector shift (register). */
11174 *cost
+= extra_cost
->vect
.alu
;
11179 /* ASR (register) and friends. */
11180 *cost
+= extra_cost
->alu
.shift_reg
;
11182 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11183 && CONST_INT_P (XEXP (op1
, 1))
11184 && known_eq (INTVAL (XEXP (op1
, 1)),
11185 GET_MODE_BITSIZE (mode
) - 1))
11187 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11188 /* We already demanded XEXP (op1, 0) to be REG_P, so
11189 don't recurse into it. */
11193 return false; /* All arguments need to be in registers. */
11198 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
11199 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
11203 *cost
+= extra_cost
->ldst
.load
;
11205 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
11206 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
11208 /* ADRP, followed by ADD. */
11209 *cost
+= COSTS_N_INSNS (1);
11211 *cost
+= 2 * extra_cost
->alu
.arith
;
11213 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11214 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11218 *cost
+= extra_cost
->alu
.arith
;
11223 /* One extra load instruction, after accessing the GOT. */
11224 *cost
+= COSTS_N_INSNS (1);
11226 *cost
+= extra_cost
->ldst
.load
;
11232 /* ADRP/ADD (immediate). */
11234 *cost
+= extra_cost
->alu
.arith
;
11242 if (VECTOR_MODE_P (mode
))
11243 *cost
+= extra_cost
->vect
.alu
;
11245 *cost
+= extra_cost
->alu
.bfx
;
11248 /* We can trust that the immediates used will be correct (there
11249 are no by-register forms), so we need only cost op0. */
11250 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11254 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
11255 /* aarch64_rtx_mult_cost always handles recursion to its
11260 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11261 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11262 an unconditional negate. This case should only ever be reached through
11263 the set_smod_pow2_cheap check in expmed.c. */
11264 if (CONST_INT_P (XEXP (x
, 1))
11265 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
11266 && (mode
== SImode
|| mode
== DImode
))
11268 /* We expand to 4 instructions. Reset the baseline. */
11269 *cost
= COSTS_N_INSNS (4);
11272 *cost
+= 2 * extra_cost
->alu
.logical
11273 + 2 * extra_cost
->alu
.arith
;
11278 /* Fall-through. */
11282 /* Slighly prefer UMOD over SMOD. */
11283 if (VECTOR_MODE_P (mode
))
11284 *cost
+= extra_cost
->vect
.alu
;
11285 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11286 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
11287 + extra_cost
->mult
[mode
== DImode
].idiv
11288 + (code
== MOD
? 1 : 0));
11290 return false; /* All arguments need to be in registers. */
11297 if (VECTOR_MODE_P (mode
))
11298 *cost
+= extra_cost
->vect
.alu
;
11299 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11300 /* There is no integer SQRT, so only DIV and UDIV can get
11302 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
11303 /* Slighly prefer UDIV over SDIV. */
11304 + (code
== DIV
? 1 : 0));
11306 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
11308 return false; /* All arguments need to be in registers. */
11311 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
11312 XEXP (x
, 2), cost
, speed
);
11325 return false; /* All arguments must be in registers. */
11334 if (VECTOR_MODE_P (mode
))
11335 *cost
+= extra_cost
->vect
.alu
;
11337 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11340 /* FMSUB, FNMADD, and FNMSUB are free. */
11341 if (GET_CODE (op0
) == NEG
)
11342 op0
= XEXP (op0
, 0);
11344 if (GET_CODE (op2
) == NEG
)
11345 op2
= XEXP (op2
, 0);
11347 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11348 and the by-element operand as operand 0. */
11349 if (GET_CODE (op1
) == NEG
)
11350 op1
= XEXP (op1
, 0);
11352 /* Catch vector-by-element operations. The by-element operand can
11353 either be (vec_duplicate (vec_select (x))) or just
11354 (vec_select (x)), depending on whether we are multiplying by
11355 a vector or a scalar.
11357 Canonicalization is not very good in these cases, FMA4 will put the
11358 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11359 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11360 op0
= XEXP (op0
, 0);
11361 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11362 op1
= XEXP (op1
, 0);
11364 if (GET_CODE (op0
) == VEC_SELECT
)
11365 op0
= XEXP (op0
, 0);
11366 else if (GET_CODE (op1
) == VEC_SELECT
)
11367 op1
= XEXP (op1
, 0);
11369 /* If the remaining parameters are not registers,
11370 get the cost to put them into registers. */
11371 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
11372 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
11373 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
11377 case UNSIGNED_FLOAT
:
11379 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
11385 if (VECTOR_MODE_P (mode
))
11387 /*Vector truncate. */
11388 *cost
+= extra_cost
->vect
.alu
;
11391 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
11395 case FLOAT_TRUNCATE
:
11398 if (VECTOR_MODE_P (mode
))
11400 /*Vector conversion. */
11401 *cost
+= extra_cost
->vect
.alu
;
11404 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
11411 /* Strip the rounding part. They will all be implemented
11412 by the fcvt* family of instructions anyway. */
11413 if (GET_CODE (x
) == UNSPEC
)
11415 unsigned int uns_code
= XINT (x
, 1);
11417 if (uns_code
== UNSPEC_FRINTA
11418 || uns_code
== UNSPEC_FRINTM
11419 || uns_code
== UNSPEC_FRINTN
11420 || uns_code
== UNSPEC_FRINTP
11421 || uns_code
== UNSPEC_FRINTZ
)
11422 x
= XVECEXP (x
, 0, 0);
11427 if (VECTOR_MODE_P (mode
))
11428 *cost
+= extra_cost
->vect
.alu
;
11430 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
11433 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11434 fixed-point fcvt. */
11435 if (GET_CODE (x
) == MULT
11436 && ((VECTOR_MODE_P (mode
)
11437 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
11438 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
11440 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
11445 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11449 if (VECTOR_MODE_P (mode
))
11451 /* ABS (vector). */
11453 *cost
+= extra_cost
->vect
.alu
;
11455 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11459 /* FABD, which is analogous to FADD. */
11460 if (GET_CODE (op0
) == MINUS
)
11462 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
11463 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
11465 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11469 /* Simple FABS is analogous to FNEG. */
11471 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11475 /* Integer ABS will either be split to
11476 two arithmetic instructions, or will be an ABS
11477 (scalar), which we don't model. */
11478 *cost
= COSTS_N_INSNS (2);
11480 *cost
+= 2 * extra_cost
->alu
.arith
;
11488 if (VECTOR_MODE_P (mode
))
11489 *cost
+= extra_cost
->vect
.alu
;
11492 /* FMAXNM/FMINNM/FMAX/FMIN.
11493 TODO: This may not be accurate for all implementations, but
11494 we do not model this in the cost tables. */
11495 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11501 /* The floating point round to integer frint* instructions. */
11502 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11505 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11510 if (XINT (x
, 1) == UNSPEC_RBIT
)
11513 *cost
+= extra_cost
->alu
.rev
;
11521 /* Decompose <su>muldi3_highpart. */
11522 if (/* (truncate:DI */
11525 && GET_MODE (XEXP (x
, 0)) == TImode
11526 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11528 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11529 /* (ANY_EXTEND:TI (reg:DI))
11530 (ANY_EXTEND:TI (reg:DI))) */
11531 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11532 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11533 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11534 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11535 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11536 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11537 /* (const_int 64) */
11538 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11539 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
11543 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11544 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11545 mode
, MULT
, 0, speed
);
11546 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11547 mode
, MULT
, 1, speed
);
11551 /* Fall through. */
11557 && flag_aarch64_verbose_cost
)
11558 fprintf (dump_file
,
11559 "\nFailed to cost RTX. Assuming default cost.\n");
11564 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11565 calculated for X. This cost is stored in *COST. Returns true
11566 if the total cost of X was calculated. */
11568 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
11569 int param
, int *cost
, bool speed
)
11571 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
11574 && flag_aarch64_verbose_cost
)
11576 print_rtl_single (dump_file
, x
);
11577 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
11578 speed
? "Hot" : "Cold",
11579 *cost
, result
? "final" : "partial");
11586 aarch64_register_move_cost (machine_mode mode
,
11587 reg_class_t from_i
, reg_class_t to_i
)
11589 enum reg_class from
= (enum reg_class
) from_i
;
11590 enum reg_class to
= (enum reg_class
) to_i
;
11591 const struct cpu_regmove_cost
*regmove_cost
11592 = aarch64_tune_params
.regmove_cost
;
11594 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11595 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
11598 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
11599 from
= GENERAL_REGS
;
11601 /* Moving between GPR and stack cost is the same as GP2GP. */
11602 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
11603 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
11604 return regmove_cost
->GP2GP
;
11606 /* To/From the stack register, we move via the gprs. */
11607 if (to
== STACK_REG
|| from
== STACK_REG
)
11608 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
11609 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
11611 if (known_eq (GET_MODE_SIZE (mode
), 16))
11613 /* 128-bit operations on general registers require 2 instructions. */
11614 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11615 return regmove_cost
->GP2GP
* 2;
11616 else if (from
== GENERAL_REGS
)
11617 return regmove_cost
->GP2FP
* 2;
11618 else if (to
== GENERAL_REGS
)
11619 return regmove_cost
->FP2GP
* 2;
11621 /* When AdvSIMD instructions are disabled it is not possible to move
11622 a 128-bit value directly between Q registers. This is handled in
11623 secondary reload. A general register is used as a scratch to move
11624 the upper DI value and the lower DI value is moved directly,
11625 hence the cost is the sum of three moves. */
11627 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
11629 return regmove_cost
->FP2FP
;
11632 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11633 return regmove_cost
->GP2GP
;
11634 else if (from
== GENERAL_REGS
)
11635 return regmove_cost
->GP2FP
;
11636 else if (to
== GENERAL_REGS
)
11637 return regmove_cost
->FP2GP
;
11639 return regmove_cost
->FP2FP
;
11643 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
11644 reg_class_t rclass ATTRIBUTE_UNUSED
,
11645 bool in ATTRIBUTE_UNUSED
)
11647 return aarch64_tune_params
.memmov_cost
;
11650 /* Implement TARGET_INIT_BUILTINS. */
11652 aarch64_init_builtins ()
11654 aarch64_general_init_builtins ();
11657 /* Implement TARGET_FOLD_BUILTIN. */
11659 aarch64_fold_builtin (tree fndecl
, int nargs
, tree
*args
, bool)
11661 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
11662 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
11663 tree type
= TREE_TYPE (TREE_TYPE (fndecl
));
11664 switch (code
& AARCH64_BUILTIN_CLASS
)
11666 case AARCH64_BUILTIN_GENERAL
:
11667 return aarch64_general_fold_builtin (subcode
, type
, nargs
, args
);
11669 gcc_unreachable ();
11672 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
11674 aarch64_gimple_fold_builtin (gimple_stmt_iterator
*gsi
)
11676 gcall
*stmt
= as_a
<gcall
*> (gsi_stmt (*gsi
));
11677 tree fndecl
= gimple_call_fndecl (stmt
);
11678 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
11679 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
11680 gimple
*new_stmt
= NULL
;
11681 switch (code
& AARCH64_BUILTIN_CLASS
)
11683 case AARCH64_BUILTIN_GENERAL
:
11684 new_stmt
= aarch64_general_gimple_fold_builtin (subcode
, stmt
);
11691 gsi_replace (gsi
, new_stmt
, true);
11695 /* Implement TARGET_EXPAND_BUILTIN. */
11697 aarch64_expand_builtin (tree exp
, rtx target
, rtx
, machine_mode
, int)
11699 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
11700 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
11701 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
11702 switch (code
& AARCH64_BUILTIN_CLASS
)
11704 case AARCH64_BUILTIN_GENERAL
:
11705 return aarch64_general_expand_builtin (subcode
, exp
, target
);
11707 gcc_unreachable ();
11710 /* Implement TARGET_BUILTIN_DECL. */
11712 aarch64_builtin_decl (unsigned int code
, bool initialize_p
)
11714 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
11715 switch (code
& AARCH64_BUILTIN_CLASS
)
11717 case AARCH64_BUILTIN_GENERAL
:
11718 return aarch64_general_builtin_decl (subcode
, initialize_p
);
11720 gcc_unreachable ();
11723 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11724 to optimize 1.0/sqrt. */
11727 use_rsqrt_p (machine_mode mode
)
11729 return (!flag_trapping_math
11730 && flag_unsafe_math_optimizations
11731 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
11732 & AARCH64_APPROX_MODE (mode
))
11733 || flag_mrecip_low_precision_sqrt
));
11736 /* Function to decide when to use the approximate reciprocal square root
11740 aarch64_builtin_reciprocal (tree fndecl
)
11742 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
11744 if (!use_rsqrt_p (mode
))
11746 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
11747 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
11748 switch (code
& AARCH64_BUILTIN_CLASS
)
11750 case AARCH64_BUILTIN_GENERAL
:
11751 return aarch64_general_builtin_rsqrt (subcode
);
11753 gcc_unreachable ();
11756 /* Emit instruction sequence to compute either the approximate square root
11757 or its approximate reciprocal, depending on the flag RECP, and return
11758 whether the sequence was emitted or not. */
11761 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
11763 machine_mode mode
= GET_MODE (dst
);
11765 if (GET_MODE_INNER (mode
) == HFmode
)
11767 gcc_assert (!recp
);
11773 if (!(flag_mlow_precision_sqrt
11774 || (aarch64_tune_params
.approx_modes
->sqrt
11775 & AARCH64_APPROX_MODE (mode
))))
11778 if (flag_finite_math_only
11779 || flag_trapping_math
11780 || !flag_unsafe_math_optimizations
11781 || optimize_function_for_size_p (cfun
))
11785 /* Caller assumes we cannot fail. */
11786 gcc_assert (use_rsqrt_p (mode
));
11788 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
11789 rtx xmsk
= gen_reg_rtx (mmsk
);
11791 /* When calculating the approximate square root, compare the
11792 argument with 0.0 and create a mask. */
11793 emit_insn (gen_rtx_SET (xmsk
,
11795 gen_rtx_EQ (mmsk
, src
,
11796 CONST0_RTX (mode
)))));
11798 /* Estimate the approximate reciprocal square root. */
11799 rtx xdst
= gen_reg_rtx (mode
);
11800 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
11802 /* Iterate over the series twice for SF and thrice for DF. */
11803 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11805 /* Optionally iterate over the series once less for faster performance
11806 while sacrificing the accuracy. */
11807 if ((recp
&& flag_mrecip_low_precision_sqrt
)
11808 || (!recp
&& flag_mlow_precision_sqrt
))
11811 /* Iterate over the series to calculate the approximate reciprocal square
11813 rtx x1
= gen_reg_rtx (mode
);
11814 while (iterations
--)
11816 rtx x2
= gen_reg_rtx (mode
);
11817 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
11819 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
11821 if (iterations
> 0)
11822 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
11827 /* Qualify the approximate reciprocal square root when the argument is
11828 0.0 by squashing the intermediary result to 0.0. */
11829 rtx xtmp
= gen_reg_rtx (mmsk
);
11830 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
11831 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
11832 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
11834 /* Calculate the approximate square root. */
11835 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
11838 /* Finalize the approximation. */
11839 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
11844 /* Emit the instruction sequence to compute the approximation for the division
11845 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11848 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
11850 machine_mode mode
= GET_MODE (quo
);
11852 if (GET_MODE_INNER (mode
) == HFmode
)
11855 bool use_approx_division_p
= (flag_mlow_precision_div
11856 || (aarch64_tune_params
.approx_modes
->division
11857 & AARCH64_APPROX_MODE (mode
)));
11859 if (!flag_finite_math_only
11860 || flag_trapping_math
11861 || !flag_unsafe_math_optimizations
11862 || optimize_function_for_size_p (cfun
)
11863 || !use_approx_division_p
)
11866 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
11869 /* Estimate the approximate reciprocal. */
11870 rtx xrcp
= gen_reg_rtx (mode
);
11871 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
11873 /* Iterate over the series twice for SF and thrice for DF. */
11874 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11876 /* Optionally iterate over the series once less for faster performance,
11877 while sacrificing the accuracy. */
11878 if (flag_mlow_precision_div
)
11881 /* Iterate over the series to calculate the approximate reciprocal. */
11882 rtx xtmp
= gen_reg_rtx (mode
);
11883 while (iterations
--)
11885 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
11887 if (iterations
> 0)
11888 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11891 if (num
!= CONST1_RTX (mode
))
11893 /* As the approximate reciprocal of DEN is already calculated, only
11894 calculate the approximate division when NUM is not 1.0. */
11895 rtx xnum
= force_reg (mode
, num
);
11896 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
11899 /* Finalize the approximation. */
11900 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11904 /* Return the number of instructions that can be issued per cycle. */
11906 aarch64_sched_issue_rate (void)
11908 return aarch64_tune_params
.issue_rate
;
11911 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11913 aarch64_sched_variable_issue (FILE *, int, rtx_insn
*insn
, int more
)
11915 if (DEBUG_INSN_P (insn
))
11918 rtx_code code
= GET_CODE (PATTERN (insn
));
11919 if (code
== USE
|| code
== CLOBBER
)
11922 if (get_attr_type (insn
) == TYPE_NO_INSN
)
11929 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11931 int issue_rate
= aarch64_sched_issue_rate ();
11933 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
11937 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11938 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11939 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11942 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
11945 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
11949 /* Vectorizer cost model target hooks. */
11951 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11953 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
11955 int misalign ATTRIBUTE_UNUSED
)
11958 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
11961 if (vectype
!= NULL
)
11962 fp
= FLOAT_TYPE_P (vectype
);
11964 switch (type_of_cost
)
11967 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
11970 return costs
->scalar_load_cost
;
11973 return costs
->scalar_store_cost
;
11976 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11979 return costs
->vec_align_load_cost
;
11982 return costs
->vec_store_cost
;
11984 case vec_to_scalar
:
11985 return costs
->vec_to_scalar_cost
;
11987 case scalar_to_vec
:
11988 return costs
->scalar_to_vec_cost
;
11990 case unaligned_load
:
11991 case vector_gather_load
:
11992 return costs
->vec_unalign_load_cost
;
11994 case unaligned_store
:
11995 case vector_scatter_store
:
11996 return costs
->vec_unalign_store_cost
;
11998 case cond_branch_taken
:
11999 return costs
->cond_taken_branch_cost
;
12001 case cond_branch_not_taken
:
12002 return costs
->cond_not_taken_branch_cost
;
12005 return costs
->vec_permute_cost
;
12007 case vec_promote_demote
:
12008 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
12010 case vec_construct
:
12011 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
12012 return elements
/ 2 + 1;
12015 gcc_unreachable ();
12019 /* Implement targetm.vectorize.add_stmt_cost. */
12021 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
12022 struct _stmt_vec_info
*stmt_info
, int misalign
,
12023 enum vect_cost_model_location where
)
12025 unsigned *cost
= (unsigned *) data
;
12026 unsigned retval
= 0;
12028 if (flag_vect_cost_model
)
12030 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
12032 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
12034 /* Statements in an inner loop relative to the loop being
12035 vectorized are weighted more heavily. The value here is
12036 arbitrary and could potentially be improved with analysis. */
12037 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
12038 count
*= 50; /* FIXME */
12040 retval
= (unsigned) (count
* stmt_cost
);
12041 cost
[where
] += retval
;
12047 static void initialize_aarch64_code_model (struct gcc_options
*);
12049 /* Parse the TO_PARSE string and put the architecture struct that it
12050 selects into RES and the architectural features into ISA_FLAGS.
12051 Return an aarch64_parse_opt_result describing the parse result.
12052 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12053 When the TO_PARSE string contains an invalid extension,
12054 a copy of the string is created and stored to INVALID_EXTENSION. */
12056 static enum aarch64_parse_opt_result
12057 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
12058 uint64_t *isa_flags
, std::string
*invalid_extension
)
12061 const struct processor
*arch
;
12064 ext
= strchr (to_parse
, '+');
12067 len
= ext
- to_parse
;
12069 len
= strlen (to_parse
);
12072 return AARCH64_PARSE_MISSING_ARG
;
12075 /* Loop through the list of supported ARCHes to find a match. */
12076 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
12078 if (strlen (arch
->name
) == len
12079 && strncmp (arch
->name
, to_parse
, len
) == 0)
12081 uint64_t isa_temp
= arch
->flags
;
12085 /* TO_PARSE string contains at least one extension. */
12086 enum aarch64_parse_opt_result ext_res
12087 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12089 if (ext_res
!= AARCH64_PARSE_OK
)
12092 /* Extension parsing was successful. Confirm the result
12093 arch and ISA flags. */
12095 *isa_flags
= isa_temp
;
12096 return AARCH64_PARSE_OK
;
12100 /* ARCH name not found in list. */
12101 return AARCH64_PARSE_INVALID_ARG
;
12104 /* Parse the TO_PARSE string and put the result tuning in RES and the
12105 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12106 describing the parse result. If there is an error parsing, RES and
12107 ISA_FLAGS are left unchanged.
12108 When the TO_PARSE string contains an invalid extension,
12109 a copy of the string is created and stored to INVALID_EXTENSION. */
12111 static enum aarch64_parse_opt_result
12112 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
12113 uint64_t *isa_flags
, std::string
*invalid_extension
)
12116 const struct processor
*cpu
;
12119 ext
= strchr (to_parse
, '+');
12122 len
= ext
- to_parse
;
12124 len
= strlen (to_parse
);
12127 return AARCH64_PARSE_MISSING_ARG
;
12130 /* Loop through the list of supported CPUs to find a match. */
12131 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12133 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
12135 uint64_t isa_temp
= cpu
->flags
;
12140 /* TO_PARSE string contains at least one extension. */
12141 enum aarch64_parse_opt_result ext_res
12142 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
12144 if (ext_res
!= AARCH64_PARSE_OK
)
12147 /* Extension parsing was successfull. Confirm the result
12148 cpu and ISA flags. */
12150 *isa_flags
= isa_temp
;
12151 return AARCH64_PARSE_OK
;
12155 /* CPU name not found in list. */
12156 return AARCH64_PARSE_INVALID_ARG
;
12159 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12160 Return an aarch64_parse_opt_result describing the parse result.
12161 If the parsing fails the RES does not change. */
12163 static enum aarch64_parse_opt_result
12164 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
12166 const struct processor
*cpu
;
12168 /* Loop through the list of supported CPUs to find a match. */
12169 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
12171 if (strcmp (cpu
->name
, to_parse
) == 0)
12174 return AARCH64_PARSE_OK
;
12178 /* CPU name not found in list. */
12179 return AARCH64_PARSE_INVALID_ARG
;
12182 /* Parse TOKEN, which has length LENGTH to see if it is an option
12183 described in FLAG. If it is, return the index bit for that fusion type.
12184 If not, error (printing OPTION_NAME) and return zero. */
12186 static unsigned int
12187 aarch64_parse_one_option_token (const char *token
,
12189 const struct aarch64_flag_desc
*flag
,
12190 const char *option_name
)
12192 for (; flag
->name
!= NULL
; flag
++)
12194 if (length
== strlen (flag
->name
)
12195 && !strncmp (flag
->name
, token
, length
))
12199 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
12203 /* Parse OPTION which is a comma-separated list of flags to enable.
12204 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12205 default state we inherit from the CPU tuning structures. OPTION_NAME
12206 gives the top-level option we are parsing in the -moverride string,
12207 for use in error messages. */
12209 static unsigned int
12210 aarch64_parse_boolean_options (const char *option
,
12211 const struct aarch64_flag_desc
*flags
,
12212 unsigned int initial_state
,
12213 const char *option_name
)
12215 const char separator
= '.';
12216 const char* specs
= option
;
12217 const char* ntoken
= option
;
12218 unsigned int found_flags
= initial_state
;
12220 while ((ntoken
= strchr (specs
, separator
)))
12222 size_t token_length
= ntoken
- specs
;
12223 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12227 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12228 in the token stream, reset the supported operations. So:
12230 adrp+add.cmp+branch.none.adrp+add
12232 would have the result of turning on only adrp+add fusion. */
12236 found_flags
|= token_ops
;
12240 /* We ended with a comma, print something. */
12243 error ("%s string ill-formed\n", option_name
);
12247 /* We still have one more token to parse. */
12248 size_t token_length
= strlen (specs
);
12249 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12256 found_flags
|= token_ops
;
12257 return found_flags
;
12260 /* Support for overriding instruction fusion. */
12263 aarch64_parse_fuse_string (const char *fuse_string
,
12264 struct tune_params
*tune
)
12266 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
12267 aarch64_fusible_pairs
,
12272 /* Support for overriding other tuning flags. */
12275 aarch64_parse_tune_string (const char *tune_string
,
12276 struct tune_params
*tune
)
12278 tune
->extra_tuning_flags
12279 = aarch64_parse_boolean_options (tune_string
,
12280 aarch64_tuning_flags
,
12281 tune
->extra_tuning_flags
,
12285 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12286 Accept the valid SVE vector widths allowed by
12287 aarch64_sve_vector_bits_enum and use it to override sve_width
12291 aarch64_parse_sve_width_string (const char *tune_string
,
12292 struct tune_params
*tune
)
12296 int n
= sscanf (tune_string
, "%d", &width
);
12299 error ("invalid format for sve_width");
12311 error ("invalid sve_width value: %d", width
);
12313 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
12316 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12317 we understand. If it is, extract the option string and handoff to
12318 the appropriate function. */
12321 aarch64_parse_one_override_token (const char* token
,
12323 struct tune_params
*tune
)
12325 const struct aarch64_tuning_override_function
*fn
12326 = aarch64_tuning_override_functions
;
12328 const char *option_part
= strchr (token
, '=');
12331 error ("tuning string missing in option (%s)", token
);
12335 /* Get the length of the option name. */
12336 length
= option_part
- token
;
12337 /* Skip the '=' to get to the option string. */
12340 for (; fn
->name
!= NULL
; fn
++)
12342 if (!strncmp (fn
->name
, token
, length
))
12344 fn
->parse_override (option_part
, tune
);
12349 error ("unknown tuning option (%s)",token
);
12353 /* A checking mechanism for the implementation of the tls size. */
12356 initialize_aarch64_tls_size (struct gcc_options
*opts
)
12358 if (aarch64_tls_size
== 0)
12359 aarch64_tls_size
= 24;
12361 switch (opts
->x_aarch64_cmodel_var
)
12363 case AARCH64_CMODEL_TINY
:
12364 /* Both the default and maximum TLS size allowed under tiny is 1M which
12365 needs two instructions to address, so we clamp the size to 24. */
12366 if (aarch64_tls_size
> 24)
12367 aarch64_tls_size
= 24;
12369 case AARCH64_CMODEL_SMALL
:
12370 /* The maximum TLS size allowed under small is 4G. */
12371 if (aarch64_tls_size
> 32)
12372 aarch64_tls_size
= 32;
12374 case AARCH64_CMODEL_LARGE
:
12375 /* The maximum TLS size allowed under large is 16E.
12376 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12377 if (aarch64_tls_size
> 48)
12378 aarch64_tls_size
= 48;
12381 gcc_unreachable ();
12387 /* Parse STRING looking for options in the format:
12388 string :: option:string
12389 option :: name=substring
12391 substring :: defined by option. */
12394 aarch64_parse_override_string (const char* input_string
,
12395 struct tune_params
* tune
)
12397 const char separator
= ':';
12398 size_t string_length
= strlen (input_string
) + 1;
12399 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
12400 char *string
= string_root
;
12401 strncpy (string
, input_string
, string_length
);
12402 string
[string_length
- 1] = '\0';
12404 char* ntoken
= string
;
12406 while ((ntoken
= strchr (string
, separator
)))
12408 size_t token_length
= ntoken
- string
;
12409 /* Make this substring look like a string. */
12411 aarch64_parse_one_override_token (string
, token_length
, tune
);
12415 /* One last option to parse. */
12416 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
12417 free (string_root
);
12422 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
12424 if (accepted_branch_protection_string
)
12426 opts
->x_aarch64_branch_protection_string
12427 = xstrdup (accepted_branch_protection_string
);
12430 /* PR 70044: We have to be careful about being called multiple times for the
12431 same function. This means all changes should be repeatable. */
12433 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12434 Disable the frame pointer flag so the mid-end will not use a frame
12435 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12436 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12437 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12438 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
12439 if (opts
->x_flag_omit_frame_pointer
== 0)
12440 opts
->x_flag_omit_frame_pointer
= 2;
12442 /* If not optimizing for size, set the default
12443 alignment to what the target wants. */
12444 if (!opts
->x_optimize_size
)
12446 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
12447 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
12448 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
12449 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
12450 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
12451 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
12454 /* We default to no pc-relative literal loads. */
12456 aarch64_pcrelative_literal_loads
= false;
12458 /* If -mpc-relative-literal-loads is set on the command line, this
12459 implies that the user asked for PC relative literal loads. */
12460 if (opts
->x_pcrelative_literal_loads
== 1)
12461 aarch64_pcrelative_literal_loads
= true;
12463 /* In the tiny memory model it makes no sense to disallow PC relative
12464 literal pool loads. */
12465 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12466 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12467 aarch64_pcrelative_literal_loads
= true;
12469 /* When enabling the lower precision Newton series for the square root, also
12470 enable it for the reciprocal square root, since the latter is an
12471 intermediary step for the former. */
12472 if (flag_mlow_precision_sqrt
)
12473 flag_mrecip_low_precision_sqrt
= true;
12476 /* 'Unpack' up the internal tuning structs and update the options
12477 in OPTS. The caller must have set up selected_tune and selected_arch
12478 as all the other target-specific codegen decisions are
12479 derived from them. */
12482 aarch64_override_options_internal (struct gcc_options
*opts
)
12484 aarch64_tune_flags
= selected_tune
->flags
;
12485 aarch64_tune
= selected_tune
->sched_core
;
12486 /* Make a copy of the tuning parameters attached to the core, which
12487 we may later overwrite. */
12488 aarch64_tune_params
= *(selected_tune
->tune
);
12489 aarch64_architecture_version
= selected_arch
->architecture_version
;
12491 if (opts
->x_aarch64_override_tune_string
)
12492 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
12493 &aarch64_tune_params
);
12495 /* This target defaults to strict volatile bitfields. */
12496 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
12497 opts
->x_flag_strict_volatile_bitfields
= 1;
12499 if (aarch64_stack_protector_guard
== SSP_GLOBAL
12500 && opts
->x_aarch64_stack_protector_guard_offset_str
)
12502 error ("incompatible options %<-mstack-protector-guard=global%> and "
12503 "%<-mstack-protector-guard-offset=%s%>",
12504 aarch64_stack_protector_guard_offset_str
);
12507 if (aarch64_stack_protector_guard
== SSP_SYSREG
12508 && !(opts
->x_aarch64_stack_protector_guard_offset_str
12509 && opts
->x_aarch64_stack_protector_guard_reg_str
))
12511 error ("both %<-mstack-protector-guard-offset%> and "
12512 "%<-mstack-protector-guard-reg%> must be used "
12513 "with %<-mstack-protector-guard=sysreg%>");
12516 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
12518 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
12519 error ("specify a system register with a small string length.");
12522 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
12525 const char *str
= aarch64_stack_protector_guard_offset_str
;
12527 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
12528 if (!*str
|| *end
|| errno
)
12529 error ("%qs is not a valid offset in %qs", str
,
12530 "-mstack-protector-guard-offset=");
12531 aarch64_stack_protector_guard_offset
= offs
;
12534 initialize_aarch64_code_model (opts
);
12535 initialize_aarch64_tls_size (opts
);
12537 int queue_depth
= 0;
12538 switch (aarch64_tune_params
.autoprefetcher_model
)
12540 case tune_params::AUTOPREFETCHER_OFF
:
12543 case tune_params::AUTOPREFETCHER_WEAK
:
12546 case tune_params::AUTOPREFETCHER_STRONG
:
12547 queue_depth
= max_insn_queue_index
+ 1;
12550 gcc_unreachable ();
12553 /* We don't mind passing in global_options_set here as we don't use
12554 the *options_set structs anyway. */
12555 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
12557 opts
->x_param_values
,
12558 global_options_set
.x_param_values
);
12560 /* Set up parameters to be used in prefetching algorithm. Do not
12561 override the defaults unless we are tuning for a core we have
12562 researched values for. */
12563 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
12564 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
12565 aarch64_tune_params
.prefetch
->num_slots
,
12566 opts
->x_param_values
,
12567 global_options_set
.x_param_values
);
12568 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
12569 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
12570 aarch64_tune_params
.prefetch
->l1_cache_size
,
12571 opts
->x_param_values
,
12572 global_options_set
.x_param_values
);
12573 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
12574 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
12575 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
12576 opts
->x_param_values
,
12577 global_options_set
.x_param_values
);
12578 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
12579 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
12580 aarch64_tune_params
.prefetch
->l2_cache_size
,
12581 opts
->x_param_values
,
12582 global_options_set
.x_param_values
);
12583 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
12584 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
12586 opts
->x_param_values
,
12587 global_options_set
.x_param_values
);
12588 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
12589 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
12590 aarch64_tune_params
.prefetch
->minimum_stride
,
12591 opts
->x_param_values
,
12592 global_options_set
.x_param_values
);
12594 /* Use the alternative scheduling-pressure algorithm by default. */
12595 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
12596 opts
->x_param_values
,
12597 global_options_set
.x_param_values
);
12599 /* If the user hasn't changed it via configure then set the default to 64 KB
12600 for the backend. */
12601 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
12602 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
12603 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
12604 opts
->x_param_values
,
12605 global_options_set
.x_param_values
);
12607 /* Validate the guard size. */
12608 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
12610 /* Enforce that interval is the same size as size so the mid-end does the
12612 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
12614 opts
->x_param_values
,
12615 global_options_set
.x_param_values
);
12617 /* The maybe_set calls won't update the value if the user has explicitly set
12618 one. Which means we need to validate that probing interval and guard size
12621 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
12622 if (guard_size
!= probe_interval
)
12623 error ("stack clash guard size %<%d%> must be equal to probing interval "
12624 "%<%d%>", guard_size
, probe_interval
);
12626 /* Enable sw prefetching at specified optimization level for
12627 CPUS that have prefetch. Lower optimization level threshold by 1
12628 when profiling is enabled. */
12629 if (opts
->x_flag_prefetch_loop_arrays
< 0
12630 && !opts
->x_optimize_size
12631 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
12632 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
12633 opts
->x_flag_prefetch_loop_arrays
= 1;
12635 if (opts
->x_aarch64_arch_string
== NULL
)
12636 opts
->x_aarch64_arch_string
= selected_arch
->name
;
12637 if (opts
->x_aarch64_cpu_string
== NULL
)
12638 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
12639 if (opts
->x_aarch64_tune_string
== NULL
)
12640 opts
->x_aarch64_tune_string
= selected_tune
->name
;
12642 aarch64_override_options_after_change_1 (opts
);
12645 /* Print a hint with a suggestion for a core or architecture name that
12646 most closely resembles what the user passed in STR. ARCH is true if
12647 the user is asking for an architecture name. ARCH is false if the user
12648 is asking for a core name. */
12651 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
12653 auto_vec
<const char *> candidates
;
12654 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
12655 for (; entry
->name
!= NULL
; entry
++)
12656 candidates
.safe_push (entry
->name
);
12658 #ifdef HAVE_LOCAL_CPU_DETECT
12659 /* Add also "native" as possible value. */
12661 candidates
.safe_push ("native");
12665 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
12667 inform (input_location
, "valid arguments are: %s;"
12668 " did you mean %qs?", s
, hint
);
12670 inform (input_location
, "valid arguments are: %s", s
);
12675 /* Print a hint with a suggestion for a core name that most closely resembles
12676 what the user passed in STR. */
12679 aarch64_print_hint_for_core (const char *str
)
12681 aarch64_print_hint_for_core_or_arch (str
, false);
12684 /* Print a hint with a suggestion for an architecture name that most closely
12685 resembles what the user passed in STR. */
12688 aarch64_print_hint_for_arch (const char *str
)
12690 aarch64_print_hint_for_core_or_arch (str
, true);
12694 /* Print a hint with a suggestion for an extension name
12695 that most closely resembles what the user passed in STR. */
12698 aarch64_print_hint_for_extensions (const std::string
&str
)
12700 auto_vec
<const char *> candidates
;
12701 aarch64_get_all_extension_candidates (&candidates
);
12703 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
12705 inform (input_location
, "valid arguments are: %s;"
12706 " did you mean %qs?", s
, hint
);
12708 inform (input_location
, "valid arguments are: %s;", s
);
12713 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12714 specified in STR and throw errors if appropriate. Put the results if
12715 they are valid in RES and ISA_FLAGS. Return whether the option is
12719 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
12720 uint64_t *isa_flags
)
12722 std::string invalid_extension
;
12723 enum aarch64_parse_opt_result parse_res
12724 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
12726 if (parse_res
== AARCH64_PARSE_OK
)
12731 case AARCH64_PARSE_MISSING_ARG
:
12732 error ("missing cpu name in %<-mcpu=%s%>", str
);
12734 case AARCH64_PARSE_INVALID_ARG
:
12735 error ("unknown value %qs for %<-mcpu%>", str
);
12736 aarch64_print_hint_for_core (str
);
12738 case AARCH64_PARSE_INVALID_FEATURE
:
12739 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12740 invalid_extension
.c_str (), str
);
12741 aarch64_print_hint_for_extensions (invalid_extension
);
12744 gcc_unreachable ();
12750 /* Parses CONST_STR for branch protection features specified in
12751 aarch64_branch_protect_types, and set any global variables required. Returns
12752 the parsing result and assigns LAST_STR to the last processed token from
12753 CONST_STR so that it can be used for error reporting. */
12756 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
12759 char *str_root
= xstrdup (const_str
);
12760 char* token_save
= NULL
;
12761 char *str
= strtok_r (str_root
, "+", &token_save
);
12762 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
12764 res
= AARCH64_PARSE_MISSING_ARG
;
12767 char *next_str
= strtok_r (NULL
, "+", &token_save
);
12768 /* Reset the branch protection features to their defaults. */
12769 aarch64_handle_no_branch_protection (NULL
, NULL
);
12771 while (str
&& res
== AARCH64_PARSE_OK
)
12773 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
12774 bool found
= false;
12775 /* Search for this type. */
12776 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
12778 if (strcmp (str
, type
->name
) == 0)
12781 res
= type
->handler (str
, next_str
);
12783 next_str
= strtok_r (NULL
, "+", &token_save
);
12788 if (found
&& res
== AARCH64_PARSE_OK
)
12790 bool found_subtype
= true;
12791 /* Loop through each token until we find one that isn't a
12793 while (found_subtype
)
12795 found_subtype
= false;
12796 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
12797 /* Search for the subtype. */
12798 while (str
&& subtype
&& subtype
->name
&& !found_subtype
12799 && res
== AARCH64_PARSE_OK
)
12801 if (strcmp (str
, subtype
->name
) == 0)
12803 found_subtype
= true;
12804 res
= subtype
->handler (str
, next_str
);
12806 next_str
= strtok_r (NULL
, "+", &token_save
);
12814 res
= AARCH64_PARSE_INVALID_ARG
;
12817 /* Copy the last processed token into the argument to pass it back.
12818 Used by option and attribute validation to print the offending token. */
12821 if (str
) strcpy (*last_str
, str
);
12822 else *last_str
= NULL
;
12824 if (res
== AARCH64_PARSE_OK
)
12826 /* If needed, alloc the accepted string then copy in const_str.
12827 Used by override_option_after_change_1. */
12828 if (!accepted_branch_protection_string
)
12829 accepted_branch_protection_string
= (char *) xmalloc (
12830 BRANCH_PROTECT_STR_MAX
12832 strncpy (accepted_branch_protection_string
, const_str
,
12833 BRANCH_PROTECT_STR_MAX
+ 1);
12834 /* Forcibly null-terminate. */
12835 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
12841 aarch64_validate_mbranch_protection (const char *const_str
)
12843 char *str
= (char *) xmalloc (strlen (const_str
));
12844 enum aarch64_parse_opt_result res
=
12845 aarch64_parse_branch_protection (const_str
, &str
);
12846 if (res
== AARCH64_PARSE_INVALID_ARG
)
12847 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
12848 else if (res
== AARCH64_PARSE_MISSING_ARG
)
12849 error ("missing argument for %<-mbranch-protection=%>");
12851 return res
== AARCH64_PARSE_OK
;
12854 /* Validate a command-line -march option. Parse the arch and extensions
12855 (if any) specified in STR and throw errors if appropriate. Put the
12856 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12857 option is valid. */
12860 aarch64_validate_march (const char *str
, const struct processor
**res
,
12861 uint64_t *isa_flags
)
12863 std::string invalid_extension
;
12864 enum aarch64_parse_opt_result parse_res
12865 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
12867 if (parse_res
== AARCH64_PARSE_OK
)
12872 case AARCH64_PARSE_MISSING_ARG
:
12873 error ("missing arch name in %<-march=%s%>", str
);
12875 case AARCH64_PARSE_INVALID_ARG
:
12876 error ("unknown value %qs for %<-march%>", str
);
12877 aarch64_print_hint_for_arch (str
);
12879 case AARCH64_PARSE_INVALID_FEATURE
:
12880 error ("invalid feature modifier %qs in %<-march=%s%>",
12881 invalid_extension
.c_str (), str
);
12882 aarch64_print_hint_for_extensions (invalid_extension
);
12885 gcc_unreachable ();
12891 /* Validate a command-line -mtune option. Parse the cpu
12892 specified in STR and throw errors if appropriate. Put the
12893 result, if it is valid, in RES. Return whether the option is
12897 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
12899 enum aarch64_parse_opt_result parse_res
12900 = aarch64_parse_tune (str
, res
);
12902 if (parse_res
== AARCH64_PARSE_OK
)
12907 case AARCH64_PARSE_MISSING_ARG
:
12908 error ("missing cpu name in %<-mtune=%s%>", str
);
12910 case AARCH64_PARSE_INVALID_ARG
:
12911 error ("unknown value %qs for %<-mtune%>", str
);
12912 aarch64_print_hint_for_core (str
);
12915 gcc_unreachable ();
12920 /* Return the CPU corresponding to the enum CPU.
12921 If it doesn't specify a cpu, return the default. */
12923 static const struct processor
*
12924 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
12926 if (cpu
!= aarch64_none
)
12927 return &all_cores
[cpu
];
12929 /* The & 0x3f is to extract the bottom 6 bits that encode the
12930 default cpu as selected by the --with-cpu GCC configure option
12932 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12933 flags mechanism should be reworked to make it more sane. */
12934 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12937 /* Return the architecture corresponding to the enum ARCH.
12938 If it doesn't specify a valid architecture, return the default. */
12940 static const struct processor
*
12941 aarch64_get_arch (enum aarch64_arch arch
)
12943 if (arch
!= aarch64_no_arch
)
12944 return &all_architectures
[arch
];
12946 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12948 return &all_architectures
[cpu
->arch
];
12951 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12954 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
12956 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12957 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12958 deciding which .md file patterns to use and when deciding whether
12959 something is a legitimate address or constant. */
12960 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
12961 return poly_uint16 (2, 2);
12963 return (int) value
/ 64;
12966 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12967 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12968 tuning structs. In particular it must set selected_tune and
12969 aarch64_isa_flags that define the available ISA features and tuning
12970 decisions. It must also set selected_arch as this will be used to
12971 output the .arch asm tags for each function. */
12974 aarch64_override_options (void)
12976 uint64_t cpu_isa
= 0;
12977 uint64_t arch_isa
= 0;
12978 aarch64_isa_flags
= 0;
12980 bool valid_cpu
= true;
12981 bool valid_tune
= true;
12982 bool valid_arch
= true;
12984 selected_cpu
= NULL
;
12985 selected_arch
= NULL
;
12986 selected_tune
= NULL
;
12988 if (aarch64_branch_protection_string
)
12989 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
12991 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12992 If either of -march or -mtune is given, they override their
12993 respective component of -mcpu. */
12994 if (aarch64_cpu_string
)
12995 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
12998 if (aarch64_arch_string
)
12999 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
13002 if (aarch64_tune_string
)
13003 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
13005 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13006 SUBTARGET_OVERRIDE_OPTIONS
;
13009 /* If the user did not specify a processor, choose the default
13010 one for them. This will be the CPU set during configuration using
13011 --with-cpu, otherwise it is "generic". */
13016 selected_cpu
= &all_cores
[selected_arch
->ident
];
13017 aarch64_isa_flags
= arch_isa
;
13018 explicit_arch
= selected_arch
->arch
;
13022 /* Get default configure-time CPU. */
13023 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
13024 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
13028 explicit_tune_core
= selected_tune
->ident
;
13030 /* If both -mcpu and -march are specified check that they are architecturally
13031 compatible, warn if they're not and prefer the -march ISA flags. */
13032 else if (selected_arch
)
13034 if (selected_arch
->arch
!= selected_cpu
->arch
)
13036 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13037 all_architectures
[selected_cpu
->arch
].name
,
13038 selected_arch
->name
);
13040 aarch64_isa_flags
= arch_isa
;
13041 explicit_arch
= selected_arch
->arch
;
13042 explicit_tune_core
= selected_tune
? selected_tune
->ident
13043 : selected_cpu
->ident
;
13047 /* -mcpu but no -march. */
13048 aarch64_isa_flags
= cpu_isa
;
13049 explicit_tune_core
= selected_tune
? selected_tune
->ident
13050 : selected_cpu
->ident
;
13051 gcc_assert (selected_cpu
);
13052 selected_arch
= &all_architectures
[selected_cpu
->arch
];
13053 explicit_arch
= selected_arch
->arch
;
13056 /* Set the arch as well as we will need it when outputing
13057 the .arch directive in assembly. */
13058 if (!selected_arch
)
13060 gcc_assert (selected_cpu
);
13061 selected_arch
= &all_architectures
[selected_cpu
->arch
];
13064 if (!selected_tune
)
13065 selected_tune
= selected_cpu
;
13067 if (aarch64_enable_bti
== 2)
13069 #ifdef TARGET_ENABLE_BTI
13070 aarch64_enable_bti
= 1;
13072 aarch64_enable_bti
= 0;
13076 /* Return address signing is currently not supported for ILP32 targets. For
13077 LP64 targets use the configured option in the absence of a command-line
13078 option for -mbranch-protection. */
13079 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
13081 #ifdef TARGET_ENABLE_PAC_RET
13082 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
13084 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
13088 #ifndef HAVE_AS_MABI_OPTION
13089 /* The compiler may have been configured with 2.23.* binutils, which does
13090 not have support for ILP32. */
13092 error ("assembler does not support %<-mabi=ilp32%>");
13095 /* Convert -msve-vector-bits to a VG count. */
13096 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
13098 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
13099 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13101 /* Make sure we properly set up the explicit options. */
13102 if ((aarch64_cpu_string
&& valid_cpu
)
13103 || (aarch64_tune_string
&& valid_tune
))
13104 gcc_assert (explicit_tune_core
!= aarch64_none
);
13106 if ((aarch64_cpu_string
&& valid_cpu
)
13107 || (aarch64_arch_string
&& valid_arch
))
13108 gcc_assert (explicit_arch
!= aarch64_no_arch
);
13110 /* The pass to insert speculation tracking runs before
13111 shrink-wrapping and the latter does not know how to update the
13112 tracking status. So disable it in this case. */
13113 if (aarch64_track_speculation
)
13114 flag_shrink_wrap
= 0;
13116 aarch64_override_options_internal (&global_options
);
13118 /* Save these options as the default ones in case we push and pop them later
13119 while processing functions with potential target attributes. */
13120 target_option_default_node
= target_option_current_node
13121 = build_target_option_node (&global_options
);
13124 /* Implement targetm.override_options_after_change. */
13127 aarch64_override_options_after_change (void)
13129 aarch64_override_options_after_change_1 (&global_options
);
13132 static struct machine_function
*
13133 aarch64_init_machine_status (void)
13135 struct machine_function
*machine
;
13136 machine
= ggc_cleared_alloc
<machine_function
> ();
13141 aarch64_init_expanders (void)
13143 init_machine_status
= aarch64_init_machine_status
;
13146 /* A checking mechanism for the implementation of the various code models. */
13148 initialize_aarch64_code_model (struct gcc_options
*opts
)
13150 if (opts
->x_flag_pic
)
13152 switch (opts
->x_aarch64_cmodel_var
)
13154 case AARCH64_CMODEL_TINY
:
13155 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
13157 case AARCH64_CMODEL_SMALL
:
13158 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13159 aarch64_cmodel
= (flag_pic
== 2
13160 ? AARCH64_CMODEL_SMALL_PIC
13161 : AARCH64_CMODEL_SMALL_SPIC
);
13163 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
13166 case AARCH64_CMODEL_LARGE
:
13167 sorry ("code model %qs with %<-f%s%>", "large",
13168 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
13171 gcc_unreachable ();
13175 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
13178 /* Implement TARGET_OPTION_SAVE. */
13181 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
13183 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
13184 ptr
->x_aarch64_branch_protection_string
13185 = opts
->x_aarch64_branch_protection_string
;
13188 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13189 using the information saved in PTR. */
13192 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
13194 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
13195 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
13196 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
13197 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
13198 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
13199 opts
->x_aarch64_branch_protection_string
13200 = ptr
->x_aarch64_branch_protection_string
;
13201 if (opts
->x_aarch64_branch_protection_string
)
13203 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
13207 aarch64_override_options_internal (opts
);
13210 /* Implement TARGET_OPTION_PRINT. */
13213 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
13215 const struct processor
*cpu
13216 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
13217 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
13218 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
13219 std::string extension
13220 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
13222 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
13223 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
13224 arch
->name
, extension
.c_str ());
13227 static GTY(()) tree aarch64_previous_fndecl
;
13230 aarch64_reset_previous_fndecl (void)
13232 aarch64_previous_fndecl
= NULL
;
13235 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13236 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13237 make sure optab availability predicates are recomputed when necessary. */
13240 aarch64_save_restore_target_globals (tree new_tree
)
13242 if (TREE_TARGET_GLOBALS (new_tree
))
13243 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
13244 else if (new_tree
== target_option_default_node
)
13245 restore_target_globals (&default_target_globals
);
13247 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
13250 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13251 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13252 of the function, if such exists. This function may be called multiple
13253 times on a single function so use aarch64_previous_fndecl to avoid
13254 setting up identical state. */
13257 aarch64_set_current_function (tree fndecl
)
13259 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
13262 tree old_tree
= (aarch64_previous_fndecl
13263 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
13266 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13268 /* If current function has no attributes but the previous one did,
13269 use the default node. */
13270 if (!new_tree
&& old_tree
)
13271 new_tree
= target_option_default_node
;
13273 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13274 the default have been handled by aarch64_save_restore_target_globals from
13275 aarch64_pragma_target_parse. */
13276 if (old_tree
== new_tree
)
13279 aarch64_previous_fndecl
= fndecl
;
13281 /* First set the target options. */
13282 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
13284 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
13298 /* All the information needed to handle a target attribute.
13299 NAME is the name of the attribute.
13300 ATTR_TYPE specifies the type of behavior of the attribute as described
13301 in the definition of enum aarch64_attr_opt_type.
13302 ALLOW_NEG is true if the attribute supports a "no-" form.
13303 HANDLER is the function that takes the attribute string as an argument
13304 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13305 OPT_NUM is the enum specifying the option that the attribute modifies.
13306 This is needed for attributes that mirror the behavior of a command-line
13307 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13308 aarch64_attr_enum. */
13310 struct aarch64_attribute_info
13313 enum aarch64_attr_opt_type attr_type
;
13315 bool (*handler
) (const char *);
13316 enum opt_code opt_num
;
13319 /* Handle the ARCH_STR argument to the arch= target attribute. */
13322 aarch64_handle_attr_arch (const char *str
)
13324 const struct processor
*tmp_arch
= NULL
;
13325 std::string invalid_extension
;
13326 enum aarch64_parse_opt_result parse_res
13327 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
13329 if (parse_res
== AARCH64_PARSE_OK
)
13331 gcc_assert (tmp_arch
);
13332 selected_arch
= tmp_arch
;
13333 explicit_arch
= selected_arch
->arch
;
13339 case AARCH64_PARSE_MISSING_ARG
:
13340 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13342 case AARCH64_PARSE_INVALID_ARG
:
13343 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
13344 aarch64_print_hint_for_arch (str
);
13346 case AARCH64_PARSE_INVALID_FEATURE
:
13347 error ("invalid feature modifier %s of value (\"%s\") in "
13348 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13349 aarch64_print_hint_for_extensions (invalid_extension
);
13352 gcc_unreachable ();
13358 /* Handle the argument CPU_STR to the cpu= target attribute. */
13361 aarch64_handle_attr_cpu (const char *str
)
13363 const struct processor
*tmp_cpu
= NULL
;
13364 std::string invalid_extension
;
13365 enum aarch64_parse_opt_result parse_res
13366 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
13368 if (parse_res
== AARCH64_PARSE_OK
)
13370 gcc_assert (tmp_cpu
);
13371 selected_tune
= tmp_cpu
;
13372 explicit_tune_core
= selected_tune
->ident
;
13374 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
13375 explicit_arch
= selected_arch
->arch
;
13381 case AARCH64_PARSE_MISSING_ARG
:
13382 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13384 case AARCH64_PARSE_INVALID_ARG
:
13385 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
13386 aarch64_print_hint_for_core (str
);
13388 case AARCH64_PARSE_INVALID_FEATURE
:
13389 error ("invalid feature modifier %s of value (\"%s\") in "
13390 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13391 aarch64_print_hint_for_extensions (invalid_extension
);
13394 gcc_unreachable ();
13400 /* Handle the argument STR to the branch-protection= attribute. */
13403 aarch64_handle_attr_branch_protection (const char* str
)
13405 char *err_str
= (char *) xmalloc (strlen (str
));
13406 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
13408 bool success
= false;
13411 case AARCH64_PARSE_MISSING_ARG
:
13412 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13415 case AARCH64_PARSE_INVALID_ARG
:
13416 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13417 "=\")%> pragma or attribute", err_str
);
13419 case AARCH64_PARSE_OK
:
13421 /* Fall through. */
13422 case AARCH64_PARSE_INVALID_FEATURE
:
13425 gcc_unreachable ();
13431 /* Handle the argument STR to the tune= target attribute. */
13434 aarch64_handle_attr_tune (const char *str
)
13436 const struct processor
*tmp_tune
= NULL
;
13437 enum aarch64_parse_opt_result parse_res
13438 = aarch64_parse_tune (str
, &tmp_tune
);
13440 if (parse_res
== AARCH64_PARSE_OK
)
13442 gcc_assert (tmp_tune
);
13443 selected_tune
= tmp_tune
;
13444 explicit_tune_core
= selected_tune
->ident
;
13450 case AARCH64_PARSE_INVALID_ARG
:
13451 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
13452 aarch64_print_hint_for_core (str
);
13455 gcc_unreachable ();
13461 /* Parse an architecture extensions target attribute string specified in STR.
13462 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13463 if successful. Update aarch64_isa_flags to reflect the ISA features
13467 aarch64_handle_attr_isa_flags (char *str
)
13469 enum aarch64_parse_opt_result parse_res
;
13470 uint64_t isa_flags
= aarch64_isa_flags
;
13472 /* We allow "+nothing" in the beginning to clear out all architectural
13473 features if the user wants to handpick specific features. */
13474 if (strncmp ("+nothing", str
, 8) == 0)
13480 std::string invalid_extension
;
13481 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
13483 if (parse_res
== AARCH64_PARSE_OK
)
13485 aarch64_isa_flags
= isa_flags
;
13491 case AARCH64_PARSE_MISSING_ARG
:
13492 error ("missing value in %<target()%> pragma or attribute");
13495 case AARCH64_PARSE_INVALID_FEATURE
:
13496 error ("invalid feature modifier %s of value (\"%s\") in "
13497 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13501 gcc_unreachable ();
13507 /* The target attributes that we support. On top of these we also support just
13508 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13509 handled explicitly in aarch64_process_one_target_attr. */
13511 static const struct aarch64_attribute_info aarch64_attributes
[] =
13513 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
13514 OPT_mgeneral_regs_only
},
13515 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
13516 OPT_mfix_cortex_a53_835769
},
13517 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
13518 OPT_mfix_cortex_a53_843419
},
13519 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
13520 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
13521 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
13522 OPT_momit_leaf_frame_pointer
},
13523 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
13524 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
13526 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
13527 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
13529 { "branch-protection", aarch64_attr_custom
, false,
13530 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
13531 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
13532 OPT_msign_return_address_
},
13533 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
13536 /* Parse ARG_STR which contains the definition of one target attribute.
13537 Show appropriate errors if any or return true if the attribute is valid. */
13540 aarch64_process_one_target_attr (char *arg_str
)
13542 bool invert
= false;
13544 size_t len
= strlen (arg_str
);
13548 error ("malformed %<target()%> pragma or attribute");
13552 char *str_to_check
= (char *) alloca (len
+ 1);
13553 strcpy (str_to_check
, arg_str
);
13555 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13556 It is easier to detect and handle it explicitly here rather than going
13557 through the machinery for the rest of the target attributes in this
13559 if (*str_to_check
== '+')
13560 return aarch64_handle_attr_isa_flags (str_to_check
);
13562 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
13567 char *arg
= strchr (str_to_check
, '=');
13569 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13570 and point ARG to "foo". */
13576 const struct aarch64_attribute_info
*p_attr
;
13577 bool found
= false;
13578 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
13580 /* If the names don't match up, or the user has given an argument
13581 to an attribute that doesn't accept one, or didn't give an argument
13582 to an attribute that expects one, fail to match. */
13583 if (strcmp (str_to_check
, p_attr
->name
) != 0)
13587 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
13588 || p_attr
->attr_type
== aarch64_attr_enum
;
13590 if (attr_need_arg_p
^ (arg
!= NULL
))
13592 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
13596 /* If the name matches but the attribute does not allow "no-" versions
13597 then we can't match. */
13598 if (invert
&& !p_attr
->allow_neg
)
13600 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
13604 switch (p_attr
->attr_type
)
13606 /* Has a custom handler registered.
13607 For example, cpu=, arch=, tune=. */
13608 case aarch64_attr_custom
:
13609 gcc_assert (p_attr
->handler
);
13610 if (!p_attr
->handler (arg
))
13614 /* Either set or unset a boolean option. */
13615 case aarch64_attr_bool
:
13617 struct cl_decoded_option decoded
;
13619 generate_option (p_attr
->opt_num
, NULL
, !invert
,
13620 CL_TARGET
, &decoded
);
13621 aarch64_handle_option (&global_options
, &global_options_set
,
13622 &decoded
, input_location
);
13625 /* Set or unset a bit in the target_flags. aarch64_handle_option
13626 should know what mask to apply given the option number. */
13627 case aarch64_attr_mask
:
13629 struct cl_decoded_option decoded
;
13630 /* We only need to specify the option number.
13631 aarch64_handle_option will know which mask to apply. */
13632 decoded
.opt_index
= p_attr
->opt_num
;
13633 decoded
.value
= !invert
;
13634 aarch64_handle_option (&global_options
, &global_options_set
,
13635 &decoded
, input_location
);
13638 /* Use the option setting machinery to set an option to an enum. */
13639 case aarch64_attr_enum
:
13644 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
13645 &value
, CL_TARGET
);
13648 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
13649 NULL
, DK_UNSPECIFIED
, input_location
,
13654 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
13659 gcc_unreachable ();
13663 /* If we reached here we either have found an attribute and validated
13664 it or didn't match any. If we matched an attribute but its arguments
13665 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int count = 0;
  for (; *str != '\0'; str++)
    if (*str == c)
      count++;
  return count;
}
13687 /* Parse the tree in ARGS that contains the target attribute information
13688 and update the global target options space. */
13691 aarch64_process_target_attr (tree args
)
13693 if (TREE_CODE (args
) == TREE_LIST
)
13697 tree head
= TREE_VALUE (args
);
13700 if (!aarch64_process_target_attr (head
))
13703 args
= TREE_CHAIN (args
);
13709 if (TREE_CODE (args
) != STRING_CST
)
13711 error ("attribute %<target%> argument not a string");
13715 size_t len
= strlen (TREE_STRING_POINTER (args
));
13716 char *str_to_check
= (char *) alloca (len
+ 1);
13717 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
13721 error ("malformed %<target()%> pragma or attribute");
13725 /* Used to catch empty spaces between commas i.e.
13726 attribute ((target ("attr1,,attr2"))). */
13727 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
13729 /* Handle multiple target attributes separated by ','. */
13730 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
13732 unsigned int num_attrs
= 0;
13736 if (!aarch64_process_one_target_attr (token
))
13738 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
13742 token
= strtok_r (NULL
, ",", &str_to_check
);
13745 if (num_attrs
!= num_commas
+ 1)
13747 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
13754 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13755 process attribute ((target ("..."))). */
13758 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
13760 struct cl_target_option cur_target
;
13763 tree new_target
, new_optimize
;
13764 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13766 /* If what we're processing is the current pragma string then the
13767 target option node is already stored in target_option_current_node
13768 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13769 having to re-parse the string. This is especially useful to keep
13770 arm_neon.h compile times down since that header contains a lot
13771 of intrinsics enclosed in pragmas. */
13772 if (!existing_target
&& args
== current_target_pragma
)
13774 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
13777 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13779 old_optimize
= build_optimization_node (&global_options
);
13780 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13782 /* If the function changed the optimization levels as well as setting
13783 target options, start with the optimizations specified. */
13784 if (func_optimize
&& func_optimize
!= old_optimize
)
13785 cl_optimization_restore (&global_options
,
13786 TREE_OPTIMIZATION (func_optimize
));
13788 /* Save the current target options to restore at the end. */
13789 cl_target_option_save (&cur_target
, &global_options
);
13791 /* If fndecl already has some target attributes applied to it, unpack
13792 them so that we add this attribute on top of them, rather than
13793 overwriting them. */
13794 if (existing_target
)
13796 struct cl_target_option
*existing_options
13797 = TREE_TARGET_OPTION (existing_target
);
13799 if (existing_options
)
13800 cl_target_option_restore (&global_options
, existing_options
);
13803 cl_target_option_restore (&global_options
,
13804 TREE_TARGET_OPTION (target_option_current_node
));
13806 ret
= aarch64_process_target_attr (args
);
13808 /* Set up any additional state. */
13811 aarch64_override_options_internal (&global_options
);
13812 /* Initialize SIMD builtins if we haven't already.
13813 Set current_target_pragma to NULL for the duration so that
13814 the builtin initialization code doesn't try to tag the functions
13815 being built with the attributes specified by any current pragma, thus
13816 going into an infinite recursion. */
13819 tree saved_current_target_pragma
= current_target_pragma
;
13820 current_target_pragma
= NULL
;
13821 aarch64_init_simd_builtins ();
13822 current_target_pragma
= saved_current_target_pragma
;
13824 new_target
= build_target_option_node (&global_options
);
13829 new_optimize
= build_optimization_node (&global_options
);
13833 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
13835 if (old_optimize
!= new_optimize
)
13836 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
13839 cl_target_option_restore (&global_options
, &cur_target
);
13841 if (old_optimize
!= new_optimize
)
13842 cl_optimization_restore (&global_options
,
13843 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
13868 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13869 to inline CALLEE into CALLER based on target-specific info.
13870 Make sure that the caller and callee have compatible architectural
13871 features. Then go through the other possible target attributes
13872 and see if they can block inlining. Try not to reject always_inline
13873 callees unless they are incompatible architecturally. */
13876 aarch64_can_inline_p (tree caller
, tree callee
)
13878 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
13879 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
13881 struct cl_target_option
*caller_opts
13882 = TREE_TARGET_OPTION (caller_tree
? caller_tree
13883 : target_option_default_node
);
13885 struct cl_target_option
*callee_opts
13886 = TREE_TARGET_OPTION (callee_tree
? callee_tree
13887 : target_option_default_node
);
13889 /* Callee's ISA flags should be a subset of the caller's. */
13890 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
13891 != callee_opts
->x_aarch64_isa_flags
)
13894 /* Allow non-strict aligned functions inlining into strict
13896 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
13897 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
13898 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
13899 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
13902 bool always_inline
= lookup_attribute ("always_inline",
13903 DECL_ATTRIBUTES (callee
));
13905 /* If the architectural features match up and the callee is always_inline
13906 then the other attributes don't matter. */
13910 if (caller_opts
->x_aarch64_cmodel_var
13911 != callee_opts
->x_aarch64_cmodel_var
)
13914 if (caller_opts
->x_aarch64_tls_dialect
13915 != callee_opts
->x_aarch64_tls_dialect
)
13918 /* Honour explicit requests to workaround errata. */
13919 if (!aarch64_tribools_ok_for_inlining_p (
13920 caller_opts
->x_aarch64_fix_a53_err835769
,
13921 callee_opts
->x_aarch64_fix_a53_err835769
,
13922 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
13925 if (!aarch64_tribools_ok_for_inlining_p (
13926 caller_opts
->x_aarch64_fix_a53_err843419
,
13927 callee_opts
->x_aarch64_fix_a53_err843419
,
13928 2, TARGET_FIX_ERR_A53_843419
))
13931 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13932 caller and calle and they don't match up, reject inlining. */
13933 if (!aarch64_tribools_ok_for_inlining_p (
13934 caller_opts
->x_flag_omit_leaf_frame_pointer
,
13935 callee_opts
->x_flag_omit_leaf_frame_pointer
,
13939 /* If the callee has specific tuning overrides, respect them. */
13940 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
13941 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
13944 /* If the user specified tuning override strings for the
13945 caller and callee and they don't match up, reject inlining.
13946 We just do a string compare here, we don't analyze the meaning
13947 of the string, as it would be too costly for little gain. */
13948 if (callee_opts
->x_aarch64_override_tune_string
13949 && caller_opts
->x_aarch64_override_tune_string
13950 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
13951 caller_opts
->x_aarch64_override_tune_string
) != 0))
13957 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
13961 aarch64_tlsdesc_abi_id ()
13963 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
13964 if (!tlsdesc_abi
.initialized_p ())
13966 HARD_REG_SET full_reg_clobbers
;
13967 CLEAR_HARD_REG_SET (full_reg_clobbers
);
13968 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
13969 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
13970 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
13971 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
13972 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
13974 return tlsdesc_abi
.id ();
13977 /* Return true if SYMBOL_REF X binds locally. */
13980 aarch64_symbol_binds_local_p (const_rtx x
)
13982 return (SYMBOL_REF_DECL (x
)
13983 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
13984 : SYMBOL_REF_LOCAL_P (x
));
13987 /* Return true if SYMBOL_REF X is thread local */
13989 aarch64_tls_symbol_p (rtx x
)
13991 if (! TARGET_HAVE_TLS
)
13994 if (GET_CODE (x
) != SYMBOL_REF
)
13997 return SYMBOL_REF_TLS_MODEL (x
) != 0;
14000 /* Classify a TLS symbol into one of the TLS kinds. */
14001 enum aarch64_symbol_type
14002 aarch64_classify_tls_symbol (rtx x
)
14004 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
14008 case TLS_MODEL_GLOBAL_DYNAMIC
:
14009 case TLS_MODEL_LOCAL_DYNAMIC
:
14010 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
14012 case TLS_MODEL_INITIAL_EXEC
:
14013 switch (aarch64_cmodel
)
14015 case AARCH64_CMODEL_TINY
:
14016 case AARCH64_CMODEL_TINY_PIC
:
14017 return SYMBOL_TINY_TLSIE
;
14019 return SYMBOL_SMALL_TLSIE
;
14022 case TLS_MODEL_LOCAL_EXEC
:
14023 if (aarch64_tls_size
== 12)
14024 return SYMBOL_TLSLE12
;
14025 else if (aarch64_tls_size
== 24)
14026 return SYMBOL_TLSLE24
;
14027 else if (aarch64_tls_size
== 32)
14028 return SYMBOL_TLSLE32
;
14029 else if (aarch64_tls_size
== 48)
14030 return SYMBOL_TLSLE48
;
14032 gcc_unreachable ();
14034 case TLS_MODEL_EMULATED
:
14035 case TLS_MODEL_NONE
:
14036 return SYMBOL_FORCE_TO_MEM
;
14039 gcc_unreachable ();
14043 /* Return the correct method for accessing X + OFFSET, where X is either
14044 a SYMBOL_REF or LABEL_REF. */
14046 enum aarch64_symbol_type
14047 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
14049 if (GET_CODE (x
) == LABEL_REF
)
14051 switch (aarch64_cmodel
)
14053 case AARCH64_CMODEL_LARGE
:
14054 return SYMBOL_FORCE_TO_MEM
;
14056 case AARCH64_CMODEL_TINY_PIC
:
14057 case AARCH64_CMODEL_TINY
:
14058 return SYMBOL_TINY_ABSOLUTE
;
14060 case AARCH64_CMODEL_SMALL_SPIC
:
14061 case AARCH64_CMODEL_SMALL_PIC
:
14062 case AARCH64_CMODEL_SMALL
:
14063 return SYMBOL_SMALL_ABSOLUTE
;
14066 gcc_unreachable ();
14070 if (GET_CODE (x
) == SYMBOL_REF
)
14072 if (aarch64_tls_symbol_p (x
))
14073 return aarch64_classify_tls_symbol (x
);
14075 switch (aarch64_cmodel
)
14077 case AARCH64_CMODEL_TINY
:
14078 /* When we retrieve symbol + offset address, we have to make sure
14079 the offset does not cause overflow of the final address. But
14080 we have no way of knowing the address of symbol at compile time
14081 so we can't accurately say if the distance between the PC and
14082 symbol + offset is outside the addressible range of +/-1M in the
14083 TINY code model. So we rely on images not being greater than
14084 1M and cap the offset at 1M and anything beyond 1M will have to
14085 be loaded using an alternative mechanism. Furthermore if the
14086 symbol is a weak reference to something that isn't known to
14087 resolve to a symbol in this module, then force to memory. */
14088 if ((SYMBOL_REF_WEAK (x
)
14089 && !aarch64_symbol_binds_local_p (x
))
14090 || !IN_RANGE (offset
, -1048575, 1048575))
14091 return SYMBOL_FORCE_TO_MEM
;
14092 return SYMBOL_TINY_ABSOLUTE
;
14094 case AARCH64_CMODEL_SMALL
:
14095 /* Same reasoning as the tiny code model, but the offset cap here is
14097 if ((SYMBOL_REF_WEAK (x
)
14098 && !aarch64_symbol_binds_local_p (x
))
14099 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
14100 HOST_WIDE_INT_C (4294967264)))
14101 return SYMBOL_FORCE_TO_MEM
;
14102 return SYMBOL_SMALL_ABSOLUTE
;
14104 case AARCH64_CMODEL_TINY_PIC
:
14105 if (!aarch64_symbol_binds_local_p (x
))
14106 return SYMBOL_TINY_GOT
;
14107 return SYMBOL_TINY_ABSOLUTE
;
14109 case AARCH64_CMODEL_SMALL_SPIC
:
14110 case AARCH64_CMODEL_SMALL_PIC
:
14111 if (!aarch64_symbol_binds_local_p (x
))
14112 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
14113 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
14114 return SYMBOL_SMALL_ABSOLUTE
;
14116 case AARCH64_CMODEL_LARGE
:
14117 /* This is alright even in PIC code as the constant
14118 pool reference is always PC relative and within
14119 the same translation unit. */
14120 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
14121 return SYMBOL_SMALL_ABSOLUTE
;
14123 return SYMBOL_FORCE_TO_MEM
;
14126 gcc_unreachable ();
14130 /* By default push everything into the constant pool. */
14131 return SYMBOL_FORCE_TO_MEM
;
14135 aarch64_constant_address_p (rtx x
)
14137 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
14141 aarch64_legitimate_pic_operand_p (rtx x
)
14143 if (GET_CODE (x
) == SYMBOL_REF
14144 || (GET_CODE (x
) == CONST
14145 && GET_CODE (XEXP (x
, 0)) == PLUS
14146 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
14152 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14153 that should be rematerialized rather than spilled. */
14156 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
14158 /* Support CSE and rematerialization of common constants. */
14159 if (CONST_INT_P (x
)
14160 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14161 || GET_CODE (x
) == CONST_VECTOR
)
14164 /* Do not allow vector struct mode constants for Advanced SIMD.
14165 We could support 0 and -1 easily, but they need support in
14166 aarch64-simd.md. */
14167 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14168 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
14171 /* Only accept variable-length vector constants if they can be
14174 ??? It would be possible to handle rematerialization of other
14175 constants via secondary reloads. */
14176 if (vec_flags
& VEC_ANY_SVE
)
14177 return aarch64_simd_valid_immediate (x
, NULL
);
14179 if (GET_CODE (x
) == HIGH
)
14182 /* Accept polynomial constants that can be calculated by using the
14183 destination of a move as the sole temporary. Constants that
14184 require a second temporary cannot be rematerialized (they can't be
14185 forced to memory and also aren't legitimate constants). */
14187 if (poly_int_rtx_p (x
, &offset
))
14188 return aarch64_offset_temporaries (false, offset
) <= 1;
14190 /* If an offset is being added to something else, we need to allow the
14191 base to be moved into the destination register, meaning that there
14192 are no free temporaries for the offset. */
14193 x
= strip_offset (x
, &offset
);
14194 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
14197 /* Do not allow const (plus (anchor_symbol, const_int)). */
14198 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
14201 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14202 so spilling them is better than rematerialization. */
14203 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
14206 /* Label references are always constant. */
14207 if (GET_CODE (x
) == LABEL_REF
)
14214 aarch64_load_tp (rtx target
)
14217 || GET_MODE (target
) != Pmode
14218 || !register_operand (target
, Pmode
))
14219 target
= gen_reg_rtx (Pmode
);
14221 /* Can return in any reg. */
14222 emit_insn (gen_aarch64_load_tp_hard (target
));
14226 /* On AAPCS systems, this is the "struct __va_list". */
14227 static GTY(()) tree va_list_type
;
14229 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14230 Return the type to use as __builtin_va_list.
14232 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14244 aarch64_build_builtin_va_list (void)
14247 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14249 /* Create the type. */
14250 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
14251 /* Give it the required name. */
14252 va_list_name
= build_decl (BUILTINS_LOCATION
,
14254 get_identifier ("__va_list"),
14256 DECL_ARTIFICIAL (va_list_name
) = 1;
14257 TYPE_NAME (va_list_type
) = va_list_name
;
14258 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
14260 /* Create the fields. */
14261 f_stack
= build_decl (BUILTINS_LOCATION
,
14262 FIELD_DECL
, get_identifier ("__stack"),
14264 f_grtop
= build_decl (BUILTINS_LOCATION
,
14265 FIELD_DECL
, get_identifier ("__gr_top"),
14267 f_vrtop
= build_decl (BUILTINS_LOCATION
,
14268 FIELD_DECL
, get_identifier ("__vr_top"),
14270 f_groff
= build_decl (BUILTINS_LOCATION
,
14271 FIELD_DECL
, get_identifier ("__gr_offs"),
14272 integer_type_node
);
14273 f_vroff
= build_decl (BUILTINS_LOCATION
,
14274 FIELD_DECL
, get_identifier ("__vr_offs"),
14275 integer_type_node
);
14277 /* Tell tree-stdarg pass about our internal offset fields.
14278 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
14279 purpose to identify whether the code is updating va_list internal
14280 offset fields through irregular way. */
14281 va_list_gpr_counter_field
= f_groff
;
14282 va_list_fpr_counter_field
= f_vroff
;
14284 DECL_ARTIFICIAL (f_stack
) = 1;
14285 DECL_ARTIFICIAL (f_grtop
) = 1;
14286 DECL_ARTIFICIAL (f_vrtop
) = 1;
14287 DECL_ARTIFICIAL (f_groff
) = 1;
14288 DECL_ARTIFICIAL (f_vroff
) = 1;
14290 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
14291 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
14292 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
14293 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
14294 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
14296 TYPE_FIELDS (va_list_type
) = f_stack
;
14297 DECL_CHAIN (f_stack
) = f_grtop
;
14298 DECL_CHAIN (f_grtop
) = f_vrtop
;
14299 DECL_CHAIN (f_vrtop
) = f_groff
;
14300 DECL_CHAIN (f_groff
) = f_vroff
;
14302 /* Compute its layout. */
14303 layout_type (va_list_type
);
14305 return va_list_type
;
14308 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14310 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
14312 const CUMULATIVE_ARGS
*cum
;
14313 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14314 tree stack
, grtop
, vrtop
, groff
, vroff
;
14316 int gr_save_area_size
= cfun
->va_list_gpr_size
;
14317 int vr_save_area_size
= cfun
->va_list_fpr_size
;
14320 cum
= &crtl
->args
.info
;
14321 if (cfun
->va_list_gpr_size
)
14322 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
14323 cfun
->va_list_gpr_size
);
14324 if (cfun
->va_list_fpr_size
)
14325 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
14326 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
14330 gcc_assert (cum
->aapcs_nvrn
== 0);
14331 vr_save_area_size
= 0;
14334 f_stack
= TYPE_FIELDS (va_list_type_node
);
14335 f_grtop
= DECL_CHAIN (f_stack
);
14336 f_vrtop
= DECL_CHAIN (f_grtop
);
14337 f_groff
= DECL_CHAIN (f_vrtop
);
14338 f_vroff
= DECL_CHAIN (f_groff
);
14340 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
14342 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
14344 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
14346 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
14348 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
14351 /* Emit code to initialize STACK, which points to the next varargs stack
14352 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14353 by named arguments. STACK is 8-byte aligned. */
14354 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
14355 if (cum
->aapcs_stack_size
> 0)
14356 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
14357 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
14358 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14360 /* Emit code to initialize GRTOP, the top of the GR save area.
14361 virtual_incoming_args_rtx should have been 16 byte aligned. */
14362 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
14363 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
14364 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14366 /* Emit code to initialize VRTOP, the top of the VR save area.
14367 This address is gr_save_area_bytes below GRTOP, rounded
14368 down to the next 16-byte boundary. */
14369 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
14370 vr_offset
= ROUND_UP (gr_save_area_size
,
14371 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14374 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
14375 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
14376 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14378 /* Emit code to initialize GROFF, the offset from GRTOP of the
14379 next GPR argument. */
14380 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
14381 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
14382 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14384 /* Likewise emit code to initialize VROFF, the offset from FTOP
14385 of the next VR argument. */
14386 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
14387 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
14388 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14391 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14394 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
14395 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
14399 bool is_ha
; /* is HFA or HVA. */
14400 bool dw_align
; /* double-word align. */
14401 machine_mode ag_mode
= VOIDmode
;
14405 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14406 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
14407 HOST_WIDE_INT size
, rsize
, adjust
, align
;
14408 tree t
, u
, cond1
, cond2
;
14410 indirect_p
= pass_va_arg_by_reference (type
);
14412 type
= build_pointer_type (type
);
14414 mode
= TYPE_MODE (type
);
14416 f_stack
= TYPE_FIELDS (va_list_type_node
);
14417 f_grtop
= DECL_CHAIN (f_stack
);
14418 f_vrtop
= DECL_CHAIN (f_grtop
);
14419 f_groff
= DECL_CHAIN (f_vrtop
);
14420 f_vroff
= DECL_CHAIN (f_groff
);
14422 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
14423 f_stack
, NULL_TREE
);
14424 size
= int_size_in_bytes (type
);
14428 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
14432 if (aarch64_vfp_is_call_or_return_candidate (mode
,
14438 /* No frontends can create types with variable-sized modes, so we
14439 shouldn't be asked to pass or return them. */
14440 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
14442 /* TYPE passed in fp/simd registers. */
14444 aarch64_err_no_fpadvsimd (mode
);
14446 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
14447 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
14448 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
14449 unshare_expr (valist
), f_vroff
, NULL_TREE
);
14451 rsize
= nregs
* UNITS_PER_VREG
;
14455 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
14456 adjust
= UNITS_PER_VREG
- ag_size
;
14458 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14459 && size
< UNITS_PER_VREG
)
14461 adjust
= UNITS_PER_VREG
- size
;
14466 /* TYPE passed in general registers. */
14467 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
14468 unshare_expr (valist
), f_grtop
, NULL_TREE
);
14469 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
14470 unshare_expr (valist
), f_groff
, NULL_TREE
);
14471 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
14472 nregs
= rsize
/ UNITS_PER_WORD
;
14476 if (abi_break
&& warn_psabi
)
14477 inform (input_location
, "parameter passing for argument of type "
14478 "%qT changed in GCC 9.1", type
);
14482 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14483 && size
< UNITS_PER_WORD
)
14485 adjust
= UNITS_PER_WORD
- size
;
14489 /* Get a local temporary for the field value. */
14490 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
14492 /* Emit code to branch if off >= 0. */
14493 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
14494 build_int_cst (TREE_TYPE (off
), 0));
14495 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
14499 /* Emit: offs = (offs + 15) & -16. */
14500 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14501 build_int_cst (TREE_TYPE (off
), 15));
14502 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
14503 build_int_cst (TREE_TYPE (off
), -16));
14504 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
14509 /* Update ap.__[g|v]r_offs */
14510 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14511 build_int_cst (TREE_TYPE (off
), rsize
));
14512 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
14516 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14518 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14519 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
14520 build_int_cst (TREE_TYPE (f_off
), 0));
14521 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
14523 /* String up: make sure the assignment happens before the use. */
14524 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
14525 COND_EXPR_ELSE (cond1
) = t
;
14527 /* Prepare the trees handling the argument that is passed on the stack;
14528 the top level node will store in ON_STACK. */
14529 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
14532 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14533 t
= fold_build_pointer_plus_hwi (arg
, 15);
14534 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14535 build_int_cst (TREE_TYPE (t
), -16));
14536 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
14540 /* Advance ap.__stack */
14541 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
14542 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14543 build_int_cst (TREE_TYPE (t
), -8));
14544 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
14545 /* String up roundup and advance. */
14547 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14548 /* String up with arg */
14549 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
14550 /* Big-endianness related address adjustment. */
14551 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14552 && size
< UNITS_PER_WORD
)
14554 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
14555 size_int (UNITS_PER_WORD
- size
));
14556 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
14559 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
14560 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
14562 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14565 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
14566 build_int_cst (TREE_TYPE (off
), adjust
));
14568 t
= fold_convert (sizetype
, t
);
14569 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
14573 /* type ha; // treat as "struct {ftype field[n];}"
14574 ... [computing offs]
14575 for (i = 0; i <nregs; ++i, offs += 16)
14576 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14579 tree tmp_ha
, field_t
, field_ptr_t
;
14581 /* Declare a local variable. */
14582 tmp_ha
= create_tmp_var_raw (type
, "ha");
14583 gimple_add_tmp_var (tmp_ha
);
14585 /* Establish the base type. */
14589 field_t
= float_type_node
;
14590 field_ptr_t
= float_ptr_type_node
;
14593 field_t
= double_type_node
;
14594 field_ptr_t
= double_ptr_type_node
;
14597 field_t
= long_double_type_node
;
14598 field_ptr_t
= long_double_ptr_type_node
;
14601 field_t
= aarch64_fp16_type_node
;
14602 field_ptr_t
= aarch64_fp16_ptr_type_node
;
14607 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
14608 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
14609 field_ptr_t
= build_pointer_type (field_t
);
14616 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
14617 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
14619 t
= fold_convert (field_ptr_t
, addr
);
14620 t
= build2 (MODIFY_EXPR
, field_t
,
14621 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
14622 build1 (INDIRECT_REF
, field_t
, t
));
14624 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14625 for (i
= 1; i
< nregs
; ++i
)
14627 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
14628 u
= fold_convert (field_ptr_t
, addr
);
14629 u
= build2 (MODIFY_EXPR
, field_t
,
14630 build2 (MEM_REF
, field_t
, tmp_ha
,
14631 build_int_cst (field_ptr_t
,
14633 int_size_in_bytes (field_t
)))),
14634 build1 (INDIRECT_REF
, field_t
, u
));
14635 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
14638 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
14639 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
14642 COND_EXPR_ELSE (cond2
) = t
;
14643 addr
= fold_convert (build_pointer_type (type
), cond1
);
14644 addr
= build_va_arg_indirect_ref (addr
);
14647 addr
= build_va_arg_indirect_ref (addr
);
14652 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14655 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
14656 const function_arg_info
&arg
,
14657 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
14659 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
14660 CUMULATIVE_ARGS local_cum
;
14661 int gr_saved
= cfun
->va_list_gpr_size
;
14662 int vr_saved
= cfun
->va_list_fpr_size
;
14664 /* The caller has advanced CUM up to, but not beyond, the last named
14665 argument. Advance a local copy of CUM past the last "real" named
14666 argument, to find out how many registers are left over. */
14668 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
14670 /* Found out how many registers we need to save.
14671 Honor tree-stdvar analysis results. */
14672 if (cfun
->va_list_gpr_size
)
14673 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
14674 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
14675 if (cfun
->va_list_fpr_size
)
14676 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
14677 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
14681 gcc_assert (local_cum
.aapcs_nvrn
== 0);
14691 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14692 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
14693 - gr_saved
* UNITS_PER_WORD
);
14694 mem
= gen_frame_mem (BLKmode
, ptr
);
14695 set_mem_alias_set (mem
, get_varargs_alias_set ());
14697 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
14702 /* We can't use move_block_from_reg, because it will use
14703 the wrong mode, storing D regs only. */
14704 machine_mode mode
= TImode
;
14705 int off
, i
, vr_start
;
14707 /* Set OFF to the offset from virtual_incoming_args_rtx of
14708 the first vector register. The VR save area lies below
14709 the GR one, and is aligned to 16 bytes. */
14710 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14711 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14712 off
-= vr_saved
* UNITS_PER_VREG
;
14714 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
14715 for (i
= 0; i
< vr_saved
; ++i
)
14719 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
14720 mem
= gen_frame_mem (mode
, ptr
);
14721 set_mem_alias_set (mem
, get_varargs_alias_set ());
14722 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
14723 off
+= UNITS_PER_VREG
;
14728 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14729 any complication of having crtl->args.pretend_args_size changed. */
14730 cfun
->machine
->frame
.saved_varargs_size
14731 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14732 STACK_BOUNDARY
/ BITS_PER_UNIT
)
14733 + vr_saved
* UNITS_PER_VREG
);
14737 aarch64_conditional_register_usage (void)
14742 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
14745 call_used_regs
[i
] = 1;
14749 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
14752 call_used_regs
[i
] = 1;
14755 /* When tracking speculation, we need a couple of call-clobbered registers
14756 to track the speculation state. It would be nice to just use
14757 IP0 and IP1, but currently there are numerous places that just
14758 assume these registers are free for other uses (eg pointer
14759 authentication). */
14760 if (aarch64_track_speculation
)
14762 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14763 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14764 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14765 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14769 /* Walk down the type tree of TYPE counting consecutive base elements.
14770 If *MODEP is VOIDmode, then set it to the first valid floating point
14771 type. If a non-floating point type is found, or if a floating point
14772 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14773 otherwise return the count in the sub-tree. */
14775 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
14778 HOST_WIDE_INT size
;
14780 switch (TREE_CODE (type
))
14783 mode
= TYPE_MODE (type
);
14784 if (mode
!= DFmode
&& mode
!= SFmode
14785 && mode
!= TFmode
&& mode
!= HFmode
)
14788 if (*modep
== VOIDmode
)
14791 if (*modep
== mode
)
14797 mode
= TYPE_MODE (TREE_TYPE (type
));
14798 if (mode
!= DFmode
&& mode
!= SFmode
14799 && mode
!= TFmode
&& mode
!= HFmode
)
14802 if (*modep
== VOIDmode
)
14805 if (*modep
== mode
)
14811 /* Use V2SImode and V4SImode as representatives of all 64-bit
14812 and 128-bit vector types. */
14813 size
= int_size_in_bytes (type
);
14826 if (*modep
== VOIDmode
)
14829 /* Vector modes are considered to be opaque: two vectors are
14830 equivalent for the purposes of being homogeneous aggregates
14831 if they are the same size. */
14832 if (*modep
== mode
)
14840 tree index
= TYPE_DOMAIN (type
);
14842 /* Can't handle incomplete types nor sizes that are not
14844 if (!COMPLETE_TYPE_P (type
)
14845 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14848 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
14851 || !TYPE_MAX_VALUE (index
)
14852 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
14853 || !TYPE_MIN_VALUE (index
)
14854 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
14858 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
14859 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
14861 /* There must be no padding. */
14862 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14863 count
* GET_MODE_BITSIZE (*modep
)))
14875 /* Can't handle incomplete types nor sizes that are not
14877 if (!COMPLETE_TYPE_P (type
)
14878 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14881 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14883 if (TREE_CODE (field
) != FIELD_DECL
)
14886 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14889 count
+= sub_count
;
14892 /* There must be no padding. */
14893 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14894 count
* GET_MODE_BITSIZE (*modep
)))
14901 case QUAL_UNION_TYPE
:
14903 /* These aren't very interesting except in a degenerate case. */
14908 /* Can't handle incomplete types nor sizes that are not
14910 if (!COMPLETE_TYPE_P (type
)
14911 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14914 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14916 if (TREE_CODE (field
) != FIELD_DECL
)
14919 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14922 count
= count
> sub_count
? count
: sub_count
;
14925 /* There must be no padding. */
14926 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14927 count
* GET_MODE_BITSIZE (*modep
)))
14940 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14941 type as described in AAPCS64 \S 4.1.2.
14943 See the comment above aarch64_composite_type_p for the notes on MODE. */
14946 aarch64_short_vector_p (const_tree type
,
14949 poly_int64 size
= -1;
14951 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
14952 size
= int_size_in_bytes (type
);
14953 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
14954 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
14955 size
= GET_MODE_SIZE (mode
);
14957 return known_eq (size
, 8) || known_eq (size
, 16);
14960 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14961 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14962 array types. The C99 floating-point complex types are also considered
14963 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14964 types, which are GCC extensions and out of the scope of AAPCS64, are
14965 treated as composite types here as well.
14967 Note that MODE itself is not sufficient in determining whether a type
14968 is such a composite type or not. This is because
14969 stor-layout.c:compute_record_mode may have already changed the MODE
14970 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14971 structure with only one field may have its MODE set to the mode of the
14972 field. Also an integer mode whose size matches the size of the
14973 RECORD_TYPE type may be used to substitute the original mode
14974 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14975 solely relied on. */
14978 aarch64_composite_type_p (const_tree type
,
14981 if (aarch64_short_vector_p (type
, mode
))
14984 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
14987 if (mode
== BLKmode
14988 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
14989 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
14995 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14996 shall be passed or returned in simd/fp register(s) (providing these
14997 parameter passing registers are available).
14999 Upon successful return, *COUNT returns the number of needed registers,
15000 *BASE_MODE returns the mode of the individual register and when IS_HAF
15001 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
15002 floating-point aggregate or a homogeneous short-vector aggregate. */
15005 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
15007 machine_mode
*base_mode
,
15011 machine_mode new_mode
= VOIDmode
;
15012 bool composite_p
= aarch64_composite_type_p (type
, mode
);
15014 if (is_ha
!= NULL
) *is_ha
= false;
15016 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15017 || aarch64_short_vector_p (type
, mode
))
15022 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
15024 if (is_ha
!= NULL
) *is_ha
= true;
15026 new_mode
= GET_MODE_INNER (mode
);
15028 else if (type
&& composite_p
)
15030 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
15032 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
15034 if (is_ha
!= NULL
) *is_ha
= true;
15043 *base_mode
= new_mode
;
15047 /* Implement TARGET_STRUCT_VALUE_RTX. */
15050 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
15051 int incoming ATTRIBUTE_UNUSED
)
15053 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
15056 /* Implements target hook vector_mode_supported_p. */
15058 aarch64_vector_mode_supported_p (machine_mode mode
)
15060 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15061 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
15064 /* Return the full-width SVE vector mode for element mode MODE, if one
15067 aarch64_full_sve_mode (scalar_mode mode
)
15084 return VNx16QImode
;
15086 return opt_machine_mode ();
15090 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15093 aarch64_vq_mode (scalar_mode mode
)
15112 return opt_machine_mode ();
15116 /* Return appropriate SIMD container
15117 for MODE within a vector of WIDTH bits. */
15118 static machine_mode
15119 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
15121 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
15122 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
15124 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
15127 if (known_eq (width
, 128))
15128 return aarch64_vq_mode (mode
).else_mode (word_mode
);
15149 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15150 static machine_mode
15151 aarch64_preferred_simd_mode (scalar_mode mode
)
15153 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
15154 return aarch64_simd_container_mode (mode
, bits
);
15157 /* Return a list of possible vector sizes for the vectorizer
15158 to iterate over. */
15160 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
, bool)
15163 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
15164 sizes
->safe_push (16);
15165 sizes
->safe_push (8);
15168 /* Implement TARGET_MANGLE_TYPE. */
15170 static const char *
15171 aarch64_mangle_type (const_tree type
)
15173 /* The AArch64 ABI documents say that "__va_list" has to be
15174 mangled as if it is in the "std" namespace. */
15175 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
15176 return "St9__va_list";
15178 /* Half-precision float. */
15179 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
15182 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15184 if (TYPE_NAME (type
) != NULL
)
15185 return aarch64_general_mangle_builtin_type (type
);
15187 /* Use the default mangling. */
15191 /* Find the first rtx_insn before insn that will generate an assembly
15195 aarch64_prev_real_insn (rtx_insn
*insn
)
15202 insn
= prev_real_insn (insn
);
15204 while (insn
&& recog_memoized (insn
) < 0);
15210 is_madd_op (enum attr_type t1
)
15213 /* A number of these may be AArch32 only. */
15214 enum attr_type mlatypes
[] = {
15215 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
15216 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
15217 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
15220 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
15222 if (t1
== mlatypes
[i
])
15229 /* Check if there is a register dependency between a load and the insn
15230 for which we hold recog_data. */
15233 dep_between_memop_and_curr (rtx memop
)
15238 gcc_assert (GET_CODE (memop
) == SET
);
15240 if (!REG_P (SET_DEST (memop
)))
15243 load_reg
= SET_DEST (memop
);
15244 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
15246 rtx operand
= recog_data
.operand
[opno
];
15247 if (REG_P (operand
)
15248 && reg_overlap_mentioned_p (load_reg
, operand
))
15256 /* When working around the Cortex-A53 erratum 835769,
15257 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15258 instruction and has a preceding memory instruction such that a NOP
15259 should be inserted between them. */
15262 aarch64_madd_needs_nop (rtx_insn
* insn
)
15264 enum attr_type attr_type
;
15268 if (!TARGET_FIX_ERR_A53_835769
)
15271 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
15274 attr_type
= get_attr_type (insn
);
15275 if (!is_madd_op (attr_type
))
15278 prev
= aarch64_prev_real_insn (insn
);
15279 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15280 Restore recog state to INSN to avoid state corruption. */
15281 extract_constrain_insn_cached (insn
);
15283 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
15286 body
= single_set (prev
);
15288 /* If the previous insn is a memory op and there is no dependency between
15289 it and the DImode madd, emit a NOP between them. If body is NULL then we
15290 have a complex memory operation, probably a load/store pair.
15291 Be conservative for now and emit a NOP. */
15292 if (GET_MODE (recog_data
.operand
[0]) == DImode
15293 && (!body
|| !dep_between_memop_and_curr (body
)))
15301 /* Implement FINAL_PRESCAN_INSN. */
15304 aarch64_final_prescan_insn (rtx_insn
*insn
)
15306 if (aarch64_madd_needs_nop (insn
))
15307 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
15311 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15315 aarch64_sve_index_immediate_p (rtx base_or_step
)
15317 return (CONST_INT_P (base_or_step
)
15318 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
15321 /* Return true if X is a valid immediate for the SVE ADD and SUB
15322 instructions. Negate X first if NEGATE_P is true. */
15325 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
15329 if (!const_vec_duplicate_p (x
, &elt
)
15330 || !CONST_INT_P (elt
))
15333 HOST_WIDE_INT val
= INTVAL (elt
);
15336 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
15339 return IN_RANGE (val
, 0, 0xff);
15340 return IN_RANGE (val
, 0, 0xff00);
15343 /* Return true if X is a valid immediate operand for an SVE logical
15344 instruction such as AND. */
15347 aarch64_sve_bitmask_immediate_p (rtx x
)
15351 return (const_vec_duplicate_p (x
, &elt
)
15352 && CONST_INT_P (elt
)
15353 && aarch64_bitmask_imm (INTVAL (elt
),
15354 GET_MODE_INNER (GET_MODE (x
))));
15357 /* Return true if X is a valid immediate for the SVE DUP and CPY
15361 aarch64_sve_dup_immediate_p (rtx x
)
15363 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
15364 if (!CONST_INT_P (x
))
15367 HOST_WIDE_INT val
= INTVAL (x
);
15369 return IN_RANGE (val
, -0x80, 0x7f);
15370 return IN_RANGE (val
, -0x8000, 0x7f00);
15373 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15374 SIGNED_P says whether the operand is signed rather than unsigned. */
15377 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
15381 return (const_vec_duplicate_p (x
, &elt
)
15382 && CONST_INT_P (elt
)
15384 ? IN_RANGE (INTVAL (elt
), -16, 15)
15385 : IN_RANGE (INTVAL (elt
), 0, 127)));
15388 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15389 instruction. Negate X first if NEGATE_P is true. */
15392 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
15397 if (!const_vec_duplicate_p (x
, &elt
)
15398 || GET_CODE (elt
) != CONST_DOUBLE
)
15401 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
15404 r
= real_value_negate (&r
);
15406 if (real_equal (&r
, &dconst1
))
15408 if (real_equal (&r
, &dconsthalf
))
15413 /* Return true if X is a valid immediate operand for an SVE FMUL
15417 aarch64_sve_float_mul_immediate_p (rtx x
)
15421 return (const_vec_duplicate_p (x
, &elt
)
15422 && GET_CODE (elt
) == CONST_DOUBLE
15423 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
15424 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
15427 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15428 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15429 is nonnull, use it to describe valid immediates. */
15431 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
15432 simd_immediate_info
*info
,
15433 enum simd_immediate_check which
,
15434 simd_immediate_info::insn_type insn
)
15436 /* Try a 4-byte immediate with LSL. */
15437 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
15438 if ((val32
& (0xff << shift
)) == val32
)
15441 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15442 simd_immediate_info::LSL
, shift
);
15446 /* Try a 2-byte immediate with LSL. */
15447 unsigned int imm16
= val32
& 0xffff;
15448 if (imm16
== (val32
>> 16))
15449 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
15450 if ((imm16
& (0xff << shift
)) == imm16
)
15453 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
15454 simd_immediate_info::LSL
, shift
);
15458 /* Try a 4-byte immediate with MSL, except for cases that MVN
15460 if (which
== AARCH64_CHECK_MOV
)
15461 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
15463 unsigned int low
= (1 << shift
) - 1;
15464 if (((val32
& (0xff << shift
)) | low
) == val32
)
15467 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15468 simd_immediate_info::MSL
, shift
);
15476 /* Return true if replicating VAL64 is a valid immediate for the
15477 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15478 use it to describe valid immediates. */
15480 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
15481 simd_immediate_info
*info
,
15482 enum simd_immediate_check which
)
15484 unsigned int val32
= val64
& 0xffffffff;
15485 unsigned int val16
= val64
& 0xffff;
15486 unsigned int val8
= val64
& 0xff;
15488 if (val32
== (val64
>> 32))
15490 if ((which
& AARCH64_CHECK_ORR
) != 0
15491 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
15492 simd_immediate_info::MOV
))
15495 if ((which
& AARCH64_CHECK_BIC
) != 0
15496 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
15497 simd_immediate_info::MVN
))
15500 /* Try using a replicated byte. */
15501 if (which
== AARCH64_CHECK_MOV
15502 && val16
== (val32
>> 16)
15503 && val8
== (val16
>> 8))
15506 *info
= simd_immediate_info (QImode
, val8
);
15511 /* Try using a bit-to-bytemask. */
15512 if (which
== AARCH64_CHECK_MOV
)
15515 for (i
= 0; i
< 64; i
+= 8)
15517 unsigned char byte
= (val64
>> i
) & 0xff;
15518 if (byte
!= 0 && byte
!= 0xff)
15524 *info
= simd_immediate_info (DImode
, val64
);
15531 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15532 instruction. If INFO is nonnull, use it to describe valid immediates. */
15535 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
15536 simd_immediate_info
*info
)
15538 scalar_int_mode mode
= DImode
;
15539 unsigned int val32
= val64
& 0xffffffff;
15540 if (val32
== (val64
>> 32))
15543 unsigned int val16
= val32
& 0xffff;
15544 if (val16
== (val32
>> 16))
15547 unsigned int val8
= val16
& 0xff;
15548 if (val8
== (val16
>> 8))
15552 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
15553 if (IN_RANGE (val
, -0x80, 0x7f))
15555 /* DUP with no shift. */
15557 *info
= simd_immediate_info (mode
, val
);
15560 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
15562 /* DUP with LSL #8. */
15564 *info
= simd_immediate_info (mode
, val
);
15567 if (aarch64_bitmask_imm (val64
, mode
))
15571 *info
= simd_immediate_info (mode
, val
);
15577 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15578 it to describe valid immediates. */
15581 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
15583 if (x
== CONST0_RTX (GET_MODE (x
)))
15586 *info
= simd_immediate_info (DImode
, 0);
15590 /* Analyze the value as a VNx16BImode. This should be relatively
15591 efficient, since rtx_vector_builder has enough built-in capacity
15592 to store all VLA predicate constants without needing the heap. */
15593 rtx_vector_builder builder
;
15594 if (!aarch64_get_sve_pred_bits (builder
, x
))
15597 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15598 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15600 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15601 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15602 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15606 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15607 *info
= simd_immediate_info (int_mode
, pattern
);
15615 /* Return true if OP is a valid SIMD immediate for the operation
15616 described by WHICH. If INFO is nonnull, use it to describe valid
15619 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15620 enum simd_immediate_check which
)
15622 machine_mode mode
= GET_MODE (op
);
15623 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15624 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15627 if (vec_flags
& VEC_SVE_PRED
)
15628 return aarch64_sve_pred_valid_immediate (op
, info
);
15630 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15632 unsigned int n_elts
;
15633 if (GET_CODE (op
) == CONST_VECTOR
15634 && CONST_VECTOR_DUPLICATE_P (op
))
15635 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15636 else if ((vec_flags
& VEC_SVE_DATA
)
15637 && const_vec_series_p (op
, &base
, &step
))
15639 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15640 if (!aarch64_sve_index_immediate_p (base
)
15641 || !aarch64_sve_index_immediate_p (step
))
15645 *info
= simd_immediate_info (elt_mode
, base
, step
);
15648 else if (GET_CODE (op
) == CONST_VECTOR
15649 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15650 /* N_ELTS set above. */;
15654 scalar_float_mode elt_float_mode
;
15656 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15658 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15659 if (aarch64_float_const_zero_rtx_p (elt
)
15660 || aarch64_float_const_representable_p (elt
))
15663 *info
= simd_immediate_info (elt_float_mode
, elt
);
15668 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15672 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15674 /* Expand the vector constant out into a byte vector, with the least
15675 significant byte of the register first. */
15676 auto_vec
<unsigned char, 16> bytes
;
15677 bytes
.reserve (n_elts
* elt_size
);
15678 for (unsigned int i
= 0; i
< n_elts
; i
++)
15680 /* The vector is provided in gcc endian-neutral fashion.
15681 For aarch64_be Advanced SIMD, it must be laid out in the vector
15682 register in reverse order. */
15683 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15684 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15686 if (elt_mode
!= elt_int_mode
)
15687 elt
= gen_lowpart (elt_int_mode
, elt
);
15689 if (!CONST_INT_P (elt
))
15692 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15693 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15695 bytes
.quick_push (elt_val
& 0xff);
15696 elt_val
>>= BITS_PER_UNIT
;
15700 /* The immediate must repeat every eight bytes. */
15701 unsigned int nbytes
= bytes
.length ();
15702 for (unsigned i
= 8; i
< nbytes
; ++i
)
15703 if (bytes
[i
] != bytes
[i
- 8])
15706 /* Get the repeating 8-byte value as an integer. No endian correction
15707 is needed here because bytes is already in lsb-first order. */
15708 unsigned HOST_WIDE_INT val64
= 0;
15709 for (unsigned int i
= 0; i
< 8; i
++)
15710 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15711 << (i
* BITS_PER_UNIT
));
15713 if (vec_flags
& VEC_SVE_DATA
)
15714 return aarch64_sve_valid_immediate (val64
, info
);
15716 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
15719 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15720 has a step in the range of INDEX. Return the index expression if so,
15721 otherwise return null. */
15723 aarch64_check_zero_based_sve_index_immediate (rtx x
)
15726 if (const_vec_series_p (x
, &base
, &step
)
15727 && base
== const0_rtx
15728 && aarch64_sve_index_immediate_p (step
))
15733 /* Check of immediate shift constants are within range. */
15735 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
15737 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
15739 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
15741 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
15744 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15745 operation of width WIDTH at bit position POS. */
15748 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
15750 gcc_assert (CONST_INT_P (width
));
15751 gcc_assert (CONST_INT_P (pos
));
15753 unsigned HOST_WIDE_INT mask
15754 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
15755 return GEN_INT (mask
<< UINTVAL (pos
));
15759 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15761 if (GET_CODE (x
) == HIGH
15762 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15765 if (CONST_INT_P (x
))
15768 if (VECTOR_MODE_P (GET_MODE (x
)))
15770 /* Require predicate constants to be VNx16BI before RA, so that we
15771 force everything to have a canonical form. */
15772 if (!lra_in_progress
15773 && !reload_completed
15774 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
15775 && GET_MODE (x
) != VNx16BImode
)
15778 return aarch64_simd_valid_immediate (x
, NULL
);
15781 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15784 if (aarch64_sve_cnt_immediate_p (x
))
15787 return aarch64_classify_symbolic_expression (x
)
15788 == SYMBOL_TINY_ABSOLUTE
;
15791 /* Return a const_int vector of VAL. */
15793 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
15795 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
15796 return gen_const_vec_duplicate (mode
, c
);
15799 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15802 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
15804 machine_mode vmode
;
15806 vmode
= aarch64_simd_container_mode (mode
, 64);
15807 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
15808 return aarch64_simd_valid_immediate (op_v
, NULL
);
15811 /* Construct and return a PARALLEL RTX vector with elements numbering the
15812 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15813 the vector - from the perspective of the architecture. This does not
15814 line up with GCC's perspective on lane numbers, so we end up with
15815 different masks depending on our target endian-ness. The diagram
15816 below may help. We must draw the distinction when building masks
15817 which select one half of the vector. An instruction selecting
15818 architectural low-lanes for a big-endian target, must be described using
15819 a mask selecting GCC high-lanes.
15821 Big-Endian Little-Endian
15823 GCC 0 1 2 3 3 2 1 0
15824 | x | x | x | x | | x | x | x | x |
15825 Architecture 3 2 1 0 3 2 1 0
15827 Low Mask: { 2, 3 } { 0, 1 }
15828 High Mask: { 0, 1 } { 2, 3 }
15830 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15833 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
15835 rtvec v
= rtvec_alloc (nunits
/ 2);
15836 int high_base
= nunits
/ 2;
15842 if (BYTES_BIG_ENDIAN
)
15843 base
= high
? low_base
: high_base
;
15845 base
= high
? high_base
: low_base
;
15847 for (i
= 0; i
< nunits
/ 2; i
++)
15848 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
15850 t1
= gen_rtx_PARALLEL (mode
, v
);
15854 /* Check OP for validity as a PARALLEL RTX vector with elements
15855 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15856 from the perspective of the architecture. See the diagram above
15857 aarch64_simd_vect_par_cnst_half for more details. */
15860 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15864 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15867 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15868 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15869 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15872 if (count_op
!= count_ideal
)
15875 for (i
= 0; i
< count_ideal
; i
++)
15877 rtx elt_op
= XVECEXP (op
, 0, i
);
15878 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15880 if (!CONST_INT_P (elt_op
)
15881 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
15887 /* Return a PARALLEL containing NELTS elements, with element I equal
15888 to BASE + I * STEP. */
15891 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
15893 rtvec vec
= rtvec_alloc (nelts
);
15894 for (unsigned int i
= 0; i
< nelts
; ++i
)
15895 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
15896 return gen_rtx_PARALLEL (VOIDmode
, vec
);
15899 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15900 series with step STEP. */
15903 aarch64_stepped_int_parallel_p (rtx op
, int step
)
15905 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
15908 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
15909 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
15910 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
15911 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
15917 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15918 HIGH (exclusive). */
15920 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
15923 HOST_WIDE_INT lane
;
15924 gcc_assert (CONST_INT_P (operand
));
15925 lane
= INTVAL (operand
);
15927 if (lane
< low
|| lane
>= high
)
15930 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
15932 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
15936 /* Peform endian correction on lane number N, which indexes a vector
15937 of mode MODE, and return the result as an SImode rtx. */
15940 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
15942 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
15945 /* Return TRUE if OP is a valid vector addressing mode. */
15948 aarch64_simd_mem_operand_p (rtx op
)
15950 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
15951 || REG_P (XEXP (op
, 0)));
15954 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15957 aarch64_sve_ld1r_operand_p (rtx op
)
15959 struct aarch64_address_info addr
;
15963 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15964 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15965 && addr
.type
== ADDRESS_REG_IMM
15966 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15969 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15971 aarch64_sve_ld1rq_operand_p (rtx op
)
15973 struct aarch64_address_info addr
;
15974 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15976 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15979 if (addr
.type
== ADDRESS_REG_IMM
)
15980 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15982 if (addr
.type
== ADDRESS_REG_REG
)
15983 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
15988 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15989 The conditions for STR are the same. */
15991 aarch64_sve_ldr_operand_p (rtx op
)
15993 struct aarch64_address_info addr
;
15996 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
15997 false, ADDR_QUERY_ANY
)
15998 && addr
.type
== ADDRESS_REG_IMM
);
16001 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
16002 We need to be able to access the individual pieces, so the range
16003 is different from LD[234] and ST[234]. */
16005 aarch64_sve_struct_memory_operand_p (rtx op
)
16010 machine_mode mode
= GET_MODE (op
);
16011 struct aarch64_address_info addr
;
16012 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
16014 || addr
.type
!= ADDRESS_REG_IMM
)
16017 poly_int64 first
= addr
.const_offset
;
16018 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
16019 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
16020 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
16023 /* Emit a register copy from operand to operand, taking care not to
16024 early-clobber source registers in the process.
16026 COUNT is the number of components into which the copy needs to be
16029 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
16030 unsigned int count
)
16033 int rdest
= REGNO (operands
[0]);
16034 int rsrc
= REGNO (operands
[1]);
16036 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
16038 for (i
= 0; i
< count
; i
++)
16039 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
16040 gen_rtx_REG (mode
, rsrc
+ i
));
16042 for (i
= 0; i
< count
; i
++)
16043 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
16044 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
16047 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16048 one of VSTRUCT modes: OI, CI, or XI. */
16050 aarch64_simd_attr_length_rglist (machine_mode mode
)
16052 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16053 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
16056 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16057 alignment of a vector to 128 bits. SVE predicates have an alignment of
16059 static HOST_WIDE_INT
16060 aarch64_simd_vector_alignment (const_tree type
)
16062 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16063 be set for non-predicate vectors of booleans. Modes are the most
16064 direct way we have of identifying real SVE predicate types. */
16065 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
16067 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16069 return wi::umin (wi::to_wide (TYPE_SIZE (type
)), 128).to_uhwi ();
16072 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16074 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
16076 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
16078 /* If the length of the vector is fixed, try to align to that length,
16079 otherwise don't try to align at all. */
16080 HOST_WIDE_INT result
;
16081 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
16082 result
= TYPE_ALIGN (TREE_TYPE (type
));
16085 return TYPE_ALIGN (type
);
16088 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16090 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
16095 /* For fixed-length vectors, check that the vectorizer will aim for
16096 full-vector alignment. This isn't true for generic GCC vectors
16097 that are wider than the ABI maximum of 128 bits. */
16098 poly_uint64 preferred_alignment
=
16099 aarch64_vectorize_preferred_vector_alignment (type
);
16100 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
16101 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
16102 preferred_alignment
))
16105 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16109 /* Return true if the vector misalignment factor is supported by the
16112 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
16113 const_tree type
, int misalignment
,
16116 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
16118 /* Return if movmisalign pattern is not supported for this mode. */
16119 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
16122 /* Misalignment factor is unknown at compile time. */
16123 if (misalignment
== -1)
16126 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
16130 /* If VALS is a vector constant that can be loaded into a register
16131 using DUP, generate instructions to do so and return an RTX to
16132 assign to the register. Otherwise return NULL_RTX. */
16134 aarch64_simd_dup_constant (rtx vals
)
16136 machine_mode mode
= GET_MODE (vals
);
16137 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16140 if (!const_vec_duplicate_p (vals
, &x
))
16143 /* We can load this constant by using DUP and a constant in a
16144 single ARM register. This will be cheaper than a vector
16146 x
= copy_to_mode_reg (inner_mode
, x
);
16147 return gen_vec_duplicate (mode
, x
);
/* NOTE(review): extraction-mangled listing — some original lines (the
   declarations of i/n_const/const_dup, braces, returns) are elided;
   code kept verbatim, comments only.  */
16151 /* Generate code to load VALS, which is a PARALLEL containing only
16152 constants (for vec_init) or CONST_VECTOR, efficiently into a
16153 register. Returns an RTX to copy into the register, or NULL_RTX
16154 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16156 aarch64_simd_make_constant (rtx vals
)
16158 machine_mode mode
= GET_MODE (vals
);
16160 rtx const_vec
= NULL_RTX
;
16164 if (GET_CODE (vals
) == CONST_VECTOR
)
16166 else if (GET_CODE (vals
) == PARALLEL
)
16168 /* A CONST_VECTOR must contain only CONST_INTs and
16169 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16170 Only store valid constants in a CONST_VECTOR. */
16171 int n_elts
= XVECLEN (vals
, 0);
16172 for (i
= 0; i
< n_elts
; ++i
)
16174 rtx x
= XVECEXP (vals
, 0, i
);
16175 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
/* Only build a CONST_VECTOR when every PARALLEL element qualified.  */
16178 if (n_const
== n_elts
)
16179 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
16182 gcc_unreachable ();
16184 if (const_vec
!= NULL_RTX
16185 && aarch64_simd_valid_immediate (const_vec
, NULL
))
16186 /* Load using MOVI/MVNI. */
16188 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
16189 /* Loaded using DUP. */
16191 else if (const_vec
!= NULL_RTX
)
16192 /* Load from constant pool. We cannot take advantage of single-cycle
16193 LD1 because we need a PC-relative addressing mode. */
16196 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16197 We cannot construct an initializer. */
/* NOTE(review): extraction-mangled listing — many original lines (braces,
   the n_var counter declaration/updates, early returns, maxv declaration)
   are elided; code kept verbatim, comments only.  Strategy visible below:
   (1) two half-vector concatenation, (2) all-constant via
   aarch64_simd_make_constant, (3) all-same splat via DUP, (4) all-variable
   dup-most-common + lane inserts, (5) part-variable: load constant shell
   then insert variable lanes.  */
16201 /* Expand a vector initialisation sequence, such that TARGET is
16202 initialised to contain VALS. */
16205 aarch64_expand_vector_init (rtx target
, rtx vals
)
16207 machine_mode mode
= GET_MODE (target
);
16208 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
16209 /* The number of vector elements. */
16210 int n_elts
= XVECLEN (vals
, 0);
16211 /* The number of vector elements which are not constant. */
16213 rtx any_const
= NULL_RTX
;
16214 /* The first element of vals. */
16215 rtx v0
= XVECEXP (vals
, 0, 0);
16216 bool all_same
= true;
16218 /* This is a special vec_init<M><N> where N is not an element mode but a
16219 vector mode with half the elements of M. We expect to find two entries
16220 of mode N in VALS and we must put their concatentation into TARGET. */
16221 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
16223 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
16224 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
16225 rtx lo
= XVECEXP (vals
, 0, 0);
16226 rtx hi
= XVECEXP (vals
, 0, 1);
16227 machine_mode narrow_mode
= GET_MODE (lo
);
16228 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
16229 gcc_assert (narrow_mode
== GET_MODE (hi
));
16231 /* When we want to concatenate a half-width vector with zeroes we can
16232 use the aarch64_combinez[_be] patterns. Just make sure that the
16233 zeroes are in the right half. */
16234 if (BYTES_BIG_ENDIAN
16235 && aarch64_simd_imm_zero (lo
, narrow_mode
)
16236 && general_operand (hi
, narrow_mode
))
16237 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
16238 else if (!BYTES_BIG_ENDIAN
16239 && aarch64_simd_imm_zero (hi
, narrow_mode
)
16240 && general_operand (lo
, narrow_mode
))
16241 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
16244 /* Else create the two half-width registers and combine them. */
16246 lo
= force_reg (GET_MODE (lo
), lo
);
16248 hi
= force_reg (GET_MODE (hi
), hi
);
16250 if (BYTES_BIG_ENDIAN
)
16251 std::swap (lo
, hi
);
16252 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
16257 /* Count the number of variable elements to initialise. */
16258 for (int i
= 0; i
< n_elts
; ++i
)
16260 rtx x
= XVECEXP (vals
, 0, i
);
16261 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
16266 all_same
&= rtx_equal_p (x
, v0
);
16269 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16270 how best to handle this. */
16273 rtx constant
= aarch64_simd_make_constant (vals
);
16274 if (constant
!= NULL_RTX
)
16276 emit_move_insn (target
, constant
);
16281 /* Splat a single non-constant element if we can. */
16284 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
16285 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16289 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
16290 gcc_assert (icode
!= CODE_FOR_nothing
);
16292 /* If there are only variable elements, try to optimize
16293 the insertion using dup for the most common element
16294 followed by insertions. */
16296 /* The algorithm will fill matches[*][0] with the earliest matching element,
16297 and matches[X][1] with the count of duplicate elements (if X is the
16298 earliest element which has duplicates). */
16300 if (n_var
== n_elts
&& n_elts
<= 16)
16302 int matches
[16][2] = {0};
16303 for (int i
= 0; i
< n_elts
; i
++)
16305 for (int j
= 0; j
<= i
; j
++)
16307 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
16315 int maxelement
= 0;
16317 for (int i
= 0; i
< n_elts
; i
++)
16318 if (matches
[i
][1] > maxv
)
16321 maxv
= matches
[i
][1];
16324 /* Create a duplicate of the most common element, unless all elements
16325 are equally useless to us, in which case just immediately set the
16326 vector register using the first element. */
16330 /* For vectors of two 64-bit elements, we can do even better. */
16332 && (inner_mode
== E_DImode
16333 || inner_mode
== E_DFmode
))
16336 rtx x0
= XVECEXP (vals
, 0, 0);
16337 rtx x1
= XVECEXP (vals
, 0, 1);
16338 /* Combine can pick up this case, but handling it directly
16339 here leaves clearer RTL.
16341 This is load_pair_lanes<mode>, and also gives us a clean-up
16342 for store_pair_lanes<mode>. */
16343 if (memory_operand (x0
, inner_mode
)
16344 && memory_operand (x1
, inner_mode
)
16345 && !STRICT_ALIGNMENT
16346 && rtx_equal_p (XEXP (x1
, 0),
16347 plus_constant (Pmode
,
16349 GET_MODE_SIZE (inner_mode
))))
16352 if (inner_mode
== DFmode
)
16353 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
16355 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
16360 /* The subreg-move sequence below will move into lane zero of the
16361 vector register. For big-endian we want that position to hold
16362 the last element of VALS. */
16363 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
16364 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16365 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
16369 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16370 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16373 /* Insert the rest. */
16374 for (int i
= 0; i
< n_elts
; i
++)
16376 rtx x
= XVECEXP (vals
, 0, i
);
16377 if (matches
[i
][0] == maxelement
)
16379 x
= copy_to_mode_reg (inner_mode
, x
);
16380 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16385 /* Initialise a vector which is part-variable. We want to first try
16386 to build those lanes which are constant in the most efficient way we
16388 if (n_var
!= n_elts
)
16390 rtx copy
= copy_rtx (vals
);
16392 /* Load constant part of vector. We really don't care what goes into the
16393 parts we will overwrite, but we're more likely to be able to load the
16394 constant efficiently if it has fewer, larger, repeating parts
16395 (see aarch64_simd_valid_immediate). */
16396 for (int i
= 0; i
< n_elts
; i
++)
16398 rtx x
= XVECEXP (vals
, 0, i
);
16399 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16401 rtx subst
= any_const
;
16402 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
16404 /* Look in the copied vector, as more elements are const. */
16405 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
16406 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
16412 XVECEXP (copy
, 0, i
) = subst
;
16414 aarch64_expand_vector_init (target
, copy
);
16417 /* Insert the variable lanes directly. */
16418 for (int i
= 0; i
< n_elts
; i
++)
16420 rtx x
= XVECEXP (vals
, 0, i
);
16421 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16423 x
= copy_to_mode_reg (inner_mode
, x
);
16424 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
/* NOTE(review): extraction-mangled listing — return type and braces elided;
   code kept verbatim, comments only.  Uses the target's INSR pattern
   (vec_shl_insert) to shift TARGET up one lane and insert ELEM at lane 0.  */
16428 /* Emit RTL corresponding to:
16429 insr TARGET, ELEM. */
16432 emit_insr (rtx target
, rtx elem
)
16434 machine_mode mode
= GET_MODE (target
);
16435 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16436 elem
= force_reg (elem_mode
, elem
);
16438 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
16439 gcc_assert (icode
!= CODE_FOR_nothing
);
16440 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
/* NOTE(review): extraction-mangled listing — return type, loop decrement,
   braces and the true/false returns are elided; code kept verbatim,
   comments only.  */
16443 /* Subroutine of aarch64_sve_expand_vector_init for handling
16444 trailing constants.
16445 This function works as follows:
16446 (a) Create a new vector consisting of trailing constants.
16447 (b) Initialize TARGET with the constant vector using emit_move_insn.
16448 (c) Insert remaining elements in TARGET using insr.
16449 NELTS is the total number of elements in original vector while
16450 while NELTS_REQD is the number of elements that are actually
16453 ??? The heuristic used is to do above only if number of constants
16454 is at least half the total number of elements. May need fine tuning. */
16457 aarch64_sve_expand_vector_init_handle_trailing_constants
16458 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
16460 machine_mode mode
= GET_MODE (target
);
16461 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
/* Count how many elements at the tail of BUILDER are legitimate constants.  */
16462 int n_trailing_constants
= 0;
16464 for (int i
= nelts_reqd
- 1;
16465 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
16467 n_trailing_constants
++;
16469 if (n_trailing_constants
>= nelts_reqd
/ 2)
16471 rtx_vector_builder
v (mode
, 1, nelts
);
16472 for (int i
= 0; i
< nelts
; i
++)
16473 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
16474 rtx const_vec
= v
.build ();
16475 emit_move_insn (target
, const_vec
);
/* Shift the non-constant prefix in from lane 0, last-first.  */
16477 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
16478 emit_insr (target
, builder
.elt (i
));
/* NOTE(review): extraction-mangled listing — return type, the nelts_reqd
   parameter line and braces are elided; code kept verbatim, comments only.  */
16486 /* Subroutine of aarch64_sve_expand_vector_init.
16488 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16489 (b) Skip trailing elements from BUILDER, which are the same as
16490 element NELTS_REQD - 1.
16491 (c) Insert earlier elements in reverse order in TARGET using insr. */
16494 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
16495 const rtx_vector_builder
&builder
,
16498 machine_mode mode
= GET_MODE (target
);
16499 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16501 struct expand_operand ops
[2];
16502 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
16503 gcc_assert (icode
!= CODE_FOR_nothing
);
/* Broadcast the last requested element across TARGET.  */
16505 create_output_operand (&ops
[0], target
, mode
);
16506 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
16507 expand_insn (icode
, 2, ops
);
/* Trailing duplicates of that element are already in place; insert the
   remaining earlier elements in reverse order.  */
16509 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16510 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
16511 emit_insr (target
, builder
.elt (i
));
/* NOTE(review): extraction-mangled listing — return type, braces and the
   true/false returns are elided; code kept verbatim, comments only.  */
16514 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16515 when all trailing elements of builder are same.
16516 This works as follows:
16517 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16518 (b) Insert remaining elements in TARGET using insr.
16520 ??? The heuristic used is to do above if number of same trailing elements
16521 is at least 3/4 of total number of elements, loosely based on
16522 heuristic from mostly_zeros_p. May need fine-tuning. */
16525 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16526 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
16528 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16529 if (ndups
>= (3 * nelts_reqd
) / 4)
16531 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
16532 nelts_reqd
- ndups
+ 1);
/* NOTE(review): extraction-mangled listing — return type, braces, the
   returns after each case, and v_odd.finalize() are among the elided
   lines; code kept verbatim, comments only.  */
16539 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16540 of elements in BUILDER.
16542 The function tries to initialize TARGET from BUILDER if it fits one
16543 of the special cases outlined below.
16545 Failing that, the function divides BUILDER into two sub-vectors:
16546 v_even = even elements of BUILDER;
16547 v_odd = odd elements of BUILDER;
16549 and recursively calls itself with v_even and v_odd.
16551 if (recursive call succeeded for v_even or v_odd)
16552 TARGET = zip (v_even, v_odd)
16554 The function returns true if it managed to build TARGET from BUILDER
16555 with one of the special cases, false otherwise.
16557 Example: {a, 1, b, 2, c, 3, d, 4}
16559 The vector gets divided into:
16560 v_even = {a, b, c, d}
16561 v_odd = {1, 2, 3, 4}
16563 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16564 initialize tmp2 from constant vector v_odd using emit_move_insn.
16566 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16567 4 elements, so we construct tmp1 from v_even using insr:
16574 TARGET = zip (tmp1, tmp2)
16575 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16578 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
16579 int nelts
, int nelts_reqd
)
16581 machine_mode mode
= GET_MODE (target
);
16583 /* Case 1: Vector contains trailing constants. */
16585 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16586 (target
, builder
, nelts
, nelts_reqd
))
16589 /* Case 2: Vector contains leading constants. */
16591 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
16592 for (int i
= 0; i
< nelts_reqd
; i
++)
16593 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
16594 rev_builder
.finalize ();
16596 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16597 (target
, rev_builder
, nelts
, nelts_reqd
))
/* Built the reversed vector; undo the reversal with an SVE REV.  */
16599 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16603 /* Case 3: Vector contains trailing same element. */
16605 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16606 (target
, builder
, nelts_reqd
))
16609 /* Case 4: Vector contains leading same element. */
16611 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16612 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
16614 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16618 /* Avoid recursing below 4-elements.
16619 ??? The threshold 4 may need fine-tuning. */
16621 if (nelts_reqd
<= 4)
16624 rtx_vector_builder
v_even (mode
, 1, nelts
);
16625 rtx_vector_builder
v_odd (mode
, 1, nelts
);
16627 for (int i
= 0; i
< nelts
* 2; i
+= 2)
16629 v_even
.quick_push (builder
.elt (i
));
16630 v_odd
.quick_push (builder
.elt (i
+ 1));
16633 v_even
.finalize ();
16636 rtx tmp1
= gen_reg_rtx (mode
);
16637 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
16638 nelts
, nelts_reqd
/ 2);
16640 rtx tmp2
= gen_reg_rtx (mode
);
16641 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
16642 nelts
, nelts_reqd
/ 2);
16644 if (!did_even_p
&& !did_odd_p
)
16647 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16648 special cases and zip v_even, v_odd. */
16651 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
16654 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
/* Interleave the two halves back together with ZIP1.  */
16656 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
16657 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
/* NOTE(review): extraction-mangled listing — return type, v.finalize()
   and the nelts < 4 guard are among the elided lines; code kept verbatim,
   comments only.  PARALLEL-rtx entry point wrapping the builder overload.  */
16661 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16664 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
16666 machine_mode mode
= GET_MODE (target
);
16667 int nelts
= XVECLEN (vals
, 0);
16669 rtx_vector_builder
v (mode
, 1, nelts
);
16670 for (int i
= 0; i
< nelts
; i
++)
16671 v
.quick_push (XVECEXP (vals
, 0, i
));
16674 /* If neither sub-vectors of v could be initialized specially,
16675 then use INSR to insert all elements from v into TARGET.
16676 ??? This might not be optimal for vectors with large
16677 initializers like 16-element or above.
16678 For nelts < 4, it probably isn't useful to handle specially. */
16681 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
16682 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
/* NOTE(review): extraction-mangled listing — return type, the negate
   variable declaration, NULL_RTX returns and the shift < 0 check are
   among the elided lines; code kept verbatim, comments only.  */
16685 /* Check whether VALUE is a vector constant in which every element
16686 is either a power of 2 or a negated power of 2. If so, return
16687 a constant vector of log2s, and flip CODE between PLUS and MINUS
16688 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16691 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
16693 if (GET_CODE (value
) != CONST_VECTOR
)
16696 rtx_vector_builder builder
;
16697 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
16700 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
16701 /* 1 if the result of the multiplication must be negated,
16702 0 if it mustn't, or -1 if we don't yet care. */
16704 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
16705 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
16707 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
16708 if (!CONST_SCALAR_INT_P (elt
))
16710 rtx_mode_t
val (elt
, int_mode
);
16711 wide_int pow2
= wi::neg (val
);
16714 /* It matters whether we negate or not. Make that choice,
16715 and make sure that it's consistent with previous elements. */
16716 if (negate
== !wi::neg_p (val
))
16718 negate
= wi::neg_p (val
);
16722 /* POW2 is now the value that we want to be a power of 2. */
16723 int shift
= wi::exact_log2 (pow2
);
16726 builder
.quick_push (gen_int_mode (shift
, int_mode
));
16729 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16731 else if (negate
== 1)
16732 code
= code
== PLUS
? MINUS
: PLUS
;
16733 return builder
.build ();
/* NOTE(review): extraction-mangled listing — return type, braces and the
   true/false returns are elided; code kept verbatim, comments only.
   Rewrites mul+add/sub by a power-of-2 vector constant as shift+add/sub.  */
16736 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16737 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16738 operands array, in the same order as for fma_optab. Return true if
16739 the function emitted all the necessary instructions, false if the caller
16740 should generate the pattern normally with the new OPERANDS array. */
16743 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
16745 machine_mode mode
= GET_MODE (operands
[0]);
16746 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
16748 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
16749 NULL_RTX
, true, OPTAB_DIRECT
);
16750 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
16751 operands
[3], product
, operands
[0], true,
/* Multiplier is not a shiftable constant; force it into a register.  */
16755 operands
[2] = force_reg (mode
, operands
[2]);
/* NOTE(review): extraction-mangled listing — return type, braces and the
   true/false returns are elided; code kept verbatim, comments only.
   Conditional (predicated) variant of aarch64_prepare_sve_int_fma.  */
16759 /* Likewise, but for a conditional pattern. */
16762 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
16764 machine_mode mode
= GET_MODE (operands
[0]);
16765 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
16767 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
16768 NULL_RTX
, true, OPTAB_DIRECT
);
16769 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
16770 operands
[4], product
, operands
[5]));
16773 operands
[3] = force_reg (mode
, operands
[3]);
/* NOTE(review): extraction-mangled listing — braces and the 0 return for
   the guarded case are elided; code kept verbatim, comments only.
   Implements TARGET_SHIFT_TRUNCATION_MASK: no truncation for vector modes,
   otherwise mask to the element bit-width minus one.  */
16777 static unsigned HOST_WIDE_INT
16778 aarch64_shift_truncation_mask (machine_mode mode
)
16780 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
16782 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
/* NOTE(review): extraction-mangled listing — return type, the type
   variable declaration, braces, breaks and the default label are elided;
   code kept verbatim, comments only.  */
16785 /* Select a format to encode pointers in exception handling data. */
16787 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
16790 switch (aarch64_cmodel
)
16792 case AARCH64_CMODEL_TINY
:
16793 case AARCH64_CMODEL_TINY_PIC
:
16794 case AARCH64_CMODEL_SMALL
:
16795 case AARCH64_CMODEL_SMALL_PIC
:
16796 case AARCH64_CMODEL_SMALL_SPIC
:
16797 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16799 type
= DW_EH_PE_sdata4
;
16802 /* No assumptions here. 8-byte relocs required. */
16803 type
= DW_EH_PE_sdata8
;
/* Globals are referenced indirectly; everything is PC-relative.  */
16806 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  Emits the .variant_pcs
   assembler directive for symbols using the vector PCS variant.  */
16809 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16812 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
16814 if (aarch64_simd_decl_p (decl
))
16816 fprintf (stream
, "\t.variant_pcs\t");
16817 assemble_name (stream
, name
);
16818 fprintf (stream
, "\n");
/* NOTE(review): file-scope caches used to avoid re-emitting identical
   .arch/.tune directives between functions; written by
   aarch64_declare_function_name and reset by aarch64_start_file.  */
16822 /* The last .arch and .tune assembly strings that we printed. */
16823 static std::string aarch64_last_printed_arch_string
;
16824 static std::string aarch64_last_printed_tune_string
;
/* NOTE(review): extraction-mangled listing — the fndecl parameter line,
   braces and the if (target_parts) guard around the two
   TREE_TARGET_OPTION choices are among the elided lines; code kept
   verbatim, comments only.  */
16826 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16827 by the function fndecl. */
16830 aarch64_declare_function_name (FILE *stream
, const char* name
,
16833 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
16835 struct cl_target_option
*targ_options
;
16837 targ_options
= TREE_TARGET_OPTION (target_parts
);
16839 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
16840 gcc_assert (targ_options
);
16842 const struct processor
*this_arch
16843 = aarch64_get_arch (targ_options
->x_explicit_arch
);
16845 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
16846 std::string extension
16847 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
16849 /* Only update the assembler .arch string if it is distinct from the last
16850 such string we printed. */
16851 std::string to_print
= this_arch
->name
+ extension
;
16852 if (to_print
!= aarch64_last_printed_arch_string
)
16854 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
16855 aarch64_last_printed_arch_string
= to_print
;
16858 /* Print the cpu name we're tuning for in the comments, might be
16859 useful to readers of the generated asm. Do it only when it changes
16860 from function to function and verbose assembly is requested. */
16861 const struct processor
*this_tune
16862 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
16864 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
16866 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
16868 aarch64_last_printed_tune_string
= this_tune
->name
;
16871 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
16873 /* Don't forget the type directive for ELF. */
16874 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
16875 ASM_OUTPUT_LABEL (stream
, name
);
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  Emits .variant_pcs for the
   alias symbol before defining the alias itself.  */
16878 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16881 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
16883 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
16884 const char *value
= IDENTIFIER_POINTER (target
);
16885 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16886 ASM_OUTPUT_DEF (stream
, name
, value
);
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  Delegates to the generic
   ELF handler, then tags external vector-PCS symbols.  */
16889 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16890 function symbol references. */
16893 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
16895 default_elf_asm_output_external (stream
, decl
, name
);
16896 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  */
16899 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16900 Used to output the .cfi_b_key_frame directive when signing the current
16901 function with the B key. */
16904 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
16906 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
16907 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
16908 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  Seeds the .arch/.tune
   caches from the command-line defaults and emits the initial .arch.  */
16911 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16914 aarch64_start_file (void)
16916 struct cl_target_option
*default_options
16917 = TREE_TARGET_OPTION (target_option_default_node
);
16919 const struct processor
*default_arch
16920 = aarch64_get_arch (default_options
->x_explicit_arch
);
16921 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
16922 std::string extension
16923 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
16924 default_arch
->flags
);
16926 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
16927 aarch64_last_printed_tune_string
= "";
16928 asm_fprintf (asm_out_file
, "\t.arch %s\n",
16929 aarch64_last_printed_arch_string
.c_str ());
16931 default_file_start ();
/* NOTE(review): extraction-mangled listing — return type, braces, the else
   and the mem/model arguments of the TImode call are elided; code kept
   verbatim, comments only.  TImode uses LDXP (load-exclusive pair of
   64-bit halves); other modes use the single load-exclusive pattern.  */
16934 /* Emit load exclusive. */
16937 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
16938 rtx mem
, rtx model_rtx
)
16940 if (mode
== TImode
)
16941 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
16942 gen_highpart (DImode
, rval
),
16945 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
/* NOTE(review): extraction-mangled listing — return type, braces and the
   else are elided; code kept verbatim, comments only.  TImode uses STXP
   with the value split into two 64-bit subwords.  */
16948 /* Emit store exclusive. */
16951 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
16952 rtx mem
, rtx rval
, rtx model_rtx
)
16954 if (mode
== TImode
)
16955 emit_insn (gen_aarch64_store_exclusive_pair
16956 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
16957 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
16959 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  Emits INSN as a jump and
   attaches a very-unlikely branch-probability note.  */
16962 /* Mark the previous jump instruction as unlikely. */
16965 aarch64_emit_unlikely_jump (rtx insn
)
16967 rtx_insn
*jump
= emit_jump_insn (insn
);
16968 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
/* NOTE(review): extraction-mangled listing — the two switch statements
   mapping MODE to mode_idx and MODEL to model_idx (and the index
   assignments/breaks) are largely elided; code kept verbatim, comments
   only.  Returns the hidden-visibility libcall symbol for the
   out-of-line atomic helper selected by MODE and MODEL.  */
16971 /* We store the names of the various atomic helpers in a 5x4 array.
16972 Return the libcall function given MODE, MODEL and NAMES. */
16975 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
16976 const atomic_ool_names
*names
)
16978 memmodel model
= memmodel_base (INTVAL (model_rtx
));
16979 int mode_idx
, model_idx
;
16999 gcc_unreachable ();
17004 case MEMMODEL_RELAXED
:
17007 case MEMMODEL_CONSUME
:
17008 case MEMMODEL_ACQUIRE
:
17011 case MEMMODEL_RELEASE
:
17014 case MEMMODEL_ACQ_REL
:
17015 case MEMMODEL_SEQ_CST
:
17019 gcc_unreachable ();
17022 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
17023 VISIBILITY_HIDDEN
);
/* NOTE(review): extraction-mangled listing — initializers are split across
   lines; code kept verbatim, comments only.  DEF0 builds the four
   memory-model name variants for one size; DEF4 covers sizes 1/2/4/8
   (no 16-byte helper), DEF5 adds the 16-byte CAS.  These name the
   __aarch64_<op><size>_<model> outline-atomics libcalls.  */
17026 #define DEF0(B, N) \
17027 { "__aarch64_" #B #N "_relax", \
17028 "__aarch64_" #B #N "_acq", \
17029 "__aarch64_" #B #N "_rel", \
17030 "__aarch64_" #B #N "_acq_rel" }
17032 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17033 { NULL, NULL, NULL, NULL }
17034 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17036 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
17037 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
17038 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
17039 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
17040 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
17041 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
/* NOTE(review): extraction-mangled listing — the mem = operands[2]
   assignment, r_mode setup, the TARGET_LSE branch head, braces and else
   lines are among the elided lines; code kept verbatim, comments only.
   Three visible strategies: LSE CAS instruction, outline-atomics
   libcall, or the generic load/store-exclusive loop pattern.  */
17047 /* Expand a compare and swap pattern. */
17050 aarch64_expand_compare_and_swap (rtx operands
[])
17052 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
17053 machine_mode mode
, r_mode
;
17055 bval
= operands
[0];
17056 rval
= operands
[1];
17058 oldval
= operands
[3];
17059 newval
= operands
[4];
17060 is_weak
= operands
[5];
17061 mod_s
= operands
[6];
17062 mod_f
= operands
[7];
17063 mode
= GET_MODE (mem
);
17065 /* Normally the succ memory model must be stronger than fail, but in the
17066 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17067 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17068 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
17069 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
17070 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
/* Sub-word compare-and-swap is widened to SImode.  */
17073 if (mode
== QImode
|| mode
== HImode
)
17076 rval
= gen_reg_rtx (r_mode
);
17081 /* The CAS insn requires oldval and rval overlap, but we need to
17082 have a copy of oldval saved across the operation to tell if
17083 the operation is successful. */
17084 if (reg_overlap_mentioned_p (rval
, oldval
))
17085 rval
= copy_to_mode_reg (r_mode
, oldval
);
17087 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
17089 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
17091 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
17093 else if (TARGET_OUTLINE_ATOMICS
)
17095 /* Oldval must satisfy compare afterward. */
17096 if (!aarch64_plus_operand (oldval
, mode
))
17097 oldval
= force_reg (mode
, oldval
);
17098 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
17099 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
17100 oldval
, mode
, newval
, mode
,
17101 XEXP (mem
, 0), Pmode
);
17102 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
17106 /* The oldval predicate varies by mode. Test it and force to reg. */
17107 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
17108 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
17109 oldval
= force_reg (mode
, oldval
);
17111 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
17112 is_weak
, mod_s
, mod_f
));
17113 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
/* Narrow the widened result back and publish result + success flag.  */
17116 if (r_mode
!= mode
)
17117 rval
= gen_lowpart (mode
, rval
);
17118 emit_move_insn (operands
[1], rval
);
17120 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
17121 emit_insn (gen_rtx_SET (bval
, x
));
/* NOTE(review): extraction-mangled listing — return type and braces are
   elided; code kept verbatim, comments only.  Only __sync-style models
   with acquire semantics need the trailing SEQ_CST fence.  */
17124 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17125 sequence implementing an atomic operation. */
17128 aarch64_emit_post_barrier (enum memmodel model
)
17130 const enum memmodel base_model
= memmodel_base (model
);
17132 if (is_mm_sync (model
)
17133 && (base_model
== MEMMODEL_ACQUIRE
17134 || base_model
== MEMMODEL_ACQ_REL
17135 || base_model
== MEMMODEL_SEQ_CST
))
17137 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
/* NOTE(review): extraction-mangled listing — the mem = operands[1]
   assignment, the is_weak/mode declarations, braces, else lines and the
   strong_zero_p branch structure are among the elided lines; code kept
   verbatim, comments only.  Splits the CAS pattern into the explicit
   LDXR/compare/STXR retry loop after reload.  */
17141 /* Split a compare and swap pattern. */
17144 aarch64_split_compare_and_swap (rtx operands
[])
17146 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
17149 rtx_code_label
*label1
, *label2
;
17150 enum memmodel model
;
17152 rval
= operands
[0];
17154 oldval
= operands
[2];
17155 newval
= operands
[3];
17156 is_weak
= (operands
[4] != const0_rtx
);
17157 model_rtx
= operands
[5];
17158 scratch
= operands
[7];
17159 mode
= GET_MODE (mem
);
17160 model
= memmodel_from_int (INTVAL (model_rtx
));
17162 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17165 LD[A]XR rval, [mem]
17167 ST[L]XR scratch, newval, [mem]
17168 CBNZ scratch, .label1
17171 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
17172 oldval
== const0_rtx
&& mode
!= TImode
);
17177 label1
= gen_label_rtx ();
17178 emit_label (label1
);
17180 label2
= gen_label_rtx ();
17182 /* The initial load can be relaxed for a __sync operation since a final
17183 barrier will be emitted to stop code hoisting. */
17184 if (is_mm_sync (model
))
17185 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
17187 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
/* strong_zero_p path: branch out on RVAL != 0 via CBNZ.  */
17190 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
/* General path: full compare of RVAL against OLDVAL.  */
17193 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
17194 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
17196 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17197 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
17198 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17200 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
/* Strong CAS retries while the store-exclusive fails.  */
17204 if (aarch64_track_speculation
)
17206 /* Emit an explicit compare instruction, so that we can correctly
17207 track the condition codes. */
17208 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
17209 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
17212 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
17214 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17215 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
17216 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Weak CAS: just expose the store-exclusive result in the flags.  */
17219 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
17221 emit_label (label2
);
17223 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17224 to set the condition flags. If this is not used it will be removed by
17227 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
17229 /* Emit any final barrier needed for a __sync operation. */
17230 if (is_mm_sync (model
))
17231 aarch64_emit_post_barrier (model
);
/* NOTE(review): extraction-mangled listing — braces, the switch (code)
   head, the SET/MINUS case labels and several else lines are elided;
   code kept verbatim, comments only.  Splits an atomic RMW into the
   label / load-exclusive / compute / store-exclusive / retry loop;
   the visible special case inverts AND into BIC-style NOT(AND) and
   folds constant MINUS into PLUS of the negation.  */
17234 /* Split an atomic operation. */
17237 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
17238 rtx value
, rtx model_rtx
, rtx cond
)
17240 machine_mode mode
= GET_MODE (mem
);
17241 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
17242 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
17243 const bool is_sync
= is_mm_sync (model
);
17244 rtx_code_label
*label
;
17247 /* Split the atomic operation into a sequence. */
17248 label
= gen_label_rtx ();
17249 emit_label (label
);
/* Sub-word ops compute in the widened mode WMODE.  */
17252 new_out
= gen_lowpart (wmode
, new_out
);
17254 old_out
= gen_lowpart (wmode
, old_out
);
17257 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
17259 /* The initial load can be relaxed for a __sync operation since a final
17260 barrier will be emitted to stop code hoisting. */
17262 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
17263 GEN_INT (MEMMODEL_RELAXED
));
17265 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
/* NAND-style case: new = ~(old & value).  */
17274 x
= gen_rtx_AND (wmode
, old_out
, value
);
17275 emit_insn (gen_rtx_SET (new_out
, x
));
17276 x
= gen_rtx_NOT (wmode
, new_out
);
17277 emit_insn (gen_rtx_SET (new_out
, x
));
17281 if (CONST_INT_P (value
))
17283 value
= GEN_INT (-INTVAL (value
));
17286 /* Fall through. */
17289 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
17290 emit_insn (gen_rtx_SET (new_out
, x
));
17294 aarch64_emit_store_exclusive (mode
, cond
, mem
,
17295 gen_lowpart (mode
, new_out
), model_rtx
);
17297 if (aarch64_track_speculation
)
17299 /* Emit an explicit compare instruction, so that we can correctly
17300 track the condition codes. */
17301 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
17302 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
17305 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
17307 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
17308 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
17309 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
17311 /* Emit any final barrier needed for a __sync operation. */
17313 aarch64_emit_post_barrier (model
);
/* Register (or deliberately clear) library functions for HFmode.
   NOTE(review): extraction is lossy; the `static void` line and braces are
   missing from this text.  Code tokens left byte-identical.  */
17317 aarch64_init_libfuncs (void)
17319 /* Half-precision float operations. The compiler handles all operations
17320 with NULL libfuncs by converting to SFmode. */
/* Conversions use the __gnu_* IEEE half-precision helper routines.  */
17323 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
17324 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
/* Arithmetic: NULL forces widening to SFmode rather than a libcall.  */
17327 set_optab_libfunc (add_optab
, HFmode
, NULL
);
17328 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
17329 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
17330 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
17331 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
/* Comparisons likewise go through SFmode.  */
17334 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
17335 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
17336 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
17337 set_optab_libfunc (le_optab
, HFmode
, NULL
);
17338 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
17339 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
17340 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
17343 /* Target hook for c_mode_for_suffix. */
/* Map a C constant suffix character to a machine mode.
   NOTE(review): the function body (original lines ~17346-17352) is missing
   from this extraction; only the signature survives.  */
17344 static machine_mode
17345 aarch64_c_mode_for_suffix (char suffix
)
17353 /* We can only represent floating point constants which will fit in
17354 "quarter-precision" values. These values are characterised by
17355 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
17358 (-1)^s * (n/16) * 2^r
17361 's' is the sign bit.
17362 'n' is an integer in the range 16 <= n <= 31.
17363 'r' is an integer in the range -3 <= r <= 4. */
17365 /* Return true iff X can be represented by a quarter-precision
17366 floating point immediate operand X. Note, we cannot represent 0.0. */
/* NOTE(review): lossy extraction — declarations of `exponent`/`fail`, the
   early `return false;` statements and braces are stripped.  Comments only
   added; code byte-identical.  */
17368 aarch64_float_const_representable_p (rtx x
)
17370 /* This represents our current view of how many bits
17371 make up the mantissa. */
17372 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
17374 unsigned HOST_WIDE_INT mantissa
, mask
;
17375 REAL_VALUE_TYPE r
, m
;
/* A duplicated vector constant is judged by its element.  */
17378 x
= unwrap_const_vec_duplicate (x
);
17379 if (!CONST_DOUBLE_P (x
))
/* HFmode immediates need the FP16 instructions (TARGET_FP_F16INST).  */
17382 if (GET_MODE (x
) == VOIDmode
17383 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
17386 r
= *CONST_DOUBLE_REAL_VALUE (x
);
17388 /* We cannot represent infinities, NaNs or +/-zero. We won't
17389 know if we have +zero until we analyse the mantissa, but we
17390 can reject the other invalid values. */
17391 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
17392 || REAL_VALUE_MINUS_ZERO (r
))
17395 /* Extract exponent. */
17396 r
= real_value_abs (&r
);
17397 exponent
= REAL_EXP (&r
);
17399 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17400 highest (sign) bit, with a fixed binary point at bit point_pos.
17401 m1 holds the low part of the mantissa, m2 the high part.
17402 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17403 bits for the mantissa, this can fail (low bits will be lost). */
17404 real_ldexp (&m
, &r
, point_pos
- exponent
);
17405 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
17407 /* If the low part of the mantissa has bits set we cannot represent
17409 if (w
.ulow () != 0)
17411 /* We have rejected the lower HOST_WIDE_INT, so update our
17412 understanding of how many bits lie in the mantissa and
17413 look only at the high HOST_WIDE_INT. */
17414 mantissa
= w
.elt (1);
17415 point_pos
-= HOST_BITS_PER_WIDE_INT
;
17417 /* We can only represent values with a mantissa of the form 1.xxxx. */
17418 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
17419 if ((mantissa
& mask
) != 0)
17422 /* Having filtered unrepresentable values, we may now remove all
17423 but the highest 5 bits. */
17424 mantissa
>>= point_pos
- 5;
17426 /* We cannot represent the value 0.0, so reject it. This is handled
17431 /* Then, as bit 4 is always set, we can mask it off, leaving
17432 the mantissa in the range [0, 15]. */
17433 mantissa
&= ~(1 << 4);
17434 gcc_assert (mantissa
<= 15);
17436 /* GCC internally does not use IEEE754-like encoding (where normalized
17437 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17438 Our mantissa values are shifted 4 places to the left relative to
17439 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17440 by 5 places to correct for GCC's representation. */
17441 exponent
= 5 - exponent
;
/* Representable iff the 3-bit biased exponent fits (r in [-3, 4]).  */
17443 return (exponent
>= 0 && exponent
<= 7);
17446 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17447 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17448 output MOVI/MVNI, ORR or BIC immediate. */
/* NOTE(review): lossy extraction — the return type line, braces and some
   `else` arms are missing here.  Code tokens left byte-identical.  */
17450 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
17451 enum simd_immediate_check which
)
/* Assembly text is built into a static buffer; the returned pointer is
   only valid until the next call.  */
17454 static char templ
[40];
17455 const char *mnemonic
;
17456 const char *shift_op
;
17457 unsigned int lane_count
= 0;
17460 struct simd_immediate_info info
;
17462 /* This will return true to show const_vector is legal for use as either
17463 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17464 It will also update INFO to show how the immediate should be generated.
17465 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17466 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
17467 gcc_assert (is_valid
);
17469 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17470 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
/* Floating-point immediates: emit FMOV (after folding FP zero to an
   integer zero so the integer path handles it).  */
17472 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17474 gcc_assert (info
.insn
== simd_immediate_info::MOV
17475 && info
.u
.mov
.shift
== 0);
17476 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17477 move immediate path. */
17478 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17479 info
.u
.mov
.value
= GEN_INT (0);
17482 const unsigned int buf_size
= 20;
17483 char float_buf
[buf_size
] = {'\0'};
17484 real_to_decimal_for_mode (float_buf
,
17485 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17486 buf_size
, buf_size
, 1, info
.elt_mode
);
/* Scalar (one-lane) form uses the D register syntax.  */
17488 if (lane_count
== 1)
17489 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
17491 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
17492 lane_count
, element_char
, float_buf
);
17497 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
/* Integer immediates: MOVI/MVNI with optional LSL/MSL shift.  */
17499 if (which
== AARCH64_CHECK_MOV
)
17501 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
17502 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
17504 if (lane_count
== 1)
17505 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
17506 mnemonic
, UINTVAL (info
.u
.mov
.value
));
17507 else if (info
.u
.mov
.shift
)
17508 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17509 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
17510 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
17513 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17514 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
17515 element_char
, UINTVAL (info
.u
.mov
.value
));
17519 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17520 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
17521 if (info
.u
.mov
.shift
)
17522 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17523 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
17524 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
17527 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17528 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
17529 element_char
, UINTVAL (info
.u
.mov
.value
));
/* Return the asm string for moving IMMEDIATE into a scalar SIMD register of
   integer mode MODE, by broadcasting it into a vector and reusing the
   vector immediate-move printer.  NOTE(review): lossy extraction; return
   type line and braces missing.  Code byte-identical.  */
17535 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
17538 /* If a floating point number was passed and we desire to use it in an
17539 integer mode do the conversion to integer. */
17540 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
17542 unsigned HOST_WIDE_INT ival
;
17543 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
17544 gcc_unreachable ();
17545 immediate
= gen_int_mode (ival
, mode
);
17548 machine_mode vmode
;
17549 /* use a 64 bit mode for everything except for DI/DF mode, where we use
17550 a 128 bit vector mode. */
17551 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
17553 vmode
= aarch64_simd_container_mode (mode
, width
);
17554 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
17555 return aarch64_output_simd_mov_immediate (v_op
, width
);
17558 /* Return the output string to use for moving immediate CONST_VECTOR
17559 into an SVE register. */
/* NOTE(review): lossy extraction — return type, braces and some `else`
   arms are missing.  Code tokens left byte-identical.  */
17562 aarch64_output_sve_mov_immediate (rtx const_vector
)
17564 static char templ
[40];
17565 struct simd_immediate_info info
;
17568 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
17569 gcc_assert (is_valid
);
17571 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
/* Predicate registers: PFALSE for all-zero, PTRUE otherwise.  */
17573 machine_mode vec_mode
= GET_MODE (const_vector
);
17574 if (aarch64_sve_pred_mode_p (vec_mode
))
17576 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
17577 if (info
.insn
== simd_immediate_info::MOV
)
17579 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
17580 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
17584 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
17585 unsigned int total_bytes
;
/* Prefer the "vlNN" spelling when the full-vector pattern has a
   compile-time-constant byte count.  */
17586 if (info
.u
.pattern
== AARCH64_SV_ALL
17587 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
17588 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
17589 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
17591 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
17592 svpattern_token (info
.u
.pattern
));
/* Linear series constants map to INDEX.  */
17597 if (info
.insn
== simd_immediate_info::INDEX
)
17599 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
17600 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
17601 element_char
, INTVAL (info
.u
.index
.base
),
17602 INTVAL (info
.u
.index
.step
));
/* Floating-point duplicates: FMOV, with FP zero folded to integer 0.  */
17606 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17608 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17609 info
.u
.mov
.value
= GEN_INT (0);
17612 const int buf_size
= 20;
17613 char float_buf
[buf_size
] = {};
17614 real_to_decimal_for_mode (float_buf
,
17615 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17616 buf_size
, buf_size
, 1, info
.elt_mode
);
17618 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
17619 element_char
, float_buf
);
/* Everything else: integer MOV immediate.  */
17624 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
17625 element_char
, INTVAL (info
.u
.mov
.value
));
17629 /* Split operands into moves from op[1] + op[2] into op[0]. */
/* Split a combine of two V16QI halves into ordinary half-register moves,
   choosing an order (or a 3-XOR swap) that copes with overlap between the
   destination halves and the sources.  NOTE(review): lossy extraction;
   braces/returns missing.  Code byte-identical.  */
17632 aarch64_split_combinev16qi (rtx operands
[3])
17634 unsigned int dest
= REGNO (operands
[0]);
17635 unsigned int src1
= REGNO (operands
[1]);
17636 unsigned int src2
= REGNO (operands
[2]);
17637 machine_mode halfmode
= GET_MODE (operands
[1]);
17638 unsigned int halfregs
= REG_NREGS (operands
[1]);
17639 rtx destlo
, desthi
;
17641 gcc_assert (halfmode
== V16QImode
);
17643 if (src1
== dest
&& src2
== dest
+ halfregs
)
17645 /* No-op move. Can't split to nothing; emit something. */
17646 emit_note (NOTE_INSN_DELETED
);
17650 /* Preserve register attributes for variable tracking. */
17651 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
17652 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
17653 GET_MODE_SIZE (halfmode
));
17655 /* Special case of reversed high/low parts. */
17656 if (reg_overlap_mentioned_p (operands
[2], destlo
)
17657 && reg_overlap_mentioned_p (operands
[1], desthi
))
/* Swap the two sources in place with the classic triple-XOR trick.  */
17659 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17660 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
17661 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17663 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
17665 /* Try to avoid unnecessary moves if part of the result
17666 is in the right place already. */
17668 emit_move_insn (destlo
, operands
[1]);
17669 if (src2
!= dest
+ halfregs
)
17670 emit_move_insn (desthi
, operands
[2]);
/* Otherwise write the high half first so we don't clobber op[2].  */
17674 if (src2
!= dest
+ halfregs
)
17675 emit_move_insn (desthi
, operands
[2]);
17677 emit_move_insn (destlo
, operands
[1]);
17681 /* vec_perm support. */
/* Working state for expanding a constant vector permutation.
   NOTE(review): this extraction shows only some members; upstream also has
   fields (e.g. one_vector_p, testing_p) referenced by the evpc routines
   below — confirm against the full source.  */
17683 struct expand_vec_perm_d
/* Destination and the two input operands of the permute.  */
17685 rtx target
, op0
, op1
;
/* The (possibly variable-length) permutation selector.  */
17686 vec_perm_indices perm
;
/* Mode of TARGET.  */
17687 machine_mode vmode
;
/* aarch64_classify_vector_mode result for VMODE (VEC_ADVSIMD etc.).  */
17688 unsigned int vec_flags
;
17693 /* Generate a variable permutation. */
/* Emit a TBL-based byte permutation of OP0/OP1 by SEL into TARGET.
   One-operand permutes use TBL on a duplicated pair (V8QI) or directly
   (V16QI); two-operand permutes use a register pair and TBL2.
   NOTE(review): lossy extraction; braces and the `pair` declaration in
   the second half are missing.  Code byte-identical.  */
17696 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17698 machine_mode vmode
= GET_MODE (target
);
17699 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17701 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
17702 gcc_checking_assert (GET_MODE (op0
) == vmode
);
17703 gcc_checking_assert (GET_MODE (op1
) == vmode
);
17704 gcc_checking_assert (GET_MODE (sel
) == vmode
);
17705 gcc_checking_assert (TARGET_SIMD
);
/* Single-input case.  */
17709 if (vmode
== V8QImode
)
17711 /* Expand the argument to a V16QI mode by duplicating it. */
17712 rtx pair
= gen_reg_rtx (V16QImode
);
17713 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
17714 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17718 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
/* Two-input case.  */
17725 if (vmode
== V8QImode
)
17727 pair
= gen_reg_rtx (V16QImode
);
17728 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
17729 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
/* 16-byte inputs: build an OImode register pair and use TBL2.  */
17733 pair
= gen_reg_rtx (OImode
);
17734 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
17735 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
17740 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17741 NELT is the number of elements in the vector. */
/* NOTE(review): lossy extraction; braces and some declarations (mask)
   appear without their original lines.  Code byte-identical.  */
17744 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
17747 machine_mode vmode
= GET_MODE (target
);
17748 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17751 /* The TBL instruction does not use a modulo index, so we must take care
17752 of that ourselves. */
17753 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
17754 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
17755 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
17757 /* For big-endian, we also need to reverse the index within the vector
17758 (but not which vector). */
17759 if (BYTES_BIG_ENDIAN
)
17761 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17763 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
17764 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
17765 NULL
, 0, OPTAB_LIB_WIDEN
);
17767 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
17770 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
/* Helper: wrap OP0/OP1 in a two-operand UNSPEC of TARGET's mode and emit
   the SET.  NOTE(review): the return-type line is missing from this
   extraction.  */
17773 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
17775 emit_insn (gen_rtx_SET (target
,
17776 gen_rtx_UNSPEC (GET_MODE (target
),
17777 gen_rtvec (2, op0
, op1
), code
)));
17780 /* Expand an SVE vec_perm with the given operands. */
/* SVE TBL yields 0 for out-of-range selector elements instead of wrapping,
   so the selector must be reduced modulo the input length(s) before use;
   two-input permutes are done as two TBLs combined with OR/IORF.
   NOTE(review): lossy extraction; braces and returns missing.  */
17783 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17785 machine_mode data_mode
= GET_MODE (target
);
17786 machine_mode sel_mode
= GET_MODE (sel
);
17787 /* Enforced by the pattern condition. */
17788 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
17790 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17791 size of the two value vectors, i.e. the upper bits of the indices
17792 are effectively ignored. SVE TBL instead produces 0 for any
17793 out-of-range indices, so we need to modulo all the vec_perm indices
17794 to ensure they are all in range. */
17795 rtx sel_reg
= force_reg (sel_mode
, sel
);
17797 /* Check if the sel only references the first values vector. */
17798 if (GET_CODE (sel
) == CONST_VECTOR
17799 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
17801 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
17805 /* Check if the two values vectors are the same. */
17806 if (rtx_equal_p (op0
, op1
))
17808 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
17809 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17810 NULL
, 0, OPTAB_DIRECT
);
17811 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
17815 /* Run TBL on for each value vector and combine the results. */
17817 rtx res0
= gen_reg_rtx (data_mode
);
17818 rtx res1
= gen_reg_rtx (data_mode
);
17819 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
17820 if (GET_CODE (sel
) != CONST_VECTOR
17821 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
17823 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
17825 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17826 NULL
, 0, OPTAB_DIRECT
);
/* First TBL selects from OP0; the second uses SEL - NUNITS on OP1.  */
17828 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
17829 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
17830 NULL
, 0, OPTAB_DIRECT
);
17831 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
/* Integer vectors can use a plain IOR; FP vectors need the unspec.  */
17832 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
17833 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
17835 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
17838 /* Recognize patterns suitable for the TRN instructions. */
/* Match selectors of the form {odd, nelt+odd, odd+2, nelt+odd+2, ...}
   (odd in {0,1}) and emit TRN1/TRN2.  NOTE(review): lossy extraction —
   `return false`, testing_p handling and some assignments are missing.  */
17840 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17843 poly_uint64 nelt
= d
->perm
.length ();
17844 rtx out
, in0
, in1
, x
;
17845 machine_mode vmode
= d
->vmode
;
17847 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17850 /* Note that these are little-endian tests.
17851 We correct for big-endian later. */
17852 if (!d
->perm
[0].is_constant (&odd
)
17853 || (odd
!= 0 && odd
!= 1)
17854 || !d
->perm
.series_p (0, 2, odd
, 2)
17855 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17864 /* We don't need a big-endian lane correction for SVE; see the comment
17865 at the head of aarch64-sve.md for details. */
17866 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17868 x
= in0
, in0
= in1
, in1
= x
;
17873 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17874 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17878 /* Recognize patterns suitable for the UZP instructions. */
/* Match selectors {odd, odd+2, odd+4, ...} (odd in {0,1}) and emit
   UZP1/UZP2.  NOTE(review): lossy extraction — `return false` and
   testing_p handling are missing.  */
17880 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17883 rtx out
, in0
, in1
, x
;
17884 machine_mode vmode
= d
->vmode
;
17886 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17889 /* Note that these are little-endian tests.
17890 We correct for big-endian later. */
17891 if (!d
->perm
[0].is_constant (&odd
)
17892 || (odd
!= 0 && odd
!= 1)
17893 || !d
->perm
.series_p (0, 1, odd
, 2))
17902 /* We don't need a big-endian lane correction for SVE; see the comment
17903 at the head of aarch64-sve.md for details. */
17904 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17906 x
= in0
, in0
= in1
, in1
= x
;
17911 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17912 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17916 /* Recognize patterns suitable for the ZIP instructions. */
/* Match interleave selectors {first, nelt+first, first+1, nelt+first+1, ...}
   with first in {0, nelt/2} and emit ZIP1/ZIP2.  NOTE(review): lossy
   extraction — `return false` and testing_p handling are missing.  */
17918 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17921 poly_uint64 nelt
= d
->perm
.length ();
17922 rtx out
, in0
, in1
, x
;
17923 machine_mode vmode
= d
->vmode
;
17925 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17928 /* Note that these are little-endian tests.
17929 We correct for big-endian later. */
17930 poly_uint64 first
= d
->perm
[0];
17931 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17932 || !d
->perm
.series_p (0, 2, first
, 1)
17933 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
/* HIGH selects ZIP2 (upper halves) vs ZIP1 (lower halves).  */
17935 high
= maybe_ne (first
, 0U);
17943 /* We don't need a big-endian lane correction for SVE; see the comment
17944 at the head of aarch64-sve.md for details. */
17945 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17947 x
= in0
, in0
= in1
, in1
= x
;
17952 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17953 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17957 /* Recognize patterns for the EXT insn. */
/* Match a selector that is a run of consecutive indices starting at
   LOCATION and emit an EXT (vector extract/concatenate) instruction.
   NOTE(review): lossy extraction — returns, testing_p handling and the
   `offset` declaration are missing.  */
17960 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17962 HOST_WIDE_INT location
;
17965 /* The first element always refers to the first vector.
17966 Check if the extracted indices are increasing by one. */
17967 if (d
->vec_flags
== VEC_SVE_PRED
17968 || !d
->perm
[0].is_constant (&location
)
17969 || !d
->perm
.series_p (0, 1, location
, 1))
17976 /* The case where (location == 0) is a no-op for both big- and little-endian,
17977 and is removed by the mid-end at optimization levels -O1 and higher.
17979 We don't need a big-endian lane correction for SVE; see the comment
17980 at the head of aarch64-sve.md for details. */
17981 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17983 /* After setup, we want the high elements of the first vector (stored
17984 at the LSB end of the register), and the low elements of the second
17985 vector (stored at the MSB end of the register). So swap. */
17986 std::swap (d
->op0
, d
->op1
);
17987 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17988 to_constant () is safe since this is restricted to Advanced SIMD
17990 location
= d
->perm
.length ().to_constant () - location
;
17993 offset
= GEN_INT (location
);
17994 emit_set_insn (d
->target
,
17995 gen_rtx_UNSPEC (d
->vmode
,
17996 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
18001 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
18002 within each 64-bit, 32-bit or 16-bit granule. */
/* NOTE(review): lossy extraction — returns, testing_p handling and some
   control-flow lines are missing.  Code byte-identical.  */
18005 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
18007 HOST_WIDE_INT diff
;
18008 unsigned int i
, size
, unspec
;
18009 machine_mode pred_mode
;
18011 if (d
->vec_flags
== VEC_SVE_PRED
18012 || !d
->one_vector_p
18013 || !d
->perm
[0].is_constant (&diff
))
/* SIZE is the byte width of the granule being reversed.  */
18016 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
18019 unspec
= UNSPEC_REV64
;
18020 pred_mode
= VNx2BImode
;
18022 else if (size
== 4)
18024 unspec
= UNSPEC_REV32
;
18025 pred_mode
= VNx4BImode
;
18027 else if (size
== 2)
18029 unspec
= UNSPEC_REV16
;
18030 pred_mode
= VNx8BImode
;
/* Verify the selector reverses each STEP-sized group.  */
18035 unsigned int step
= diff
+ 1;
18036 for (i
= 0; i
< step
; ++i
)
18037 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
18044 if (d
->vec_flags
== VEC_SVE_DATA
)
18046 machine_mode int_mode
= aarch64_sve_int_mode (pred_mode
);
18047 rtx target
= gen_reg_rtx (int_mode
);
18048 if (BYTES_BIG_ENDIAN
)
18049 /* The act of taking a subreg between INT_MODE and d->vmode
18050 is itself a reversing operation on big-endian targets;
18051 see the comment at the head of aarch64-sve.md for details.
18052 First reinterpret OP0 as INT_MODE without using a subreg
18053 and without changing the contents. */
18054 emit_insn (gen_aarch64_sve_reinterpret (int_mode
, target
, d
->op0
));
18057 /* For SVE we use REV[BHW] unspecs derived from the element size
18058 of v->mode and vector modes whose elements have SIZE bytes.
18059 This ensures that the vector modes match the predicate modes. */
18060 int unspec
= aarch64_sve_rev_unspec (d
->vmode
);
18061 rtx pred
= aarch64_ptrue_reg (pred_mode
);
18062 emit_insn (gen_aarch64_pred (unspec
, int_mode
, target
, pred
,
18063 gen_lowpart (int_mode
, d
->op0
)));
18065 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
/* Advanced SIMD path: plain REV unspec on the data mode.  */
18068 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
18069 emit_set_insn (d
->target
, src
);
18073 /* Recognize patterns for the REV insn, which reverses elements within
/* Whole-vector element reversal; only applies to single-input SVE data
   permutes.  NOTE(review): lossy extraction; returns/testing_p missing.  */
18077 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
18079 poly_uint64 nelt
= d
->perm
.length ();
18081 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
/* Selector must be {nelt-1, nelt-2, ..., 0}.  */
18084 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
18091 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
18092 emit_set_insn (d
->target
, src
);
/* Recognize a broadcast (DUP of one lane) permutation: every selector
   element equals d->perm[0].  NOTE(review): lossy extraction — `elt`,
   `lane`, `in0` declarations and returns are missing.  */
18097 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
18099 rtx out
= d
->target
;
18102 machine_mode vmode
= d
->vmode
;
18105 if (d
->vec_flags
== VEC_SVE_PRED
18106 || d
->perm
.encoding ().encoded_nelts () != 1
18107 || !d
->perm
[0].is_constant (&elt
))
/* SVE DUP-by-index is limited to 512-bit granules.  */
18110 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
18117 /* The generic preparation in aarch64_expand_vec_perm_const_1
18118 swaps the operand order and the permute indices if it finds
18119 d->perm[0] to be in the second operand. Thus, we can always
18120 use d->op0 and need not do any extra arithmetic to get the
18121 correct lane number. */
18123 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
18125 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
18126 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
18127 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
/* Fall-back Advanced SIMD expansion: materialize the constant selector as
   a byte vector and use TBL via aarch64_expand_vec_perm_1.
   NOTE(review): lossy extraction; returns/testing_p missing.  */
18132 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
18134 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
18135 machine_mode vmode
= d
->vmode
;
18137 /* Make sure that the indices are constant. */
18138 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
18139 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
18140 if (!d
->perm
[i
].is_constant ())
18146 /* Generic code will try constant permutation twice. Once with the
18147 original mode and again with the elements lowered to QImode.
18148 So wait and don't do the selector expansion ourselves. */
18149 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
18152 /* to_constant is safe since this routine is specific to Advanced SIMD
18154 unsigned int nelt
= d
->perm
.length ().to_constant ();
18155 for (unsigned int i
= 0; i
< nelt
; ++i
)
18156 /* If big-endian and two vectors we end up with a weird mixed-endian
18157 mode on NEON. Reverse the index within each word but not the word
18158 itself. to_constant is safe because we checked is_constant above. */
18159 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
18160 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
18161 : d
->perm
[i
].to_constant ());
18163 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
18164 sel
= force_reg (vmode
, sel
);
18166 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
18170 /* Try to implement D using an SVE TBL instruction. */
/* NOTE(review): lossy extraction; returns/testing_p missing.  */
18173 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
18175 unsigned HOST_WIDE_INT nelt
;
18177 /* Permuting two variable-length vectors could overflow the
18179 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
18185 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
18186 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
/* One input: a single TBL; two inputs: the full modulo/combine path.  */
18187 if (d
->one_vector_p
)
18188 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
18190 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
18194 /* Try to implement D using SVE SEL instruction. */
/* Match an element-wise blend (each output element comes from the same
   position of op0 or op1) and emit a predicated SEL/vcond_mask.
   NOTE(review): lossy extraction — returns, testing_p handling and part
   of the vec_flags condition are missing.  */
18197 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
18199 machine_mode vmode
= d
->vmode
;
18200 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
18202 if (d
->vec_flags
!= VEC_SVE_DATA
18206 int n_patterns
= d
->perm
.encoding ().npatterns ();
18207 poly_int64 vec_len
= d
->perm
.length ();
/* Leading elements must each pick lane i of op0 or op1.  */
18209 for (int i
= 0; i
< n_patterns
; ++i
)
18210 if (!known_eq (d
->perm
[i
], i
)
18211 && !known_eq (d
->perm
[i
], vec_len
+ i
))
/* The rest of the encoding must continue the same per-pattern choice.  */
18214 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
18215 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
18216 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
18222 machine_mode pred_mode
= aarch64_sve_pred_mode (unit_size
).require ();
/* Build the governing predicate: 1 selects op0's lane, 0 selects op1's.  */
18224 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
18225 for (int i
= 0; i
< n_patterns
* 2; i
++)
18227 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
18228 : CONST0_RTX (BImode
);
18229 builder
.quick_push (elem
);
18232 rtx const_vec
= builder
.build ();
18233 rtx pred
= force_reg (pred_mode
, const_vec
);
18234 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op1
, d
->op0
, pred
));
/* Central dispatcher: try each special-case expander in turn, falling back
   to TBL for Advanced SIMD / SVE data modes.  NOTE(review): lossy
   extraction; `return true` bodies and final `return false` missing.  */
18239 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
18241 /* The pattern matching functions above are written to look for a small
18242 number to begin the sequence (0, 1, N/2). If we begin with an index
18243 from the second operand, we can swap the operands. */
18244 poly_int64 nelt
= d
->perm
.length ();
18245 if (known_ge (d
->perm
[0], nelt
))
18247 d
->perm
.rotate_inputs (1);
18248 std::swap (d
->op0
, d
->op1
);
18251 if ((d
->vec_flags
== VEC_ADVSIMD
18252 || d
->vec_flags
== VEC_SVE_DATA
18253 || d
->vec_flags
== VEC_SVE_PRED
)
18254 && known_gt (nelt
, 1))
/* Order matters: cheaper single-instruction forms are tried first.  */
18256 if (aarch64_evpc_rev_local (d
))
18258 else if (aarch64_evpc_rev_global (d
))
18260 else if (aarch64_evpc_ext (d
))
18262 else if (aarch64_evpc_dup (d
))
18264 else if (aarch64_evpc_zip (d
))
18266 else if (aarch64_evpc_uzp (d
))
18268 else if (aarch64_evpc_trn (d
))
18270 else if (aarch64_evpc_sel (d
))
18272 if (d
->vec_flags
== VEC_SVE_DATA
)
18273 return aarch64_evpc_sve_tbl (d
);
18274 else if (d
->vec_flags
== VEC_ADVSIMD
)
18275 return aarch64_evpc_tbl (d
);
18280 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
/* Target-hook entry point: set up an expand_vec_perm_d from SEL and either
   test feasibility (TARGET == NULL) or emit the permute.  NOTE(review):
   lossy extraction — assignments of d.vmode/d.target/d.op0/d.op1, the
   op-canonicalization and delete_insns_since on failure are missing.  */
18283 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
18284 rtx op1
, const vec_perm_indices
&sel
)
18286 struct expand_vec_perm_d d
;
18288 /* Check whether the mask can be applied to a single vector. */
18289 if (sel
.ninputs () == 1
18290 || (op0
&& rtx_equal_p (op0
, op1
)))
18291 d
.one_vector_p
= true;
18292 else if (sel
.all_from_input_p (0))
18294 d
.one_vector_p
= true;
18297 else if (sel
.all_from_input_p (1))
18299 d
.one_vector_p
= true;
18303 d
.one_vector_p
= false;
18305 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
18306 sel
.nelts_per_input ());
18308 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
/* A null TARGET means "can this permutation be done?" only.  */
18312 d
.testing_p
= !target
;
18315 return aarch64_expand_vec_perm_const_1 (&d
);
18317 rtx_insn
*last
= get_last_insn ();
18318 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
18319 gcc_assert (last
== get_last_insn ());
18324 /* Generate a byte permute mask for a register of mode MODE,
18325 which has NUNITS units. */
/* Big-endian only: build the V16QI constant whose bytes reverse each
   element of MODE in place.  NOTE(review): lossy extraction — the return
   type, `i`/`j`/`mask` declarations and braces are missing.  */
18328 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
18330 /* We have to reverse each vector because we dont have
18331 a permuted load that can reverse-load according to ABI rules. */
18333 rtvec v
= rtvec_alloc (16);
18335 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
18337 gcc_assert (BYTES_BIG_ENDIAN
);
18338 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
/* Byte j of element i maps to byte (i+1)*usize - 1 - j.  */
18340 for (i
= 0; i
< nunits
; i
++)
18341 for (j
= 0; j
< usize
; j
++)
18342 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
18343 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
18344 return force_reg (V16QImode
, mask
);
18347 /* Expand an SVE integer comparison using the SVE equivalent of:
18349 (set TARGET (CODE OP0 OP1)). */
/* NOTE(review): lossy extraction — the op0/op1 arguments to
   aarch64_sve_emit_int_cmp are on a stripped line.  */
18352 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
18354 machine_mode pred_mode
= GET_MODE (target
);
18355 machine_mode data_mode
= GET_MODE (op0
);
18356 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
/* Copy the result in if the helper could not write TARGET directly.  */
18358 if (!rtx_equal_p (target
, res
))
18359 emit_move_insn (target
, res
);
18362 /* Return the UNSPEC_COND_* code for comparison CODE. */
/* NOTE(review): lossy extraction — the `switch (code)` and its `case`
   labels (NE/EQ/LT/GT/LE/GE/UNORDERED) are stripped; only the returns
   survive, in that order.  */
18364 static unsigned int
18365 aarch64_unspec_cond_code (rtx_code code
)
18370 return UNSPEC_COND_FCMNE
;
18372 return UNSPEC_COND_FCMEQ
;
18374 return UNSPEC_COND_FCMLT
;
18376 return UNSPEC_COND_FCMGT
;
18378 return UNSPEC_COND_FCMLE
;
18380 return UNSPEC_COND_FCMGE
;
18382 return UNSPEC_COND_FCMUO
;
/* Unhandled comparison codes are a bug in the caller.  */
18384 gcc_unreachable ();
18390 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18392 where <X> is the operation associated with comparison CODE.
18393 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18396 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
18397 bool known_ptrue_p
, rtx op0
, rtx op1
)
18399 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
18400 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
18401 gen_rtvec (4, pred
, flag
, op0
, op1
),
18402 aarch64_unspec_cond_code (code
));
18403 emit_set_insn (target
, unspec
);
18406 /* Emit the SVE equivalent of:
18408 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18409 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18410 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18412 where <Xi> is the operation associated with comparison CODEi.
18413 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18416 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
18417 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
18419 machine_mode pred_mode
= GET_MODE (pred
);
18420 rtx tmp1
= gen_reg_rtx (pred_mode
);
18421 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
18422 rtx tmp2
= gen_reg_rtx (pred_mode
);
18423 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
18424 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
18427 /* Emit the SVE equivalent of:
18429 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18430 (set TARGET (not TMP))
18432 where <X> is the operation associated with comparison CODE.
18433 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18436 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
18437 bool known_ptrue_p
, rtx op0
, rtx op1
)
18439 machine_mode pred_mode
= GET_MODE (pred
);
18440 rtx tmp
= gen_reg_rtx (pred_mode
);
18441 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
18442 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
18445 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18447 (set TARGET (CODE OP0 OP1))
18449 If CAN_INVERT_P is true, the caller can also handle inverted results;
18450 return true if the result is in fact inverted. */
18453 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
18454 rtx op0
, rtx op1
, bool can_invert_p
)
18456 machine_mode pred_mode
= GET_MODE (target
);
18457 machine_mode data_mode
= GET_MODE (op0
);
18459 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
18463 /* UNORDERED has no immediate form. */
18464 op1
= force_reg (data_mode
, op1
);
18473 /* There is native support for the comparison. */
18474 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18479 /* This is a trapping operation (LT or GT). */
18480 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
18484 if (!flag_trapping_math
)
18486 /* This would trap for signaling NaNs. */
18487 op1
= force_reg (data_mode
, op1
);
18488 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
18489 ptrue
, true, op0
, op1
);
18497 if (flag_trapping_math
)
18499 /* Work out which elements are ordered. */
18500 rtx ordered
= gen_reg_rtx (pred_mode
);
18501 op1
= force_reg (data_mode
, op1
);
18502 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
18503 ptrue
, true, op0
, op1
);
18505 /* Test the opposite condition for the ordered elements,
18506 then invert the result. */
18510 code
= reverse_condition_maybe_unordered (code
);
18513 aarch64_emit_sve_fp_cond (target
, code
,
18514 ordered
, false, op0
, op1
);
18517 aarch64_emit_sve_invert_fp_cond (target
, code
,
18518 ordered
, false, op0
, op1
);
18524 /* ORDERED has no immediate form. */
18525 op1
= force_reg (data_mode
, op1
);
18529 gcc_unreachable ();
18532 /* There is native support for the inverse comparison. */
18533 code
= reverse_condition_maybe_unordered (code
);
18536 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18539 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18543 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18544 of the data being selected and CMP_MODE is the mode of the values being
18548 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
18551 machine_mode pred_mode
18552 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
18553 GET_MODE_SIZE (cmp_mode
)).require ();
18554 rtx pred
= gen_reg_rtx (pred_mode
);
18555 if (FLOAT_MODE_P (cmp_mode
))
18557 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
18558 ops
[4], ops
[5], true))
18559 std::swap (ops
[1], ops
[2]);
18562 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
18564 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
18565 ops
[1] = force_reg (data_mode
, ops
[1]);
18566 /* The "false" value can only be zero if the "true" value is a constant. */
18567 if (register_operand (ops
[1], data_mode
)
18568 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
18569 ops
[2] = force_reg (data_mode
, ops
[2]);
18571 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
18572 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
18575 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18576 true. However due to issues with register allocation it is preferable
18577 to avoid tieing integer scalar and FP scalar modes. Executing integer
18578 operations in general registers is better than treating them as scalar
18579 vector operations. This reduces latency and avoids redundant int<->FP
18580 moves. So tie modes if they are either the same class, or vector modes
18581 with other vector modes, vector structs or any scalar mode. */
18584 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
18586 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
18589 /* We specifically want to allow elements of "structure" modes to
18590 be tieable to the structure. This more general condition allows
18591 other rarer situations too. The reason we don't extend this to
18592 predicate modes is that there are no predicate structure modes
18593 nor any specific instructions for extracting part of a predicate
18595 if (aarch64_vector_data_mode_p (mode1
)
18596 && aarch64_vector_data_mode_p (mode2
))
18599 /* Also allow any scalar modes with vectors. */
18600 if (aarch64_vector_mode_supported_p (mode1
)
18601 || aarch64_vector_mode_supported_p (mode2
))
18607 /* Return a new RTX holding the result of moving POINTER forward by
18611 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
18613 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
18615 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
18619 /* Return a new RTX holding the result of moving POINTER forward by the
18620 size of the mode it points to. */
18623 aarch64_progress_pointer (rtx pointer
)
18625 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
18628 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18632 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
18635 rtx reg
= gen_reg_rtx (mode
);
18637 /* "Cast" the pointers to the correct mode. */
18638 *src
= adjust_address (*src
, mode
, 0);
18639 *dst
= adjust_address (*dst
, mode
, 0);
18640 /* Emit the memcpy. */
18641 emit_move_insn (reg
, *src
);
18642 emit_move_insn (*dst
, reg
);
18643 /* Move the pointers forward. */
18644 *src
= aarch64_progress_pointer (*src
);
18645 *dst
= aarch64_progress_pointer (*dst
);
18648 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18649 we succeed, otherwise return false. */
18652 aarch64_expand_cpymem (rtx
*operands
)
18655 rtx dst
= operands
[0];
18656 rtx src
= operands
[1];
18658 machine_mode cur_mode
= BLKmode
, next_mode
;
18659 bool speed_p
= !optimize_function_for_size_p (cfun
);
18661 /* When optimizing for size, give a better estimate of the length of a
18662 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18663 will always require an even number of instructions to do now. And each
18664 operation requires both a load+store, so devide the max number by 2. */
18665 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
18667 /* We can't do anything smart if the amount to copy is not constant. */
18668 if (!CONST_INT_P (operands
[2]))
18671 n
= INTVAL (operands
[2]);
18673 /* Try to keep the number of instructions low. For all cases we will do at
18674 most two moves for the residual amount, since we'll always overlap the
18676 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
18679 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
18680 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
18682 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
18683 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
18685 /* Convert n to bits to make the rest of the code simpler. */
18686 n
= n
* BITS_PER_UNIT
;
18688 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18689 larger than TImode, but we should not use them for loads/stores here. */
18690 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
18694 /* Find the largest mode in which to do the copy in without over reading
18696 opt_scalar_int_mode mode_iter
;
18697 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
18698 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
18699 cur_mode
= mode_iter
.require ();
18701 gcc_assert (cur_mode
!= BLKmode
);
18703 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
18704 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
18708 /* Do certain trailing copies as overlapping if it's going to be
18709 cheaper. i.e. less instructions to do so. For instance doing a 15
18710 byte copy it's more efficient to do two overlapping 8 byte copies than
18712 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
18714 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
18715 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
18716 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
18717 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
18725 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18726 SImode stores. Handle the case when the constant has identical
18727 bottom and top halves. This is beneficial when the two stores can be
18728 merged into an STP and we avoid synthesising potentially expensive
18729 immediates twice. Return true if such a split is possible. */
18732 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
18734 rtx lo
= gen_lowpart (SImode
, src
);
18735 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
18737 bool size_p
= optimize_function_for_size_p (cfun
);
18739 if (!rtx_equal_p (lo
, hi
))
18742 unsigned int orig_cost
18743 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
18744 unsigned int lo_cost
18745 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
18747 /* We want to transform:
18749 MOVK x1, 0x140, lsl 16
18750 MOVK x1, 0xc0da, lsl 32
18751 MOVK x1, 0x140, lsl 48
18755 MOVK w1, 0x140, lsl 16
18757 So we want to perform this only when we save two instructions
18758 or more. When optimizing for size, however, accept any code size
18760 if (size_p
&& orig_cost
<= lo_cost
)
18764 && (orig_cost
<= lo_cost
+ 1))
18767 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
18768 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
18771 rtx tmp_reg
= gen_reg_rtx (SImode
);
18772 aarch64_expand_mov_immediate (tmp_reg
, lo
);
18773 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
18774 /* Don't emit an explicit store pair as this may not be always profitable.
18775 Let the sched-fusion logic decide whether to merge them. */
18776 emit_move_insn (mem_lo
, tmp_reg
);
18777 emit_move_insn (mem_hi
, tmp_reg
);
18782 /* Generate RTL for a conditional branch with rtx comparison CODE in
18783 mode CC_MODE. The destination of the unlikely conditional branch
18787 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
18791 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
18792 gen_rtx_REG (cc_mode
, CC_REGNUM
),
18795 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18796 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
18798 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18801 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18803 OP1 represents the TImode destination operand 1
18804 OP2 represents the TImode destination operand 2
18805 LOW_DEST represents the low half (DImode) of TImode operand 0
18806 LOW_IN1 represents the low half (DImode) of TImode operand 1
18807 LOW_IN2 represents the low half (DImode) of TImode operand 2
18808 HIGH_DEST represents the high half (DImode) of TImode operand 0
18809 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18810 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18813 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18814 rtx
*low_in1
, rtx
*low_in2
,
18815 rtx
*high_dest
, rtx
*high_in1
,
18818 *low_dest
= gen_reg_rtx (DImode
);
18819 *low_in1
= gen_lowpart (DImode
, op1
);
18820 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18821 subreg_lowpart_offset (DImode
, TImode
));
18822 *high_dest
= gen_reg_rtx (DImode
);
18823 *high_in1
= gen_highpart (DImode
, op1
);
18824 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18825 subreg_highpart_offset (DImode
, TImode
));
18828 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18830 This function differs from 'arch64_addti_scratch_regs' in that
18831 OP1 can be an immediate constant (zero). We must call
18832 subreg_highpart_offset with DImode and TImode arguments, otherwise
18833 VOIDmode will be used for the const_int which generates an internal
18834 error from subreg_size_highpart_offset which does not expect a size of zero.
18836 OP1 represents the TImode destination operand 1
18837 OP2 represents the TImode destination operand 2
18838 LOW_DEST represents the low half (DImode) of TImode operand 0
18839 LOW_IN1 represents the low half (DImode) of TImode operand 1
18840 LOW_IN2 represents the low half (DImode) of TImode operand 2
18841 HIGH_DEST represents the high half (DImode) of TImode operand 0
18842 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18843 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18847 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18848 rtx
*low_in1
, rtx
*low_in2
,
18849 rtx
*high_dest
, rtx
*high_in1
,
18852 *low_dest
= gen_reg_rtx (DImode
);
18853 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18854 subreg_lowpart_offset (DImode
, TImode
));
18856 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18857 subreg_lowpart_offset (DImode
, TImode
));
18858 *high_dest
= gen_reg_rtx (DImode
);
18860 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18861 subreg_highpart_offset (DImode
, TImode
));
18862 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18863 subreg_highpart_offset (DImode
, TImode
));
18866 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18868 OP0 represents the TImode destination operand 0
18869 LOW_DEST represents the low half (DImode) of TImode operand 0
18870 LOW_IN1 represents the low half (DImode) of TImode operand 1
18871 LOW_IN2 represents the low half (DImode) of TImode operand 2
18872 HIGH_DEST represents the high half (DImode) of TImode operand 0
18873 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18874 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18875 UNSIGNED_P is true if the operation is being performed on unsigned
18878 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
18879 rtx low_in2
, rtx high_dest
, rtx high_in1
,
18880 rtx high_in2
, bool unsigned_p
)
18882 if (low_in2
== const0_rtx
)
18884 low_dest
= low_in1
;
18885 high_in2
= force_reg (DImode
, high_in2
);
18887 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
18889 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
18893 if (CONST_INT_P (low_in2
))
18895 high_in2
= force_reg (DImode
, high_in2
);
18896 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
18897 GEN_INT (-INTVAL (low_in2
))));
18900 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
18903 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
18905 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
18908 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
18909 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
18913 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18915 static unsigned HOST_WIDE_INT
18916 aarch64_asan_shadow_offset (void)
18919 return (HOST_WIDE_INT_1
<< 29);
18921 return (HOST_WIDE_INT_1
<< 36);
18925 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
18926 int code
, tree treeop0
, tree treeop1
)
18928 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18930 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18932 struct expand_operand ops
[4];
18935 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18937 op_mode
= GET_MODE (op0
);
18938 if (op_mode
== VOIDmode
)
18939 op_mode
= GET_MODE (op1
);
18947 icode
= CODE_FOR_cmpsi
;
18952 icode
= CODE_FOR_cmpdi
;
18957 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18958 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
18963 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18964 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
18972 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
18973 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
18979 *prep_seq
= get_insns ();
18982 create_fixed_operand (&ops
[0], op0
);
18983 create_fixed_operand (&ops
[1], op1
);
18986 if (!maybe_expand_insn (icode
, 2, ops
))
18991 *gen_seq
= get_insns ();
18994 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
18995 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
18999 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
19000 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
19002 rtx op0
, op1
, target
;
19003 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
19004 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
19006 struct expand_operand ops
[6];
19009 push_to_sequence (*prep_seq
);
19010 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
19012 op_mode
= GET_MODE (op0
);
19013 if (op_mode
== VOIDmode
)
19014 op_mode
= GET_MODE (op1
);
19022 icode
= CODE_FOR_ccmpsi
;
19027 icode
= CODE_FOR_ccmpdi
;
19032 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
19033 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
19038 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
19039 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
19047 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
19048 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
19054 *prep_seq
= get_insns ();
19057 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
19058 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
19060 if (bit_code
!= AND
)
19062 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
19063 GET_MODE (XEXP (prev
, 0))),
19064 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
19065 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
19068 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
19069 create_fixed_operand (&ops
[1], target
);
19070 create_fixed_operand (&ops
[2], op0
);
19071 create_fixed_operand (&ops
[3], op1
);
19072 create_fixed_operand (&ops
[4], prev
);
19073 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
19075 push_to_sequence (*gen_seq
);
19076 if (!maybe_expand_insn (icode
, 6, ops
))
19082 *gen_seq
= get_insns ();
19085 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
19088 #undef TARGET_GEN_CCMP_FIRST
19089 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19091 #undef TARGET_GEN_CCMP_NEXT
19092 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19094 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19095 instruction fusion of some sort. */
19098 aarch64_macro_fusion_p (void)
19100 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
19104 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19105 should be kept together during scheduling. */
19108 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
19111 rtx prev_set
= single_set (prev
);
19112 rtx curr_set
= single_set (curr
);
19113 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
19114 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
19116 if (!aarch64_macro_fusion_p ())
19119 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
19121 /* We are trying to match:
19122 prev (mov) == (set (reg r0) (const_int imm16))
19123 curr (movk) == (set (zero_extract (reg r0)
19126 (const_int imm16_1)) */
19128 set_dest
= SET_DEST (curr_set
);
19130 if (GET_CODE (set_dest
) == ZERO_EXTRACT
19131 && CONST_INT_P (SET_SRC (curr_set
))
19132 && CONST_INT_P (SET_SRC (prev_set
))
19133 && CONST_INT_P (XEXP (set_dest
, 2))
19134 && INTVAL (XEXP (set_dest
, 2)) == 16
19135 && REG_P (XEXP (set_dest
, 0))
19136 && REG_P (SET_DEST (prev_set
))
19137 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
19143 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
19146 /* We're trying to match:
19147 prev (adrp) == (set (reg r1)
19148 (high (symbol_ref ("SYM"))))
19149 curr (add) == (set (reg r0)
19151 (symbol_ref ("SYM"))))
19152 Note that r0 need not necessarily be the same as r1, especially
19153 during pre-regalloc scheduling. */
19155 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
19156 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
19158 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
19159 && REG_P (XEXP (SET_SRC (curr_set
), 0))
19160 && REGNO (XEXP (SET_SRC (curr_set
), 0))
19161 == REGNO (SET_DEST (prev_set
))
19162 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
19163 XEXP (SET_SRC (curr_set
), 1)))
19168 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
19171 /* We're trying to match:
19172 prev (movk) == (set (zero_extract (reg r0)
19175 (const_int imm16_1))
19176 curr (movk) == (set (zero_extract (reg r0)
19179 (const_int imm16_2)) */
19181 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
19182 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
19183 && REG_P (XEXP (SET_DEST (prev_set
), 0))
19184 && REG_P (XEXP (SET_DEST (curr_set
), 0))
19185 && REGNO (XEXP (SET_DEST (prev_set
), 0))
19186 == REGNO (XEXP (SET_DEST (curr_set
), 0))
19187 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
19188 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
19189 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
19190 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
19191 && CONST_INT_P (SET_SRC (prev_set
))
19192 && CONST_INT_P (SET_SRC (curr_set
)))
19196 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
19198 /* We're trying to match:
19199 prev (adrp) == (set (reg r0)
19200 (high (symbol_ref ("SYM"))))
19201 curr (ldr) == (set (reg r1)
19202 (mem (lo_sum (reg r0)
19203 (symbol_ref ("SYM")))))
19205 curr (ldr) == (set (reg r1)
19208 (symbol_ref ("SYM")))))) */
19209 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
19210 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
19212 rtx curr_src
= SET_SRC (curr_set
);
19214 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
19215 curr_src
= XEXP (curr_src
, 0);
19217 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
19218 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
19219 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
19220 == REGNO (SET_DEST (prev_set
))
19221 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
19222 XEXP (SET_SRC (prev_set
), 0)))
19227 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
19228 && any_condjump_p (curr
))
19230 unsigned int condreg1
, condreg2
;
19232 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
19233 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
19235 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
19237 && modified_in_p (cc_reg_1
, prev
))
19239 enum attr_type prev_type
= get_attr_type (prev
);
19241 /* FIXME: this misses some which is considered simple arthematic
19242 instructions for ThunderX. Simple shifts are missed here. */
19243 if (prev_type
== TYPE_ALUS_SREG
19244 || prev_type
== TYPE_ALUS_IMM
19245 || prev_type
== TYPE_LOGICS_REG
19246 || prev_type
== TYPE_LOGICS_IMM
)
19253 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
19254 && any_condjump_p (curr
))
19256 /* We're trying to match:
19257 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
19258 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19260 (label_ref ("SYM"))
19262 if (SET_DEST (curr_set
) == (pc_rtx
)
19263 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
19264 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
19265 && REG_P (SET_DEST (prev_set
))
19266 && REGNO (SET_DEST (prev_set
))
19267 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
19269 /* Fuse ALU operations followed by conditional branch instruction. */
19270 switch (get_attr_type (prev
))
19273 case TYPE_ALU_SREG
:
19276 case TYPE_ADCS_REG
:
19277 case TYPE_ADCS_IMM
:
19278 case TYPE_LOGIC_REG
:
19279 case TYPE_LOGIC_IMM
:
19283 case TYPE_SHIFT_REG
:
19284 case TYPE_SHIFT_IMM
:
19299 /* Return true iff the instruction fusion described by OP is enabled. */
19302 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
19304 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
19307 /* If MEM is in the form of [base+offset], extract the two parts
19308 of address and set to BASE and OFFSET, otherwise return false
19309 after clearing BASE and OFFSET. */
19312 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
19316 gcc_assert (MEM_P (mem
));
19318 addr
= XEXP (mem
, 0);
19323 *offset
= const0_rtx
;
19327 if (GET_CODE (addr
) == PLUS
19328 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
19330 *base
= XEXP (addr
, 0);
19331 *offset
= XEXP (addr
, 1);
19336 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,	/* Not a fusible load/store.  */
  SCHED_FUSION_LD_SIGN_EXTEND,	/* Sign-extending load.  */
  SCHED_FUSION_LD_ZERO_EXTEND,	/* Zero-extending load.  */
  SCHED_FUSION_LD,		/* Plain load.  */
  SCHED_FUSION_ST,		/* Store.  */
  SCHED_FUSION_NUM		/* Number of fusion types.  */
};
19352 /* If INSN is a load or store of address in the form of [base+offset],
19353 extract the two parts and set to BASE and OFFSET. Return scheduling
19354 fusion type this INSN is. */
19356 static enum sched_fusion_type
19357 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
19360 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
19362 gcc_assert (INSN_P (insn
));
19363 x
= PATTERN (insn
);
19364 if (GET_CODE (x
) != SET
)
19365 return SCHED_FUSION_NONE
;
19368 dest
= SET_DEST (x
);
19370 machine_mode dest_mode
= GET_MODE (dest
);
19372 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
19373 return SCHED_FUSION_NONE
;
19375 if (GET_CODE (src
) == SIGN_EXTEND
)
19377 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
19378 src
= XEXP (src
, 0);
19379 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
19380 return SCHED_FUSION_NONE
;
19382 else if (GET_CODE (src
) == ZERO_EXTEND
)
19384 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
19385 src
= XEXP (src
, 0);
19386 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
19387 return SCHED_FUSION_NONE
;
19390 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
19391 extract_base_offset_in_addr (src
, base
, offset
);
19392 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
19394 fusion
= SCHED_FUSION_ST
;
19395 extract_base_offset_in_addr (dest
, base
, offset
);
19398 return SCHED_FUSION_NONE
;
19400 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
19401 fusion
= SCHED_FUSION_NONE
;
19406 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19408 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
19409 and PRI are only calculated for these instructions. For other instruction,
19410 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19411 type instruction fusion can be added by returning different priorities.
19413 It's important that irrelevant instructions get the largest FUSION_PRI. */
19416 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
19417 int *fusion_pri
, int *pri
)
19421 enum sched_fusion_type fusion
;
19423 gcc_assert (INSN_P (insn
));
19426 fusion
= fusion_load_store (insn
, &base
, &offset
);
19427 if (fusion
== SCHED_FUSION_NONE
)
19434 /* Set FUSION_PRI according to fusion type and base register. */
19435 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
19437 /* Calculate PRI. */
19440 /* INSN with smaller offset goes first. */
19441 off_val
= (int)(INTVAL (offset
));
19443 tmp
-= (off_val
& 0xfffff);
19445 tmp
+= ((- off_val
) & 0xfffff);
19451 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19452 Adjust priority of sha1h instructions so they are scheduled before
19453 other SHA1 instructions. */
19456 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
19458 rtx x
= PATTERN (insn
);
19460 if (GET_CODE (x
) == SET
)
19464 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
19465 return priority
+ 10;
19471 /* Given OPERANDS of consecutive load/store, check if we can merge
19472 them into ldp/stp. LOAD is true if they are load instructions.
19473 MODE is the mode of memory operands. */
19476 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
19479 HOST_WIDE_INT offval_1
, offval_2
, msize
;
19480 enum reg_class rclass_1
, rclass_2
;
19481 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
19485 mem_1
= operands
[1];
19486 mem_2
= operands
[3];
19487 reg_1
= operands
[0];
19488 reg_2
= operands
[2];
19489 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
19490 if (REGNO (reg_1
) == REGNO (reg_2
))
19495 mem_1
= operands
[0];
19496 mem_2
= operands
[2];
19497 reg_1
= operands
[1];
19498 reg_2
= operands
[3];
19501 /* The mems cannot be volatile. */
19502 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
19505 /* If we have SImode and slow unaligned ldp,
19506 check the alignment to be at least 8 byte. */
19508 && (aarch64_tune_params
.extra_tuning_flags
19509 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19511 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
19514 /* Check if the addresses are in the form of [base+offset]. */
19515 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19516 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
19518 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19519 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
19522 /* Check if the bases are same. */
19523 if (!rtx_equal_p (base_1
, base_2
))
19526 /* The operands must be of the same size. */
19527 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
19528 GET_MODE_SIZE (GET_MODE (mem_2
))));
19530 offval_1
= INTVAL (offset_1
);
19531 offval_2
= INTVAL (offset_2
);
19532 /* We should only be trying this for fixed-sized modes. There is no
19533 SVE LDP/STP instruction. */
19534 msize
= GET_MODE_SIZE (mode
).to_constant ();
19535 /* Check if the offsets are consecutive. */
19536 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
19539 /* Check if the addresses are clobbered by load. */
19542 if (reg_mentioned_p (reg_1
, mem_1
))
19545 /* In increasing order, the last load can clobber the address. */
19546 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
19550 /* One of the memory accesses must be a mempair operand.
19551 If it is not the first one, they need to be swapped by the
19553 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
19554 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
19557 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
19558 rclass_1
= FP_REGS
;
19560 rclass_1
= GENERAL_REGS
;
19562 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
19563 rclass_2
= FP_REGS
;
19565 rclass_2
= GENERAL_REGS
;
19567 /* Check if the registers are of same class. */
19568 if (rclass_1
!= rclass_2
)
19574 /* Given OPERANDS of consecutive load/store that can be merged,
19575 swap them if they are not in ascending order. */
19577 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
19579 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
19580 HOST_WIDE_INT offval_1
, offval_2
;
19584 mem_1
= operands
[1];
19585 mem_2
= operands
[3];
19589 mem_1
= operands
[0];
19590 mem_2
= operands
[2];
19593 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19594 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19596 offval_1
= INTVAL (offset_1
);
19597 offval_2
= INTVAL (offset_2
);
19599 if (offval_1
> offval_2
)
19601 /* Irrespective of whether this is a load or a store,
19602 we do the same swap. */
19603 std::swap (operands
[0], operands
[2]);
19604 std::swap (operands
[1], operands
[3]);
19608 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19609 comparison between the two. */
19611 aarch64_host_wide_int_compare (const void *x
, const void *y
)
19613 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
19614 * ((const HOST_WIDE_INT
*) y
));
19617 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19618 other pointing to a REG rtx containing an offset, compare the offsets
19623 1 iff offset (X) > offset (Y)
19624 0 iff offset (X) == offset (Y)
19625 -1 iff offset (X) < offset (Y) */
19627 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
19629 const rtx
* operands_1
= (const rtx
*) x
;
19630 const rtx
* operands_2
= (const rtx
*) y
;
19631 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
19633 if (MEM_P (operands_1
[0]))
19634 mem_1
= operands_1
[0];
19636 mem_1
= operands_1
[1];
19638 if (MEM_P (operands_2
[0]))
19639 mem_2
= operands_2
[0];
19641 mem_2
= operands_2
[1];
19643 /* Extract the offsets. */
19644 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19645 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
19647 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
19649 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
19652 /* Given OPERANDS of consecutive load/store, check if we can merge
19653 them into ldp/stp by adjusting the offset. LOAD is true if they
19654 are load instructions. MODE is the mode of memory operands.
19656 Given below consecutive stores:
19658 str w1, [xb, 0x100]
19659 str w1, [xb, 0x104]
19660 str w1, [xb, 0x108]
19661 str w1, [xb, 0x10c]
19663 Though the offsets are out of the range supported by stp, we can
19664 still pair them after adjusting the offset, like:
19666 add scratch, xb, 0x100
19667 stp w1, w1, [scratch]
19668 stp w1, w1, [scratch, 0x8]
19670 The peephole patterns detecting this opportunity should guarantee
19671 the scratch register is avaliable. */
19674 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
19677 const int num_insns
= 4;
19678 enum reg_class rclass
;
19679 HOST_WIDE_INT offvals
[num_insns
], msize
;
19680 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
19684 for (int i
= 0; i
< num_insns
; i
++)
19686 reg
[i
] = operands
[2 * i
];
19687 mem
[i
] = operands
[2 * i
+ 1];
19689 gcc_assert (REG_P (reg
[i
]));
19692 /* Do not attempt to merge the loads if the loads clobber each other. */
19693 for (int i
= 0; i
< 8; i
+= 2)
19694 for (int j
= i
+ 2; j
< 8; j
+= 2)
19695 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
19699 for (int i
= 0; i
< num_insns
; i
++)
19701 mem
[i
] = operands
[2 * i
];
19702 reg
[i
] = operands
[2 * i
+ 1];
19705 /* Skip if memory operand is by itself valid for ldp/stp. */
19706 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
19709 for (int i
= 0; i
< num_insns
; i
++)
19711 /* The mems cannot be volatile. */
19712 if (MEM_VOLATILE_P (mem
[i
]))
19715 /* Check if the addresses are in the form of [base+offset]. */
19716 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
19717 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
19721 /* Check if the registers are of same class. */
19722 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
19723 ? FP_REGS
: GENERAL_REGS
;
19725 for (int i
= 1; i
< num_insns
; i
++)
19726 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
19728 if (rclass
!= FP_REGS
)
19733 if (rclass
!= GENERAL_REGS
)
19737 /* Only the last register in the order in which they occur
19738 may be clobbered by the load. */
19739 if (rclass
== GENERAL_REGS
&& load
)
19740 for (int i
= 0; i
< num_insns
- 1; i
++)
19741 if (reg_mentioned_p (reg
[i
], mem
[i
]))
19744 /* Check if the bases are same. */
19745 for (int i
= 0; i
< num_insns
- 1; i
++)
19746 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
19749 for (int i
= 0; i
< num_insns
; i
++)
19750 offvals
[i
] = INTVAL (offset
[i
]);
19752 msize
= GET_MODE_SIZE (mode
);
19754 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19755 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
19756 aarch64_host_wide_int_compare
);
19758 if (!(offvals
[1] == offvals
[0] + msize
19759 && offvals
[3] == offvals
[2] + msize
))
19762 /* Check that offsets are within range of each other. The ldp/stp
19763 instructions have 7 bit immediate offsets, so use 0x80. */
19764 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
19767 /* The offsets must be aligned with respect to each other. */
19768 if (offvals
[0] % msize
!= offvals
[2] % msize
)
19771 /* If we have SImode and slow unaligned ldp,
19772 check the alignment to be at least 8 byte. */
19774 && (aarch64_tune_params
.extra_tuning_flags
19775 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19777 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
19783 /* Given OPERANDS of consecutive load/store, this function pairs them
19784 into LDP/STP after adjusting the offset. It depends on the fact
19785 that the operands can be sorted so the offsets are correct for STP.
19786 MODE is the mode of memory operands. CODE is the rtl operator
19787 which should be applied to all memory operands, it's SIGN_EXTEND,
19788 ZERO_EXTEND or UNKNOWN. */
19791 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
19792 scalar_mode mode
, RTX_CODE code
)
19794 rtx base
, offset_1
, offset_3
, t1
, t2
;
19795 rtx mem_1
, mem_2
, mem_3
, mem_4
;
19796 rtx temp_operands
[8];
19797 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
19798 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
19800 /* We make changes on a copy as we may still bail out. */
19801 for (int i
= 0; i
< 8; i
++)
19802 temp_operands
[i
] = operands
[i
];
19804 /* Sort the operands. */
19805 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
19807 /* Copy the memory operands so that if we have to bail for some
19808 reason the original addresses are unchanged. */
19811 mem_1
= copy_rtx (temp_operands
[1]);
19812 mem_2
= copy_rtx (temp_operands
[3]);
19813 mem_3
= copy_rtx (temp_operands
[5]);
19814 mem_4
= copy_rtx (temp_operands
[7]);
19818 mem_1
= copy_rtx (temp_operands
[0]);
19819 mem_2
= copy_rtx (temp_operands
[2]);
19820 mem_3
= copy_rtx (temp_operands
[4]);
19821 mem_4
= copy_rtx (temp_operands
[6]);
19822 gcc_assert (code
== UNKNOWN
);
19825 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19826 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
19827 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
19828 && offset_3
!= NULL_RTX
);
19830 /* Adjust offset so it can fit in LDP/STP instruction. */
19831 msize
= GET_MODE_SIZE (mode
);
19832 stp_off_upper_limit
= msize
* (0x40 - 1);
19833 stp_off_lower_limit
= - msize
* 0x40;
19835 off_val_1
= INTVAL (offset_1
);
19836 off_val_3
= INTVAL (offset_3
);
19838 /* The base offset is optimally half way between the two STP/LDP offsets. */
19840 base_off
= (off_val_1
+ off_val_3
) / 2;
19842 /* However, due to issues with negative LDP/STP offset generation for
19843 larger modes, for DF, DI and vector modes. we must not use negative
19844 addresses smaller than 9 signed unadjusted bits can store. This
19845 provides the most range in this case. */
19846 base_off
= off_val_1
;
19848 /* Adjust the base so that it is aligned with the addresses but still
19850 if (base_off
% msize
!= off_val_1
% msize
)
19851 /* Fix the offset, bearing in mind we want to make it bigger not
19853 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19854 else if (msize
<= 4)
19855 /* The negative range of LDP/STP is one larger than the positive range. */
19858 /* Check if base offset is too big or too small. We can attempt to resolve
19859 this issue by setting it to the maximum value and seeing if the offsets
19861 if (base_off
>= 0x1000)
19863 base_off
= 0x1000 - 1;
19864 /* We must still make sure that the base offset is aligned with respect
19865 to the address. But it may may not be made any bigger. */
19866 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19869 /* Likewise for the case where the base is too small. */
19870 if (base_off
<= -0x1000)
19872 base_off
= -0x1000 + 1;
19873 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19876 /* Offset of the first STP/LDP. */
19877 new_off_1
= off_val_1
- base_off
;
19879 /* Offset of the second STP/LDP. */
19880 new_off_3
= off_val_3
- base_off
;
19882 /* The offsets must be within the range of the LDP/STP instructions. */
19883 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
19884 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
19887 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
19889 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
19890 new_off_1
+ msize
), true);
19891 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
19893 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
19894 new_off_3
+ msize
), true);
19896 if (!aarch64_mem_pair_operand (mem_1
, mode
)
19897 || !aarch64_mem_pair_operand (mem_3
, mode
))
19900 if (code
== ZERO_EXTEND
)
19902 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
19903 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
19904 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
19905 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
19907 else if (code
== SIGN_EXTEND
)
19909 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
19910 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
19911 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
19912 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
19917 operands
[0] = temp_operands
[0];
19918 operands
[1] = mem_1
;
19919 operands
[2] = temp_operands
[2];
19920 operands
[3] = mem_2
;
19921 operands
[4] = temp_operands
[4];
19922 operands
[5] = mem_3
;
19923 operands
[6] = temp_operands
[6];
19924 operands
[7] = mem_4
;
19928 operands
[0] = mem_1
;
19929 operands
[1] = temp_operands
[1];
19930 operands
[2] = mem_2
;
19931 operands
[3] = temp_operands
[3];
19932 operands
[4] = mem_3
;
19933 operands
[5] = temp_operands
[5];
19934 operands
[6] = mem_4
;
19935 operands
[7] = temp_operands
[7];
19938 /* Emit adjusting instruction. */
19939 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
19940 /* Emit ldp/stp instructions. */
19941 t1
= gen_rtx_SET (operands
[0], operands
[1]);
19942 t2
= gen_rtx_SET (operands
[2], operands
[3]);
19943 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19944 t1
= gen_rtx_SET (operands
[4], operands
[5]);
19945 t2
= gen_rtx_SET (operands
[6], operands
[7]);
19946 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
19960 /* Return 1 if pseudo register should be created and used to hold
19961 GOT address for PIC code. */
19964 aarch64_use_pseudo_pic_reg (void)
19966 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
19969 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19972 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
19974 switch (XINT (x
, 1))
19976 case UNSPEC_GOTSMALLPIC
:
19977 case UNSPEC_GOTSMALLPIC28K
:
19978 case UNSPEC_GOTTINYPIC
:
19984 return default_unspec_may_trap_p (x
, flags
);
19988 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19989 return the log2 of that value. Otherwise return -1. */
19992 aarch64_fpconst_pow_of_2 (rtx x
)
19994 const REAL_VALUE_TYPE
*r
;
19996 if (!CONST_DOUBLE_P (x
))
19999 r
= CONST_DOUBLE_REAL_VALUE (x
);
20001 if (REAL_VALUE_NEGATIVE (*r
)
20002 || REAL_VALUE_ISNAN (*r
)
20003 || REAL_VALUE_ISINF (*r
)
20004 || !real_isinteger (r
, DFmode
))
20007 return exact_log2 (real_to_integer (r
));
20010 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
20011 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
20012 return n. Otherwise return -1. */
20015 aarch64_fpconst_pow2_recip (rtx x
)
20017 REAL_VALUE_TYPE r0
;
20019 if (!CONST_DOUBLE_P (x
))
20022 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
20023 if (exact_real_inverse (DFmode
, &r0
)
20024 && !REAL_VALUE_NEGATIVE (r0
))
20026 int ret
= exact_log2 (real_to_integer (&r0
));
20027 if (ret
>= 1 && ret
<= 32)
20033 /* If X is a vector of equal CONST_DOUBLE values and that value is
20034 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20037 aarch64_vec_fpconst_pow_of_2 (rtx x
)
20040 if (GET_CODE (x
) != CONST_VECTOR
20041 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
20044 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
20047 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
20051 for (int i
= 1; i
< nelts
; i
++)
20052 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
20058 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20061 __fp16 always promotes through this hook.
20062 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20063 through the generic excess precision logic rather than here. */
20066 aarch64_promoted_type (const_tree t
)
20068 if (SCALAR_FLOAT_TYPE_P (t
)
20069 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
20070 return float_type_node
;
20075 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20078 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
20079 optimization_type opt_type
)
20084 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
20091 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20093 static unsigned int
20094 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
20097 /* Polynomial invariant 1 == (VG / 2) - 1. */
20098 gcc_assert (i
== 1);
20101 return AARCH64_DWARF_VG
;
20104 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
20105 if MODE is HFmode, and punt to the generic implementation otherwise. */
20108 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
20110 return (mode
== HFmode
20112 : default_libgcc_floating_mode_supported_p (mode
));
20115 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20116 if MODE is HFmode, and punt to the generic implementation otherwise. */
20119 aarch64_scalar_mode_supported_p (scalar_mode mode
)
20121 return (mode
== HFmode
20123 : default_scalar_mode_supported_p (mode
));
20126 /* Set the value of FLT_EVAL_METHOD.
20127 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20129 0: evaluate all operations and constants, whose semantic type has at
20130 most the range and precision of type float, to the range and
20131 precision of float; evaluate all other operations and constants to
20132 the range and precision of the semantic type;
20134 N, where _FloatN is a supported interchange floating type
20135 evaluate all operations and constants, whose semantic type has at
20136 most the range and precision of _FloatN type, to the range and
20137 precision of the _FloatN type; evaluate all other operations and
20138 constants to the range and precision of the semantic type;
20140 If we have the ARMv8.2-A extensions then we support _Float16 in native
20141 precision, so we should set this to 16. Otherwise, we support the type,
20142 but want to evaluate expressions in float precision, so set this to
20145 static enum flt_eval_method
20146 aarch64_excess_precision (enum excess_precision_type type
)
20150 case EXCESS_PRECISION_TYPE_FAST
:
20151 case EXCESS_PRECISION_TYPE_STANDARD
:
20152 /* We can calculate either in 16-bit range and precision or
20153 32-bit range and precision. Make that decision based on whether
20154 we have native support for the ARMv8.2-A 16-bit floating-point
20155 instructions or not. */
20156 return (TARGET_FP_F16INST
20157 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20158 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
20159 case EXCESS_PRECISION_TYPE_IMPLICIT
:
20160 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
20162 gcc_unreachable ();
20164 return FLT_EVAL_METHOD_UNPREDICTABLE
;
20167 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20168 scheduled for speculative execution. Reject the long-running division
20169 and square-root instructions. */
20172 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
20174 switch (get_attr_type (insn
))
20182 case TYPE_NEON_FP_SQRT_S
:
20183 case TYPE_NEON_FP_SQRT_D
:
20184 case TYPE_NEON_FP_SQRT_S_Q
:
20185 case TYPE_NEON_FP_SQRT_D_Q
:
20186 case TYPE_NEON_FP_DIV_S
:
20187 case TYPE_NEON_FP_DIV_D
:
20188 case TYPE_NEON_FP_DIV_S_Q
:
20189 case TYPE_NEON_FP_DIV_D_Q
:
20196 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20199 aarch64_compute_pressure_classes (reg_class
*classes
)
20202 classes
[i
++] = GENERAL_REGS
;
20203 classes
[i
++] = FP_REGS
;
20204 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20205 registers need to go in PR_LO_REGS at some point during their
20206 lifetime. Splitting it into two halves has the effect of making
20207 all predicates count against PR_LO_REGS, so that we try whenever
20208 possible to restrict the number of live predicates to 8. This
20209 greatly reduces the amount of spilling in certain loops. */
20210 classes
[i
++] = PR_LO_REGS
;
20211 classes
[i
++] = PR_HI_REGS
;
20215 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20218 aarch64_can_change_mode_class (machine_mode from
,
20219 machine_mode to
, reg_class_t
)
20221 if (BYTES_BIG_ENDIAN
)
20223 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
20224 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
20226 /* Don't allow changes between SVE data modes and non-SVE modes.
20227 See the comment at the head of aarch64-sve.md for details. */
20228 if (from_sve_p
!= to_sve_p
)
20231 /* Don't allow changes in element size: lane 0 of the new vector
20232 would not then be lane 0 of the old vector. See the comment
20233 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20236 In the worst case, this forces a register to be spilled in
20237 one mode and reloaded in the other, which handles the
20238 endianness correctly. */
20239 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
20245 /* Implement TARGET_EARLY_REMAT_MODES. */
20248 aarch64_select_early_remat_modes (sbitmap modes
)
20250 /* SVE values are not normally live across a call, so it should be
20251 worth doing early rematerialization even in VL-specific mode. */
20252 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
20253 if (aarch64_sve_mode_p ((machine_mode
) i
))
20254 bitmap_set_bit (modes
, i
);
20257 /* Override the default target speculation_safe_value. */
20259 aarch64_speculation_safe_value (machine_mode mode
,
20260 rtx result
, rtx val
, rtx failval
)
20262 /* Maybe we should warn if falling back to hard barriers. They are
20263 likely to be noticably more expensive than the alternative below. */
20264 if (!aarch64_track_speculation
)
20265 return default_speculation_safe_value (mode
, result
, val
, failval
);
20268 val
= copy_to_mode_reg (mode
, val
);
20270 if (!aarch64_reg_or_zero (failval
, mode
))
20271 failval
= copy_to_mode_reg (mode
, failval
);
20273 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
20277 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20278 Look into the tuning structure for an estimate.
20279 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20280 Advanced SIMD 128 bits. */
20282 static HOST_WIDE_INT
20283 aarch64_estimated_poly_value (poly_int64 val
)
20285 enum aarch64_sve_vector_bits_enum width_source
20286 = aarch64_tune_params
.sve_width
;
20288 /* If we still don't have an estimate, use the default. */
20289 if (width_source
== SVE_SCALABLE
)
20290 return default_estimated_poly_value (val
);
20292 HOST_WIDE_INT over_128
= width_source
- 128;
20293 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
20297 /* Return true for types that could be supported as SIMD return or
20301 supported_simd_type (tree t
)
20303 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
20305 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
20306 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
20311 /* Return true for types that currently are supported as SIMD return
20312 or argument types. */
20315 currently_supported_simd_type (tree t
, tree b
)
20317 if (COMPLEX_FLOAT_TYPE_P (t
))
20320 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
20323 return supported_simd_type (t
);
20326 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20329 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
20330 struct cgraph_simd_clone
*clonei
,
20331 tree base_type
, int num
)
20333 tree t
, ret_type
, arg_type
;
20334 unsigned int elt_bits
, vec_bits
, count
;
20339 if (clonei
->simdlen
20340 && (clonei
->simdlen
< 2
20341 || clonei
->simdlen
> 1024
20342 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
20344 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20345 "unsupported simdlen %d", clonei
->simdlen
);
20349 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
20350 if (TREE_CODE (ret_type
) != VOID_TYPE
20351 && !currently_supported_simd_type (ret_type
, base_type
))
20353 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
20354 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20355 "GCC does not currently support mixed size types "
20356 "for %<simd%> functions");
20357 else if (supported_simd_type (ret_type
))
20358 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20359 "GCC does not currently support return type %qT "
20360 "for %<simd%> functions", ret_type
);
20362 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20363 "unsupported return type %qT for %<simd%> functions",
20368 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
20370 arg_type
= TREE_TYPE (t
);
20372 if (!currently_supported_simd_type (arg_type
, base_type
))
20374 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
20375 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20376 "GCC does not currently support mixed size types "
20377 "for %<simd%> functions");
20379 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20380 "GCC does not currently support argument type %qT "
20381 "for %<simd%> functions", arg_type
);
20386 clonei
->vecsize_mangle
= 'n';
20387 clonei
->mask_mode
= VOIDmode
;
20388 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
20389 if (clonei
->simdlen
== 0)
20392 vec_bits
= (num
== 0 ? 64 : 128);
20393 clonei
->simdlen
= vec_bits
/ elt_bits
;
20398 vec_bits
= clonei
->simdlen
* elt_bits
;
20399 if (vec_bits
!= 64 && vec_bits
!= 128)
20401 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
20402 "GCC does not currently support simdlen %d for type %qT",
20403 clonei
->simdlen
, base_type
);
20407 clonei
->vecsize_int
= vec_bits
;
20408 clonei
->vecsize_float
= vec_bits
;
20412 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20415 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
20417 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20418 use the correct ABI. */
20420 tree t
= TREE_TYPE (node
->decl
);
20421 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
20422 TYPE_ATTRIBUTES (t
));
20425 /* Implement TARGET_SIMD_CLONE_USABLE. */
20428 aarch64_simd_clone_usable (struct cgraph_node
*node
)
20430 switch (node
->simdclone
->vecsize_mangle
)
20437 gcc_unreachable ();
20441 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20444 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
20446 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
20447 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
20452 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20454 static const char *
20455 aarch64_get_multilib_abi_name (void)
20457 if (TARGET_BIG_END
)
20458 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
20459 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
20462 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
20463 global variable based guard use the default else
20464 return a null tree. */
20466 aarch64_stack_protect_guard (void)
20468 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
20469 return default_stack_protect_guard ();
20474 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20475 section at the end if needed. */
20476 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20477 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20478 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20480 aarch64_file_end_indicate_exec_stack ()
20482 file_end_indicate_exec_stack ();
20484 unsigned feature_1_and
= 0;
20485 if (aarch64_bti_enabled ())
20486 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
20488 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
20489 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
20493 /* Generate .note.gnu.property section. */
20494 switch_to_section (get_section (".note.gnu.property",
20495 SECTION_NOTYPE
, NULL
));
20497 /* PT_NOTE header: namesz, descsz, type.
20498 namesz = 4 ("GNU\0")
20499 descsz = 16 (Size of the program property array)
20500 [(12 + padding) * Number of array elements]
20501 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20502 assemble_align (POINTER_SIZE
);
20503 assemble_integer (GEN_INT (4), 4, 32, 1);
20504 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
20505 assemble_integer (GEN_INT (5), 4, 32, 1);
20507 /* PT_NOTE name. */
20508 assemble_string ("GNU", 4);
20510 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20511 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20513 data = feature_1_and. */
20514 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
20515 assemble_integer (GEN_INT (4), 4, 32, 1);
20516 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
20518 /* Pad the size of the note to the required alignment. */
20519 assemble_align (POINTER_SIZE
);
20522 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20523 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20524 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20526 /* Target-specific selftests. */
20530 namespace selftest
{
20532 /* Selftest for the RTL loader.
20533 Verify that the RTL loader copes with a dump from
20534 print_rtx_function. This is essentially just a test that class
20535 function_reader can handle a real dump, but it also verifies
20536 that lookup_reg_by_dump_name correctly handles hard regs.
20537 The presence of hard reg names in the dump means that the test is
20538 target-specific, hence it is in this file. */
20541 aarch64_test_loading_full_dump ()
20543 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
20545 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
20547 rtx_insn
*insn_1
= get_insn_by_uid (1);
20548 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
20550 rtx_insn
*insn_15
= get_insn_by_uid (15);
20551 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
20552 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
20554 /* Verify crtl->return_rtx. */
20555 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
20556 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
20557 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
20560 /* Run all target-specific selftests. */
20563 aarch64_run_selftests (void)
20565 aarch64_test_loading_full_dump ();
20568 } // namespace selftest
20570 #endif /* #if CHECKING_P */
20572 #undef TARGET_STACK_PROTECT_GUARD
20573 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20575 #undef TARGET_ADDRESS_COST
20576 #define TARGET_ADDRESS_COST aarch64_address_cost
20578 /* This hook will determines whether unnamed bitfields affect the alignment
20579 of the containing structure. The hook returns true if the structure
20580 should inherit the alignment requirements of an unnamed bitfield's
20582 #undef TARGET_ALIGN_ANON_BITFIELD
20583 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20585 #undef TARGET_ASM_ALIGNED_DI_OP
20586 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20588 #undef TARGET_ASM_ALIGNED_HI_OP
20589 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20591 #undef TARGET_ASM_ALIGNED_SI_OP
20592 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20594 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20595 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20596 hook_bool_const_tree_hwi_hwi_const_tree_true
20598 #undef TARGET_ASM_FILE_START
20599 #define TARGET_ASM_FILE_START aarch64_start_file
20601 #undef TARGET_ASM_OUTPUT_MI_THUNK
20602 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20604 #undef TARGET_ASM_SELECT_RTX_SECTION
20605 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20607 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20608 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20610 #undef TARGET_BUILD_BUILTIN_VA_LIST
20611 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20613 #undef TARGET_CALLEE_COPIES
20614 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20616 #undef TARGET_CAN_ELIMINATE
20617 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20619 #undef TARGET_CAN_INLINE_P
20620 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20622 #undef TARGET_CANNOT_FORCE_CONST_MEM
20623 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20625 #undef TARGET_CASE_VALUES_THRESHOLD
20626 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20628 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20629 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20631 /* Only the least significant bit is used for initialization guard
20633 #undef TARGET_CXX_GUARD_MASK_BIT
20634 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20636 #undef TARGET_C_MODE_FOR_SUFFIX
20637 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20639 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20640 #undef TARGET_DEFAULT_TARGET_FLAGS
20641 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

/* Both raw result and raw argument modes come from the same hook.  */
#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

/* Shrink-wrapping: save/restore individual frame components only on the
   paths that need them.  */
#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

/* Deliberately defined to nothing: marks the vectorizer-builtin hooks
   below as provided by this target.  */
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv
/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable
/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi
/* Register the self-test hook only in checking builds: the #endif below
   had no matching opener, so restore the #if CHECKING_P guard.  */
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21047 struct gcc_target targetm
= TARGET_INITIALIZER
;
21049 #include "gt-aarch64.h"