[AArch64] Use calls for SVE TLSDESC
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
87 {
88 enum insn_type { MOV, MVN, INDEX, PTRUE };
89 enum modifier_type { LSL, MSL };
90
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode, rtx);
93 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
94 insn_type = MOV, modifier_type = LSL,
95 unsigned int = 0);
96 simd_immediate_info (scalar_mode, rtx, rtx);
97 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98
99 /* The mode of the elements. */
100 scalar_mode elt_mode;
101
102 /* The instruction to use to move the immediate into a vector. */
103 insn_type insn;
104
105 union
106 {
107 /* For MOV and MVN. */
108 struct
109 {
110 /* The value of each element. */
111 rtx value;
112
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier;
116 unsigned int shift;
117 } mov;
118
119 /* For INDEX. */
120 struct
121 {
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
124 rtx base, step;
125 } index;
126
127 /* For PTRUE. */
128 aarch64_svpattern pattern;
129 } u;
130 };
131
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
136 : elt_mode (elt_mode_in), insn (MOV)
137 {
138 u.mov.value = value_in;
139 u.mov.modifier = LSL;
140 u.mov.shift = 0;
141 }
142
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
145 fields. */
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in,
148 unsigned HOST_WIDE_INT value_in,
149 insn_type insn_in, modifier_type modifier_in,
150 unsigned int shift_in)
151 : elt_mode (elt_mode_in), insn (insn_in)
152 {
153 u.mov.value = gen_int_mode (value_in, elt_mode_in);
154 u.mov.modifier = modifier_in;
155 u.mov.shift = shift_in;
156 }
157
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
162 : elt_mode (elt_mode_in), insn (INDEX)
163 {
164 u.index.base = base_in;
165 u.index.step = step_in;
166 }
167
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in,
172 aarch64_svpattern pattern_in)
173 : elt_mode (elt_mode_in), insn (PTRUE)
174 {
175 u.pattern = pattern_in;
176 }
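/* Illustrative only (the operand values here are made up, not taken from a
   real caller): a call such as

     simd_immediate_info (HImode, 0x55, simd_immediate_info::MVN,
			  simd_immediate_info::LSL, 8)

   would describe an MVNI-style immediate of 0x55 shifted left by 8 for
   16-bit elements, while the (base, step) constructor above describes an
   SVE INDEX immediate.  */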
177
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel;
180
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg;
183
184 #ifdef HAVE_AS_TLS
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
187 #endif
188
189 static bool aarch64_composite_type_p (const_tree, machine_mode);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
191 const_tree,
192 machine_mode *, int *,
193 bool *);
194 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode);
198 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
200 const_tree type,
201 int misalignment,
202 bool is_packed);
203 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
204 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
205 aarch64_addr_query_type);
206 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version;
210
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune = cortexa53;
213
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags = 0;
216
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads;
219
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer;
222
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string = NULL;
225
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
228
229 /* Support for command line parsing of boolean flags in the tuning
230 structures. */
231 struct aarch64_flag_desc
232 {
233 const char* name;
234 unsigned int flag;
235 };
236
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 {
241 { "none", AARCH64_FUSE_NOTHING },
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL },
244 { NULL, AARCH64_FUSE_NOTHING }
245 };
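/* A sketch of how the table above is built: an entry such as
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) in aarch64-fusion-pairs.def
   expands through the macro above into { "mov+movk", AARCH64_FUSE_MOV_MOVK },
   giving the name-to-flag mapping used when parsing -moverride=fuse=...
   strings.  */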
246
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 {
251 { "none", AARCH64_EXTRA_TUNE_NONE },
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL },
254 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 };
256
257 /* Tuning parameters. */
258
259 static const struct cpu_addrcost_table generic_addrcost_table =
260 {
261 {
262 1, /* hi */
263 0, /* si */
264 0, /* di */
265 1, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
272 0 /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 {
277 {
278 0, /* hi */
279 0, /* si */
280 0, /* di */
281 2, /* ti */
282 },
283 0, /* pre_modify */
284 0, /* post_modify */
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 {
293 {
294 1, /* hi */
295 0, /* si */
296 0, /* di */
297 1, /* ti */
298 },
299 1, /* pre_modify */
300 1, /* post_modify */
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 {
309 {
310 1, /* hi */
311 1, /* si */
312 1, /* di */
313 2, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 {
325 {
326 1, /* hi */
327 0, /* si */
328 0, /* di */
329 1, /* ti */
330 },
331 0, /* pre_modify */
332 0, /* post_modify */
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
336 0, /* imm_offset */
337 };
338
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 {
341 {
342 1, /* hi */
343 1, /* si */
344 1, /* di */
345 2, /* ti */
346 },
347 1, /* pre_modify */
348 1, /* post_modify */
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
352 2, /* imm_offset */
353 };
354
355 static const struct cpu_regmove_cost generic_regmove_cost =
356 {
357 1, /* GP2GP */
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
360 5, /* GP2FP */
361 5, /* FP2GP */
362 2 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 5, /* GP2FP */
371 5, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 {
377 1, /* GP2GP */
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
380 5, /* GP2FP */
381 5, /* FP2GP */
382 2 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (the actual costs are 4 and 9). */
390 9, /* GP2FP */
391 9, /* FP2GP */
392 1 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 {
397 2, /* GP2GP */
398 2, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 2 /* FP2FP */
411 };
412
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 {
415 2, /* GP2GP */
416 /* Avoid the use of int<->fp moves for spilling. */
417 6, /* GP2FP */
418 6, /* FP2GP */
419 4 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of int<->fp moves for spilling. */
426 8, /* GP2FP */
427 8, /* FP2GP */
428 4 /* FP2FP */
429 };
430
431 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 {
433 1, /* GP2GP */
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
436 2, /* GP2FP */
437 3, /* FP2GP */
438 2 /* FP2FP */
439 };
440
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost =
443 {
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
459 };
460
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 {
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
479 };
480
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost =
483 {
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
499 };
500
501 static const struct cpu_vector_cost tsv110_vector_cost =
502 {
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
518 };
519
520 /* Generic costs for vector insn classes. */
521 static const struct cpu_vector_cost cortexa57_vector_cost =
522 {
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
538 };
539
540 static const struct cpu_vector_cost exynosm1_vector_cost =
541 {
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
557 };
558
559 /* Generic costs for vector insn classes. */
560 static const struct cpu_vector_cost xgene1_vector_cost =
561 {
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
577 };
578
579 /* Costs for vector insn classes for Vulcan. */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 {
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 3, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
597 };
598
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost =
601 {
602 1, /* Predictable. */
603 3 /* Unpredictable. */
604 };
605
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_NONE /* recip_sqrt */
612 };
613
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes =
616 {
617 AARCH64_APPROX_NONE, /* division */
618 AARCH64_APPROX_ALL, /* sqrt */
619 AARCH64_APPROX_ALL /* recip_sqrt */
620 };
621
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes =
624 {
625 AARCH64_APPROX_NONE, /* division */
626 AARCH64_APPROX_NONE, /* sqrt */
627 AARCH64_APPROX_ALL /* recip_sqrt */
628 };
629
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune =
632 {
633 0, /* num_slots */
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
640 };
641
642 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 {
644 0, /* num_slots */
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
651 };
652
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 {
655 4, /* num_slots */
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
662 };
663
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 {
666 8, /* num_slots */
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
673 };
674
675 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 {
677 8, /* num_slots */
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
684 };
685
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 {
688 8, /* num_slots */
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
695 };
696
697 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 {
699 0, /* num_slots */
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
706 };
707
708 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 {
710 8, /* num_slots */
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
717 };
718
719 static const struct tune_params generic_tunings =
720 {
721 &cortexa57_extra_costs,
722 &generic_addrcost_table,
723 &generic_regmove_cost,
724 &generic_vector_cost,
725 &generic_branch_cost,
726 &generic_approx_modes,
727 SVE_NOT_IMPLEMENTED, /* sve_width */
728 4, /* memmov_cost */
729 2, /* issue_rate */
730 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
742 &generic_prefetch_tune
743 };
744
745 static const struct tune_params cortexa35_tunings =
746 {
747 &cortexa53_extra_costs,
748 &generic_addrcost_table,
749 &cortexa53_regmove_cost,
750 &generic_vector_cost,
751 &generic_branch_cost,
752 &generic_approx_modes,
753 SVE_NOT_IMPLEMENTED, /* sve_width */
754 4, /* memmov_cost */
755 1, /* issue_rate */
756 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
769 &generic_prefetch_tune
770 };
771
772 static const struct tune_params cortexa53_tunings =
773 {
774 &cortexa53_extra_costs,
775 &generic_addrcost_table,
776 &cortexa53_regmove_cost,
777 &generic_vector_cost,
778 &generic_branch_cost,
779 &generic_approx_modes,
780 SVE_NOT_IMPLEMENTED, /* sve_width */
781 4, /* memmov_cost */
782 2, /* issue_rate */
783 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params cortexa57_tunings =
800 {
801 &cortexa57_extra_costs,
802 &generic_addrcost_table,
803 &cortexa57_regmove_cost,
804 &cortexa57_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 SVE_NOT_IMPLEMENTED, /* sve_width */
808 4, /* memmov_cost */
809 3, /* issue_rate */
810 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
823 &generic_prefetch_tune
824 };
825
826 static const struct tune_params cortexa72_tunings =
827 {
828 &cortexa57_extra_costs,
829 &generic_addrcost_table,
830 &cortexa57_regmove_cost,
831 &cortexa57_vector_cost,
832 &generic_branch_cost,
833 &generic_approx_modes,
834 SVE_NOT_IMPLEMENTED, /* sve_width */
835 4, /* memmov_cost */
836 3, /* issue_rate */
837 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
851 };
852
853 static const struct tune_params cortexa73_tunings =
854 {
855 &cortexa57_extra_costs,
856 &generic_addrcost_table,
857 &cortexa57_regmove_cost,
858 &cortexa57_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 SVE_NOT_IMPLEMENTED, /* sve_width */
862 4, /* memmov_cost. */
863 2, /* issue_rate. */
864 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &generic_prefetch_tune
878 };
879
880
881
882 static const struct tune_params exynosm1_tunings =
883 {
884 &exynosm1_extra_costs,
885 &exynosm1_addrcost_table,
886 &exynosm1_regmove_cost,
887 &exynosm1_vector_cost,
888 &generic_branch_cost,
889 &exynosm1_approx_modes,
890 SVE_NOT_IMPLEMENTED, /* sve_width */
891 4, /* memmov_cost */
892 3, /* issue_rate */
893 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
905 &exynosm1_prefetch_tune
906 };
907
908 static const struct tune_params thunderxt88_tunings =
909 {
910 &thunderx_extra_costs,
911 &generic_addrcost_table,
912 &thunderx_regmove_cost,
913 &thunderx_vector_cost,
914 &generic_branch_cost,
915 &generic_approx_modes,
916 SVE_NOT_IMPLEMENTED, /* sve_width */
917 6, /* memmov_cost */
918 2, /* issue_rate */
919 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
931 &thunderxt88_prefetch_tune
932 };
933
934 static const struct tune_params thunderx_tunings =
935 {
936 &thunderx_extra_costs,
937 &generic_addrcost_table,
938 &thunderx_regmove_cost,
939 &thunderx_vector_cost,
940 &generic_branch_cost,
941 &generic_approx_modes,
942 SVE_NOT_IMPLEMENTED, /* sve_width */
943 6, /* memmov_cost */
944 2, /* issue_rate */
945 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
958 &thunderx_prefetch_tune
959 };
960
961 static const struct tune_params tsv110_tunings =
962 {
963 &tsv110_extra_costs,
964 &tsv110_addrcost_table,
965 &tsv110_regmove_cost,
966 &tsv110_vector_cost,
967 &generic_branch_cost,
968 &generic_approx_modes,
969 SVE_NOT_IMPLEMENTED, /* sve_width */
970 4, /* memmov_cost */
971 4, /* issue_rate */
972 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
985 &tsv110_prefetch_tune
986 };
987
988 static const struct tune_params xgene1_tunings =
989 {
990 &xgene1_extra_costs,
991 &xgene1_addrcost_table,
992 &xgene1_regmove_cost,
993 &xgene1_vector_cost,
994 &generic_branch_cost,
995 &xgene1_approx_modes,
996 SVE_NOT_IMPLEMENTED, /* sve_width */
997 6, /* memmov_cost */
998 4, /* issue_rate */
999 AARCH64_FUSE_NOTHING, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1011 &xgene1_prefetch_tune
1012 };
1013
1014 static const struct tune_params emag_tunings =
1015 {
1016 &xgene1_extra_costs,
1017 &xgene1_addrcost_table,
1018 &xgene1_regmove_cost,
1019 &xgene1_vector_cost,
1020 &generic_branch_cost,
1021 &xgene1_approx_modes,
1022 SVE_NOT_IMPLEMENTED, /* sve_width */
1023 6, /* memmov_cost */
1024 4, /* issue_rate */
1025 AARCH64_FUSE_NOTHING, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1037 &xgene1_prefetch_tune
1038 };
1039
1040 static const struct tune_params qdf24xx_tunings =
1041 {
1042 &qdf24xx_extra_costs,
1043 &qdf24xx_addrcost_table,
1044 &qdf24xx_regmove_cost,
1045 &qdf24xx_vector_cost,
1046 &generic_branch_cost,
1047 &generic_approx_modes,
1048 SVE_NOT_IMPLEMENTED, /* sve_width */
1049 4, /* memmov_cost */
1050 4, /* issue_rate */
1051 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1065 };
1066
1067 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1068 for now. */
1069 static const struct tune_params saphira_tunings =
1070 {
1071 &generic_extra_costs,
1072 &generic_addrcost_table,
1073 &generic_regmove_cost,
1074 &generic_vector_cost,
1075 &generic_branch_cost,
1076 &generic_approx_modes,
1077 SVE_NOT_IMPLEMENTED, /* sve_width */
1078 4, /* memmov_cost */
1079 4, /* issue_rate */
1080 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1093 &generic_prefetch_tune
1094 };
1095
1096 static const struct tune_params thunderx2t99_tunings =
1097 {
1098 &thunderx2t99_extra_costs,
1099 &thunderx2t99_addrcost_table,
1100 &thunderx2t99_regmove_cost,
1101 &thunderx2t99_vector_cost,
1102 &generic_branch_cost,
1103 &generic_approx_modes,
1104 SVE_NOT_IMPLEMENTED, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1121 };
1122
1123 static const struct tune_params neoversen1_tunings =
1124 {
1125 &cortexa57_extra_costs,
1126 &generic_addrcost_table,
1127 &generic_regmove_cost,
1128 &cortexa57_vector_cost,
1129 &generic_branch_cost,
1130 &generic_approx_modes,
1131 SVE_NOT_IMPLEMENTED, /* sve_width */
1132 4, /* memmov_cost */
1133 3, /* issue_rate */
1134 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1146 &generic_prefetch_tune
1147 };
1148
1149 /* Support for fine-grained override of the tuning structures. */
1150 struct aarch64_tuning_override_function
1151 {
1152 const char* name;
1153 void (*parse_override)(const char*, struct tune_params*);
1154 };
1155
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions[] =
1162 {
1163 { "fuse", aarch64_parse_fuse_string },
1164 { "tune", aarch64_parse_tune_string },
1165 { "sve_width", aarch64_parse_sve_width_string },
1166 { NULL, NULL }
1167 };
1168
1169 /* A processor implementing AArch64. */
1170 struct processor
1171 {
1172 const char *const name;
1173 enum aarch64_processor ident;
1174 enum aarch64_processor sched_core;
1175 enum aarch64_arch arch;
1176 unsigned architecture_version;
1177 const uint64_t flags;
1178 const struct tune_params *const tune;
1179 };
1180
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures[] =
1183 {
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 };
1189
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores[] =
1192 {
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1199 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1200 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1201 };
1202
1203
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor *selected_arch;
1207 static const struct processor *selected_cpu;
1208 static const struct processor *selected_tune;
1209
1210 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params = generic_tunings;
1214
1215 /* Table of machine attributes. */
1216 static const struct attribute_spec aarch64_attribute_table[] =
1217 {
1218 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1219 affects_type_identity, handler, exclude } */
1220 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1221 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1222 };
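/* A minimal usage sketch (hypothetical declaration, not taken from this
   file):

     void vec_fn (void) __attribute__ ((aarch64_vector_pcs));

   declares a function whose type carries the attribute above, which in
   turn selects the vector PCS variant via aarch64_fntype_abi below.  */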
1223
1224 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1225
1226 /* An ISA extension in the co-processor and main instruction set space. */
1227 struct aarch64_option_extension
1228 {
1229 const char *const name;
1230 const unsigned long flags_on;
1231 const unsigned long flags_off;
1232 };
1233
1234 typedef enum aarch64_cond_code
1235 {
1236 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1237 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1238 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1239 }
1240 aarch64_cc;
1241
1242 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1243
1244 struct aarch64_branch_protect_type
1245 {
1246 /* The type's name that the user passes to the branch-protection option
1247 string. */
1248 const char* name;
1249 /* Function to handle the protection type and set global variables.
1250 First argument is the string token corresponding to this type and the
1251 second argument is the next token in the option string.
1252 Return values:
1253 * AARCH64_PARSE_OK: Handling was successful.
1254 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1255 should print an error.
1256 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1257 own error. */
1258 enum aarch64_parse_opt_result (*handler)(char*, char*);
1259 /* A list of types that can follow this type in the option string. */
1260 const aarch64_branch_protect_type* subtypes;
1261 unsigned int num_subtypes;
1262 };
1263
1264 static enum aarch64_parse_opt_result
1265 aarch64_handle_no_branch_protection (char* str, char* rest)
1266 {
1267 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1268 aarch64_enable_bti = 0;
1269 if (rest)
1270 {
1271 error ("unexpected %<%s%> after %<%s%>", rest, str);
1272 return AARCH64_PARSE_INVALID_FEATURE;
1273 }
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static enum aarch64_parse_opt_result
1278 aarch64_handle_standard_branch_protection (char* str, char* rest)
1279 {
1280 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1281 aarch64_ra_sign_key = AARCH64_KEY_A;
1282 aarch64_enable_bti = 1;
1283 if (rest)
1284 {
1285 error ("unexpected %<%s%> after %<%s%>", rest, str);
1286 return AARCH64_PARSE_INVALID_FEATURE;
1287 }
1288 return AARCH64_PARSE_OK;
1289 }
1290
1291 static enum aarch64_parse_opt_result
1292 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1293 char* rest ATTRIBUTE_UNUSED)
1294 {
1295 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1296 aarch64_ra_sign_key = AARCH64_KEY_A;
1297 return AARCH64_PARSE_OK;
1298 }
1299
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1302 char* rest ATTRIBUTE_UNUSED)
1303 {
1304 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1305 return AARCH64_PARSE_OK;
1306 }
1307
1308 static enum aarch64_parse_opt_result
1309 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1310 char* rest ATTRIBUTE_UNUSED)
1311 {
1312 aarch64_ra_sign_key = AARCH64_KEY_B;
1313 return AARCH64_PARSE_OK;
1314 }
1315
1316 static enum aarch64_parse_opt_result
1317 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1318 char* rest ATTRIBUTE_UNUSED)
1319 {
1320 aarch64_enable_bti = 1;
1321 return AARCH64_PARSE_OK;
1322 }
1323
1324 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1325 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1326 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1327 { NULL, NULL, NULL, 0 }
1328 };
1329
1330 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1331 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1332 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1333 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1334 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1335 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1336 { NULL, NULL, NULL, 0 }
1337 };
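/* As a worked example of how the tables above are used: parsing
   "-mbranch-protection=pac-ret+leaf+b-key" runs the "pac-ret" handler
   (return-address signing for non-leaf functions with key A), then the
   "leaf" subtype widens the signing scope to all functions and the
   "b-key" subtype switches the signing key to key B.  */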
1338
1339 /* The condition codes of the processor, and the inverse function. */
1340 static const char * const aarch64_condition_codes[] =
1341 {
1342 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1343 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 };
1345
1346 /* The preferred condition codes for SVE conditions. */
1347 static const char *const aarch64_sve_condition_codes[] =
1348 {
1349 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1350 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 };
1352
1353 /* Return the assembly token for svpattern value PATTERN. */
1354
1355 static const char *
1356 svpattern_token (enum aarch64_svpattern pattern)
1357 {
1358 switch (pattern)
1359 {
1360 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1361 AARCH64_FOR_SVPATTERN (CASE)
1362 #undef CASE
1363 case AARCH64_NUM_SVPATTERNS:
1364 break;
1365 }
1366 gcc_unreachable ();
1367 }
1368
1369 /* Return the descriptor of the SIMD ABI. */
1370
1371 static const predefined_function_abi &
1372 aarch64_simd_abi (void)
1373 {
1374 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1375 if (!simd_abi.initialized_p ())
1376 {
1377 HARD_REG_SET full_reg_clobbers
1378 = default_function_abi.full_reg_clobbers ();
1379 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1380 if (FP_SIMD_SAVED_REGNUM_P (regno))
1381 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1382 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1383 }
1384 return simd_abi;
1385 }
1386
1387 /* Generate code to enable conditional branches in functions over 1 MiB. */
1388 const char *
1389 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1390 const char * branch_format)
1391 {
1392 rtx_code_label * tmp_label = gen_label_rtx ();
1393 char label_buf[256];
1394 char buffer[128];
1395 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1396 CODE_LABEL_NUMBER (tmp_label));
1397 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1398 rtx dest_label = operands[pos_label];
1399 operands[pos_label] = tmp_label;
1400
1401 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1402 output_asm_insn (buffer, operands);
1403
1404 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1405 operands[pos_label] = dest_label;
1406 output_asm_insn (buffer, operands);
1407 return "";
1408 }
1409
1410 void
1411 aarch64_err_no_fpadvsimd (machine_mode mode)
1412 {
1413 if (TARGET_GENERAL_REGS_ONLY)
1414 if (FLOAT_MODE_P (mode))
1415 error ("%qs is incompatible with the use of floating-point types",
1416 "-mgeneral-regs-only");
1417 else
1418 error ("%qs is incompatible with the use of vector types",
1419 "-mgeneral-regs-only");
1420 else
1421 if (FLOAT_MODE_P (mode))
1422 error ("%qs feature modifier is incompatible with the use of"
1423 " floating-point types", "+nofp");
1424 else
1425 error ("%qs feature modifier is incompatible with the use of"
1426 " vector types", "+nofp");
1427 }
1428
1429 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1430 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1431 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1432 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1433 and GENERAL_REGS is lower than the memory cost (in this case the best class
1434 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1435 cost results in bad allocations with many redundant int<->FP moves which
1436 are expensive on various cores.
1437 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1438 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1439 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1440 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1441 The result of this is that it is no longer inefficient to have a higher
1442 memory move cost than the register move cost.
1443 */
1444
1445 static reg_class_t
1446 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1447 reg_class_t best_class)
1448 {
1449 machine_mode mode;
1450
1451 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1452 || !reg_class_subset_p (FP_REGS, allocno_class))
1453 return allocno_class;
1454
1455 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1456 || !reg_class_subset_p (FP_REGS, best_class))
1457 return best_class;
1458
1459 mode = PSEUDO_REGNO_MODE (regno);
1460 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1461 }
1462
1463 static unsigned int
1464 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1465 {
1466 if (GET_MODE_UNIT_SIZE (mode) == 4)
1467 return aarch64_tune_params.min_div_recip_mul_sf;
1468 return aarch64_tune_params.min_div_recip_mul_df;
1469 }
1470
1471 /* Return the reassociation width of treeop OPC with mode MODE. */
1472 static int
1473 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1474 {
1475 if (VECTOR_MODE_P (mode))
1476 return aarch64_tune_params.vec_reassoc_width;
1477 if (INTEGRAL_MODE_P (mode))
1478 return aarch64_tune_params.int_reassoc_width;
1479 /* Avoid reassociating floating point addition so we emit more FMAs. */
1480 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1481 return aarch64_tune_params.fp_reassoc_width;
1482 return 1;
1483 }
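/* For example, with generic_tunings this returns a width of 2 for integer
   ops and 4 for FP ops other than addition, but deliberately returns 1 for
   FP additions so that addition chains stay contractable into FMAs.  */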
1484
1485 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1486 unsigned
1487 aarch64_dbx_register_number (unsigned regno)
1488 {
1489 if (GP_REGNUM_P (regno))
1490 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1491 else if (regno == SP_REGNUM)
1492 return AARCH64_DWARF_SP;
1493 else if (FP_REGNUM_P (regno))
1494 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1495 else if (PR_REGNUM_P (regno))
1496 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1497 else if (regno == VG_REGNUM)
1498 return AARCH64_DWARF_VG;
1499
1500 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1501 equivalent DWARF register. */
1502 return DWARF_FRAME_REGISTERS;
1503 }
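/* For example, under the standard AArch64 DWARF numbering this maps x0-x30
   to 0-30, sp to 31, v0-v31 to 64-95, p0-p15 to 48-63 and VG to 46; other
   registers (e.g. the condition flags) report "no DWARF equivalent".  */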
1504
1505 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1506 integer, otherwise return X unmodified. */
1507 static rtx
1508 aarch64_bit_representation (rtx x)
1509 {
1510 if (CONST_DOUBLE_P (x))
1511 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1512 return x;
1513 }
1514
1515 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1516 static bool
1517 aarch64_advsimd_struct_mode_p (machine_mode mode)
1518 {
1519 return (TARGET_SIMD
1520 && (mode == OImode || mode == CImode || mode == XImode));
1521 }
1522
1523 /* Return true if MODE is an SVE predicate mode. */
1524 static bool
1525 aarch64_sve_pred_mode_p (machine_mode mode)
1526 {
1527 return (TARGET_SVE
1528 && (mode == VNx16BImode
1529 || mode == VNx8BImode
1530 || mode == VNx4BImode
1531 || mode == VNx2BImode));
1532 }
1533
1534 /* Three mutually-exclusive flags describing a vector or predicate type. */
1535 const unsigned int VEC_ADVSIMD = 1;
1536 const unsigned int VEC_SVE_DATA = 2;
1537 const unsigned int VEC_SVE_PRED = 4;
1538 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1539 a structure of 2, 3 or 4 vectors. */
1540 const unsigned int VEC_STRUCT = 8;
1541 /* Useful combinations of the above. */
1542 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1543 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1544
1545 /* Return a set of flags describing the vector properties of mode MODE.
1546 Ignore modes that are not supported by the current target. */
1547 static unsigned int
1548 aarch64_classify_vector_mode (machine_mode mode)
1549 {
1550 if (aarch64_advsimd_struct_mode_p (mode))
1551 return VEC_ADVSIMD | VEC_STRUCT;
1552
1553 if (aarch64_sve_pred_mode_p (mode))
1554 return VEC_SVE_PRED;
1555
1556 /* Make the decision based on the mode's enum value rather than its
1557 properties, so that we keep the correct classification regardless
1558 of -msve-vector-bits. */
1559 switch (mode)
1560 {
1561 /* Single SVE vectors. */
1562 case E_VNx16QImode:
1563 case E_VNx8HImode:
1564 case E_VNx4SImode:
1565 case E_VNx2DImode:
1566 case E_VNx8HFmode:
1567 case E_VNx4SFmode:
1568 case E_VNx2DFmode:
1569 return TARGET_SVE ? VEC_SVE_DATA : 0;
1570
1571 /* x2 SVE vectors. */
1572 case E_VNx32QImode:
1573 case E_VNx16HImode:
1574 case E_VNx8SImode:
1575 case E_VNx4DImode:
1576 case E_VNx16HFmode:
1577 case E_VNx8SFmode:
1578 case E_VNx4DFmode:
1579 /* x3 SVE vectors. */
1580 case E_VNx48QImode:
1581 case E_VNx24HImode:
1582 case E_VNx12SImode:
1583 case E_VNx6DImode:
1584 case E_VNx24HFmode:
1585 case E_VNx12SFmode:
1586 case E_VNx6DFmode:
1587 /* x4 SVE vectors. */
1588 case E_VNx64QImode:
1589 case E_VNx32HImode:
1590 case E_VNx16SImode:
1591 case E_VNx8DImode:
1592 case E_VNx32HFmode:
1593 case E_VNx16SFmode:
1594 case E_VNx8DFmode:
1595 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1596
1597 /* 64-bit Advanced SIMD vectors. */
1598 case E_V8QImode:
1599 case E_V4HImode:
1600 case E_V2SImode:
1601 /* ...E_V1DImode doesn't exist. */
1602 case E_V4HFmode:
1603 case E_V2SFmode:
1604 case E_V1DFmode:
1605 /* 128-bit Advanced SIMD vectors. */
1606 case E_V16QImode:
1607 case E_V8HImode:
1608 case E_V4SImode:
1609 case E_V2DImode:
1610 case E_V8HFmode:
1611 case E_V4SFmode:
1612 case E_V2DFmode:
1613 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1614
1615 default:
1616 return 0;
1617 }
1618 }
1619
1620 /* Return true if MODE is any of the data vector modes, including
1621 structure modes. */
1622 static bool
1623 aarch64_vector_data_mode_p (machine_mode mode)
1624 {
1625 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1626 }
1627
1628 /* Return true if MODE is any form of SVE mode, including predicates,
1629 vectors and structures. */
1630 bool
1631 aarch64_sve_mode_p (machine_mode mode)
1632 {
1633 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1634 }
1635
1636 /* Return true if MODE is an SVE data vector mode; either a single vector
1637 or a structure of vectors. */
1638 static bool
1639 aarch64_sve_data_mode_p (machine_mode mode)
1640 {
1641 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1642 }
1643
1644 /* Implement target hook TARGET_ARRAY_MODE. */
1645 static opt_machine_mode
1646 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1647 {
1648 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1649 && IN_RANGE (nelems, 2, 4))
1650 return mode_for_vector (GET_MODE_INNER (mode),
1651 GET_MODE_NUNITS (mode) * nelems);
1652
1653 return opt_machine_mode ();
1654 }
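/* For example, an array of three VNx4SImode vectors (as used by SVE
   LD3/ST3) maps onto the VNx12SImode structure mode classified above.  */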
1655
1656 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1657 static bool
1658 aarch64_array_mode_supported_p (machine_mode mode,
1659 unsigned HOST_WIDE_INT nelems)
1660 {
1661 if (TARGET_SIMD
1662 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1663 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1664 && (nelems >= 2 && nelems <= 4))
1665 return true;
1666
1667 return false;
1668 }
1669
1670 /* Return the SVE predicate mode to use for elements that have
1671 ELEM_NBYTES bytes, if such a mode exists. */
1672
1673 opt_machine_mode
1674 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1675 {
1676 if (TARGET_SVE)
1677 {
1678 if (elem_nbytes == 1)
1679 return VNx16BImode;
1680 if (elem_nbytes == 2)
1681 return VNx8BImode;
1682 if (elem_nbytes == 4)
1683 return VNx4BImode;
1684 if (elem_nbytes == 8)
1685 return VNx2BImode;
1686 }
1687 return opt_machine_mode ();
1688 }
1689
1690 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1691
1692 static opt_machine_mode
1693 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1694 {
1695 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1696 {
1697 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1698 machine_mode pred_mode;
1699 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1700 return pred_mode;
1701 }
1702
1703 return default_get_mask_mode (nunits, nbytes);
1704 }
1705
1706 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1707
1708 static opt_machine_mode
1709 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1710 {
1711 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1712 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1713 machine_mode mode;
1714 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1715 if (inner_mode == GET_MODE_INNER (mode)
1716 && known_eq (nunits, GET_MODE_NUNITS (mode))
1717 && aarch64_sve_data_mode_p (mode))
1718 return mode;
1719 return opt_machine_mode ();
1720 }
1721
1722 /* Return the integer element mode associated with SVE mode MODE. */
1723
1724 static scalar_int_mode
1725 aarch64_sve_element_int_mode (machine_mode mode)
1726 {
1727 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1728 GET_MODE_NUNITS (mode));
1729 return int_mode_for_size (elt_bits, 0).require ();
1730 }
1731
1732 /* Return the integer vector mode associated with SVE mode MODE.
1733 Unlike mode_for_int_vector, this can handle the case in which
1734 MODE is a predicate (and thus has a different total size). */
1735
1736 static machine_mode
1737 aarch64_sve_int_mode (machine_mode mode)
1738 {
1739 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1740 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1741 }
1742
1743 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1744 prefer to use the first arithmetic operand as the else value if
1745 the else value doesn't matter, since that exactly matches the SVE
1746 destructive merging form. For ternary operations we could either
1747 pick the first operand and use FMAD-like instructions or the last
1748 operand and use FMLA-like instructions; the latter seems more
1749 natural. */
1750
1751 static tree
1752 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1753 {
1754 return nops == 3 ? ops[2] : ops[0];
1755 }
1756
1757 /* Implement TARGET_HARD_REGNO_NREGS. */
1758
1759 static unsigned int
1760 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1761 {
1762 /* ??? Logically we should only need to provide a value when
1763 HARD_REGNO_MODE_OK says that the combination is valid,
1764 but at the moment we need to handle all modes. Just ignore
1765 any runtime parts for registers that can't store them. */
1766 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1767 switch (aarch64_regno_regclass (regno))
1768 {
1769 case FP_REGS:
1770 case FP_LO_REGS:
1771 case FP_LO8_REGS:
1772 if (aarch64_sve_data_mode_p (mode))
1773 return exact_div (GET_MODE_SIZE (mode),
1774 BYTES_PER_SVE_VECTOR).to_constant ();
1775 return CEIL (lowest_size, UNITS_PER_VREG);
1776 case PR_REGS:
1777 case PR_LO_REGS:
1778 case PR_HI_REGS:
1779 return 1;
1780 default:
1781 return CEIL (lowest_size, UNITS_PER_WORD);
1782 }
1783 gcc_unreachable ();
1784 }
1785
1786 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1787
1788 static bool
1789 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1790 {
1791 if (GET_MODE_CLASS (mode) == MODE_CC)
1792 return regno == CC_REGNUM;
1793
1794 if (regno == VG_REGNUM)
1795 /* This must have the same size as _Unwind_Word. */
1796 return mode == DImode;
1797
1798 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1799 if (vec_flags & VEC_SVE_PRED)
1800 return PR_REGNUM_P (regno);
1801
1802 if (PR_REGNUM_P (regno))
1803 return 0;
1804
1805 if (regno == SP_REGNUM)
1806 /* The purpose of comparing with ptr_mode is to support the
1807 global register variable associated with the stack pointer
1808 register via the syntax of asm ("wsp") in ILP32. */
1809 return mode == Pmode || mode == ptr_mode;
1810
1811 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1812 return mode == Pmode;
1813
1814 if (GP_REGNUM_P (regno))
1815 {
1816 if (known_le (GET_MODE_SIZE (mode), 8))
1817 return true;
1818 else if (known_le (GET_MODE_SIZE (mode), 16))
1819 return (regno & 1) == 0;
1820 }
1821 else if (FP_REGNUM_P (regno))
1822 {
1823 if (vec_flags & VEC_STRUCT)
1824 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1825 else
1826 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1827 }
1828
1829 return false;
1830 }
1831
1832 /* Implement TARGET_FNTYPE_ABI. */
1833
1834 static const predefined_function_abi &
1835 aarch64_fntype_abi (const_tree fntype)
1836 {
1837 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
1838 return aarch64_simd_abi ();
1839 return default_function_abi;
1840 }
1841
1842 /* Return true if this is a definition of a vectorized simd function. */
1843
1844 static bool
1845 aarch64_simd_decl_p (tree fndecl)
1846 {
1847 tree fntype;
1848
1849 if (fndecl == NULL)
1850 return false;
1851 fntype = TREE_TYPE (fndecl);
1852 if (fntype == NULL)
1853 return false;
1854
1855 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1856 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1857 return true;
1858
1859 return false;
1860 }
1861
1862 /* Return the mode a register save/restore should use. DImode for integer
1863 registers, DFmode for FP registers in non-SIMD functions (they only save
1864 the bottom half of a 128-bit register), or TFmode for FP registers in
1865 SIMD functions. */
1866
1867 static machine_mode
1868 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1869 {
1870 return GP_REGNUM_P (regno)
1871 ? E_DImode
1872 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1873 }
1874
1875 /* Implement TARGET_INSN_CALLEE_ABI. */
1876
1877 const predefined_function_abi &
1878 aarch64_insn_callee_abi (const rtx_insn *insn)
1879 {
1880 rtx pat = PATTERN (insn);
1881 gcc_assert (GET_CODE (pat) == PARALLEL);
1882 rtx unspec = XVECEXP (pat, 0, 1);
1883 gcc_assert (GET_CODE (unspec) == UNSPEC
1884 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
1885 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
1886 }
1887
1888 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1889 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1890 clobbers the top 64 bits when restoring the bottom 64 bits. */
1891
1892 static bool
1893 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
1894 unsigned int regno,
1895 machine_mode mode)
1896 {
1897 if (FP_REGNUM_P (regno))
1898 {
1899 poly_int64 per_register_size = GET_MODE_SIZE (mode);
1900 unsigned int nregs = hard_regno_nregs (regno, mode);
1901 if (nregs > 1)
1902 per_register_size = exact_div (per_register_size, nregs);
1903 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
1904 return maybe_gt (per_register_size, 16);
1905 return maybe_gt (per_register_size, 8);
1906 }
1907 return false;
1908 }
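
/* For example, a TFmode value in an FP register (16 bytes per register) is
   partially clobbered under the base PCS, where only the low 64 bits are
   preserved, but not under ARM_PCS_SIMD or ARM_PCS_TLSDESC, which preserve
   the full 128 bits.  An SVE data mode such as VNx4SImode can exceed 128 bits
   at run time, so it counts as partially clobbered for every ABI handled
   here.  */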
1909
1910 /* Implement REGMODE_NATURAL_SIZE. */
1911 poly_uint64
1912 aarch64_regmode_natural_size (machine_mode mode)
1913 {
1914 /* The natural size for SVE data modes is one SVE data vector,
1915 and similarly for predicates. We can't independently modify
1916 anything smaller than that. */
1917 /* ??? For now, only do this for variable-width SVE registers.
1918 Doing it for constant-sized registers breaks lower-subreg.c. */
1919 /* ??? And once that's fixed, we should probably have similar
1920 code for Advanced SIMD. */
1921 if (!aarch64_sve_vg.is_constant ())
1922 {
1923 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1924 if (vec_flags & VEC_SVE_PRED)
1925 return BYTES_PER_SVE_PRED;
1926 if (vec_flags & VEC_SVE_DATA)
1927 return BYTES_PER_SVE_VECTOR;
1928 }
1929 return UNITS_PER_WORD;
1930 }
1931
1932 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1933 machine_mode
1934 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1935 machine_mode mode)
1936 {
1937 /* The predicate mode determines which bits are significant and
1938 which are "don't care". Decreasing the number of lanes would
1939 lose data while increasing the number of lanes would make bits
1940 unnecessarily significant. */
1941 if (PR_REGNUM_P (regno))
1942 return mode;
1943 if (known_ge (GET_MODE_SIZE (mode), 4))
1944 return mode;
1945 else
1946 return SImode;
1947 }
1948
1949 /* Return true if I's bits are consecutive ones from the MSB. */
1950 bool
1951 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1952 {
1953 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1954 }
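
/* For example, 0xffffffffffff0000 gives -i == 0x10000, a power of two, so the
   function returns true (bits 63..16 are all ones).  0x00ff000000000000 gives
   a non-power-of-two negation, so it returns false: its set bits do not reach
   the most significant bit.  */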
1955
1956 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1957 that strcpy from constants will be faster. */
1958
1959 static HOST_WIDE_INT
1960 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1961 {
1962 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1963 return MAX (align, BITS_PER_WORD);
1964 return align;
1965 }
1966
1967 /* Return true if calls to DECL should be treated as
1968 long-calls (i.e. called via a register). */
1969 static bool
1970 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1971 {
1972 return false;
1973 }
1974
1975 /* Return true if calls to symbol-ref SYM should be treated as
1976 long-calls (i.e. called via a register). */
1977 bool
1978 aarch64_is_long_call_p (rtx sym)
1979 {
1980 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1981 }
1982
1983 /* Return true if calls to symbol-ref SYM should not go through
1984 plt stubs. */
1985
1986 bool
1987 aarch64_is_noplt_call_p (rtx sym)
1988 {
1989 const_tree decl = SYMBOL_REF_DECL (sym);
1990
1991 if (flag_pic
1992 && decl
1993 && (!flag_plt
1994 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1995 && !targetm.binds_local_p (decl))
1996 return true;
1997
1998 return false;
1999 }
2000
2001 /* Return true if the offsets to a zero/sign-extract operation
2002 represent an expression that matches an extend operation. The
2003 operands represent the parameters from
2004
2005 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2006 bool
2007 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2008 rtx extract_imm)
2009 {
2010 HOST_WIDE_INT mult_val, extract_val;
2011
2012 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2013 return false;
2014
2015 mult_val = INTVAL (mult_imm);
2016 extract_val = INTVAL (extract_imm);
2017
2018 if (extract_val > 8
2019 && extract_val < GET_MODE_BITSIZE (mode)
2020 && exact_log2 (extract_val & ~7) > 0
2021 && (extract_val & 7) <= 4
2022 && mult_val == (1 << (extract_val & 7)))
2023 return true;
2024
2025 return false;
2026 }
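
/* A worked example: with MODE == DImode, EXTRACT_IMM == 34 and MULT_IMM == 4,
   we have 34 > 8, 34 < 64, 34 & ~7 == 32 (a power of two), 34 & 7 == 2 <= 4
   and 4 == 1 << 2, so the function returns true.  This roughly corresponds to
   extracting the low 34 bits of (reg * 4), i.e. a 32-bit value extended and
   then shifted left by 2, as used in extended-register address
   arithmetic.  */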
2027
2028 /* Emit an insn that's a simple single-set. Both the operands must be
2029 known to be valid. */
2030 inline static rtx_insn *
2031 emit_set_insn (rtx x, rtx y)
2032 {
2033 return emit_insn (gen_rtx_SET (x, y));
2034 }
2035
2036 /* X and Y are two things to compare using CODE. Emit the compare insn and
2037 return the rtx for the flags register (CC_REGNUM) in the proper mode. */
2038 rtx
2039 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2040 {
2041 machine_mode cmp_mode = GET_MODE (x);
2042 machine_mode cc_mode;
2043 rtx cc_reg;
2044
2045 if (cmp_mode == TImode)
2046 {
2047 gcc_assert (code == NE);
2048
2049 cc_mode = CCmode;
2050 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2051
2052 rtx x_lo = operand_subword (x, 0, 0, TImode);
2053 rtx y_lo = operand_subword (y, 0, 0, TImode);
2054 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2055
2056 rtx x_hi = operand_subword (x, 1, 0, TImode);
2057 rtx y_hi = operand_subword (y, 1, 0, TImode);
2058 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2059 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2060 GEN_INT (AARCH64_EQ)));
2061 }
2062 else
2063 {
2064 cc_mode = SELECT_CC_MODE (code, x, y);
2065 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2066 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2067 }
2068 return cc_reg;
2069 }
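
/* A minimal usage sketch (illustrative only; OP0, OP1 and LABEL are assumed
   to be a pair of DImode registers and a code label supplied by the caller):
   compare OP0 with OP1 and branch to LABEL when OP0 < OP1.  */
#if 0
  rtx cc_reg = aarch64_gen_compare_reg (LT, op0, op1);
  rtx cond = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx);
  emit_jump_insn (gen_rtx_SET (pc_rtx,
			       gen_rtx_IF_THEN_ELSE (VOIDmode, cond,
						     gen_rtx_LABEL_REF (VOIDmode,
									label),
						     pc_rtx)));
#endif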
2070
2071 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2072
2073 static rtx
2074 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2075 machine_mode y_mode)
2076 {
2077 if (y_mode == E_QImode || y_mode == E_HImode)
2078 {
2079 if (CONST_INT_P (y))
2080 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2081 else
2082 {
2083 rtx t, cc_reg;
2084 machine_mode cc_mode;
2085
2086 t = gen_rtx_ZERO_EXTEND (SImode, y);
2087 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2088 cc_mode = CC_SWPmode;
2089 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2090 emit_set_insn (cc_reg, t);
2091 return cc_reg;
2092 }
2093 }
2094
2095 if (!aarch64_plus_operand (y, y_mode))
2096 y = force_reg (y_mode, y);
2097
2098 return aarch64_gen_compare_reg (code, x, y);
2099 }
2100
2101 /* Build the SYMBOL_REF for __tls_get_addr. */
2102
2103 static GTY(()) rtx tls_get_addr_libfunc;
2104
2105 rtx
2106 aarch64_tls_get_addr (void)
2107 {
2108 if (!tls_get_addr_libfunc)
2109 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2110 return tls_get_addr_libfunc;
2111 }
2112
2113 /* Return the TLS model to use for ADDR. */
2114
2115 static enum tls_model
2116 tls_symbolic_operand_type (rtx addr)
2117 {
2118 enum tls_model tls_kind = TLS_MODEL_NONE;
2119 if (GET_CODE (addr) == CONST)
2120 {
2121 poly_int64 addend;
2122 rtx sym = strip_offset (addr, &addend);
2123 if (GET_CODE (sym) == SYMBOL_REF)
2124 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2125 }
2126 else if (GET_CODE (addr) == SYMBOL_REF)
2127 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2128
2129 return tls_kind;
2130 }
2131
2132 /* We allow lo_sum's in our legitimate addresses so that combine can
2133 take care of combining addresses where necessary, but for generation
2134 purposes we generate the address
2135 as:
2136 RTL Absolute
2137 tmp = hi (symbol_ref); adrp x1, foo
2138 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2139 nop
2140
2141 PIC TLS
2142 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2143 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2144 bl __tls_get_addr
2145 nop
2146
2147 Load TLS symbol, depending on TLS mechanism and TLS access model.
2148
2149 Global Dynamic - Traditional TLS:
2150 adrp tmp, :tlsgd:imm
2151 add dest, tmp, #:tlsgd_lo12:imm
2152 bl __tls_get_addr
2153
2154 Global Dynamic - TLS Descriptors:
2155 adrp dest, :tlsdesc:imm
2156 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2157 add dest, dest, #:tlsdesc_lo12:imm
2158 blr tmp
2159 mrs tp, tpidr_el0
2160 add dest, dest, tp
2161
2162 Initial Exec:
2163 mrs tp, tpidr_el0
2164 adrp tmp, :gottprel:imm
2165 ldr dest, [tmp, #:gottprel_lo12:imm]
2166 add dest, dest, tp
2167
2168 Local Exec:
2169 mrs tp, tpidr_el0
2170 add t0, tp, #:tprel_hi12:imm, lsl #12
2171 add t0, t0, #:tprel_lo12_nc:imm
2172 */
2173
2174 static void
2175 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2176 enum aarch64_symbol_type type)
2177 {
2178 switch (type)
2179 {
2180 case SYMBOL_SMALL_ABSOLUTE:
2181 {
2182 /* In ILP32, the mode of dest can be either SImode or DImode. */
2183 rtx tmp_reg = dest;
2184 machine_mode mode = GET_MODE (dest);
2185
2186 gcc_assert (mode == Pmode || mode == ptr_mode);
2187
2188 if (can_create_pseudo_p ())
2189 tmp_reg = gen_reg_rtx (mode);
2190
2191 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2192 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2193 return;
2194 }
2195
2196 case SYMBOL_TINY_ABSOLUTE:
2197 emit_insn (gen_rtx_SET (dest, imm));
2198 return;
2199
2200 case SYMBOL_SMALL_GOT_28K:
2201 {
2202 machine_mode mode = GET_MODE (dest);
2203 rtx gp_rtx = pic_offset_table_rtx;
2204 rtx insn;
2205 rtx mem;
2206
2207 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2208 here before RTL expansion. Tree IVOPTS will generate an RTL pattern
2209 to compute rtx costs, in which case pic_offset_table_rtx is not
2210 initialized. In that case there is no need to generate the first
2211 adrp instruction, as the final cost for global variable access is
2212 one instruction. */
2213 if (gp_rtx != NULL)
2214 {
2215 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2216 we use the page base as the GOT base, the first page may be wasted;
2217 in the worst case there is only 28K of space for the GOT).
2218
2219 The generated instruction sequence for accessing a global
2220 variable is:
2221
2222 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2223
2224 Only one instruction is needed. But we must initialize
2225 pic_offset_table_rtx properly. We generate an initialization insn
2226 for every global access, and allow CSE to remove all redundant ones.
2227
2228 The final instruction sequence will look like the following
2229 for multiple global variable accesses.
2230
2231 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2232
2233 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2234 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2235 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2236 ... */
2237
2238 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2239 crtl->uses_pic_offset_table = 1;
2240 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2241
2242 if (mode != GET_MODE (gp_rtx))
2243 gp_rtx = gen_lowpart (mode, gp_rtx);
2244
2245 }
2246
2247 if (mode == ptr_mode)
2248 {
2249 if (mode == DImode)
2250 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2251 else
2252 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2253
2254 mem = XVECEXP (SET_SRC (insn), 0, 0);
2255 }
2256 else
2257 {
2258 gcc_assert (mode == Pmode);
2259
2260 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2261 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2262 }
2263
2264 /* The operand is expected to be a MEM. Whenever the related insn
2265 pattern changes, the above code which calculates MEM should be
2266 updated. */
2267 gcc_assert (GET_CODE (mem) == MEM);
2268 MEM_READONLY_P (mem) = 1;
2269 MEM_NOTRAP_P (mem) = 1;
2270 emit_insn (insn);
2271 return;
2272 }
2273
2274 case SYMBOL_SMALL_GOT_4G:
2275 {
2276 /* In ILP32, the mode of dest can be either SImode or DImode,
2277 while the got entry is always of SImode size. The mode of
2278 dest depends on how dest is used: if dest is assigned to a
2279 pointer (e.g. in memory), it has SImode; it may have
2280 DImode if dest is dereferenced to access the memory.
2281 This is why we have to handle three different ldr_got_small
2282 patterns here (two patterns for ILP32). */
2283
2284 rtx insn;
2285 rtx mem;
2286 rtx tmp_reg = dest;
2287 machine_mode mode = GET_MODE (dest);
2288
2289 if (can_create_pseudo_p ())
2290 tmp_reg = gen_reg_rtx (mode);
2291
2292 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2293 if (mode == ptr_mode)
2294 {
2295 if (mode == DImode)
2296 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2297 else
2298 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2299
2300 mem = XVECEXP (SET_SRC (insn), 0, 0);
2301 }
2302 else
2303 {
2304 gcc_assert (mode == Pmode);
2305
2306 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2307 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2308 }
2309
2310 gcc_assert (GET_CODE (mem) == MEM);
2311 MEM_READONLY_P (mem) = 1;
2312 MEM_NOTRAP_P (mem) = 1;
2313 emit_insn (insn);
2314 return;
2315 }
2316
2317 case SYMBOL_SMALL_TLSGD:
2318 {
2319 rtx_insn *insns;
2320 machine_mode mode = GET_MODE (dest);
2321 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2322
2323 start_sequence ();
2324 if (TARGET_ILP32)
2325 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2326 else
2327 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2328 insns = get_insns ();
2329 end_sequence ();
2330
2331 RTL_CONST_CALL_P (insns) = 1;
2332 emit_libcall_block (insns, dest, result, imm);
2333 return;
2334 }
2335
2336 case SYMBOL_SMALL_TLSDESC:
2337 {
2338 machine_mode mode = GET_MODE (dest);
2339 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2340 rtx tp;
2341
2342 gcc_assert (mode == Pmode || mode == ptr_mode);
2343
2344 /* In ILP32, the got entry is always of SImode size. Unlike
2345 small GOT, the dest is fixed at reg 0. */
2346 if (TARGET_ILP32)
2347 emit_insn (gen_tlsdesc_small_si (imm));
2348 else
2349 emit_insn (gen_tlsdesc_small_di (imm));
2350 tp = aarch64_load_tp (NULL);
2351
2352 if (mode != Pmode)
2353 tp = gen_lowpart (mode, tp);
2354
2355 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2356 if (REG_P (dest))
2357 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2358 return;
2359 }
2360
2361 case SYMBOL_SMALL_TLSIE:
2362 {
2363 /* In ILP32, the mode of dest can be either SImode or DImode,
2364 while the got entry is always of SImode size. The mode of
2365 dest depends on how dest is used: if dest is assigned to a
2366 pointer (e.g. in memory), it has SImode; it may have
2367 DImode if dest is dereferenced to access the memory.
2368 This is why we have to handle three different tlsie_small
2369 patterns here (two patterns for ILP32). */
2370 machine_mode mode = GET_MODE (dest);
2371 rtx tmp_reg = gen_reg_rtx (mode);
2372 rtx tp = aarch64_load_tp (NULL);
2373
2374 if (mode == ptr_mode)
2375 {
2376 if (mode == DImode)
2377 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2378 else
2379 {
2380 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2381 tp = gen_lowpart (mode, tp);
2382 }
2383 }
2384 else
2385 {
2386 gcc_assert (mode == Pmode);
2387 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2388 }
2389
2390 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2391 if (REG_P (dest))
2392 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2393 return;
2394 }
2395
2396 case SYMBOL_TLSLE12:
2397 case SYMBOL_TLSLE24:
2398 case SYMBOL_TLSLE32:
2399 case SYMBOL_TLSLE48:
2400 {
2401 machine_mode mode = GET_MODE (dest);
2402 rtx tp = aarch64_load_tp (NULL);
2403
2404 if (mode != Pmode)
2405 tp = gen_lowpart (mode, tp);
2406
2407 switch (type)
2408 {
2409 case SYMBOL_TLSLE12:
2410 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2411 (dest, tp, imm));
2412 break;
2413 case SYMBOL_TLSLE24:
2414 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2415 (dest, tp, imm));
2416 break;
2417 case SYMBOL_TLSLE32:
2418 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2419 (dest, imm));
2420 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2421 (dest, dest, tp));
2422 break;
2423 case SYMBOL_TLSLE48:
2424 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2425 (dest, imm));
2426 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2427 (dest, dest, tp));
2428 break;
2429 default:
2430 gcc_unreachable ();
2431 }
2432
2433 if (REG_P (dest))
2434 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2435 return;
2436 }
2437
2438 case SYMBOL_TINY_GOT:
2439 emit_insn (gen_ldr_got_tiny (dest, imm));
2440 return;
2441
2442 case SYMBOL_TINY_TLSIE:
2443 {
2444 machine_mode mode = GET_MODE (dest);
2445 rtx tp = aarch64_load_tp (NULL);
2446
2447 if (mode == ptr_mode)
2448 {
2449 if (mode == DImode)
2450 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2451 else
2452 {
2453 tp = gen_lowpart (mode, tp);
2454 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2455 }
2456 }
2457 else
2458 {
2459 gcc_assert (mode == Pmode);
2460 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2461 }
2462
2463 if (REG_P (dest))
2464 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2465 return;
2466 }
2467
2468 default:
2469 gcc_unreachable ();
2470 }
2471 }
2472
2473 /* Emit a move from SRC to DEST. Assume that the move expanders can
2474 handle all moves if !can_create_pseudo_p (). The distinction is
2475 important because, unlike emit_move_insn, the move expanders know
2476 how to force Pmode objects into the constant pool even when the
2477 constant pool address is not itself legitimate. */
2478 static rtx
2479 aarch64_emit_move (rtx dest, rtx src)
2480 {
2481 return (can_create_pseudo_p ()
2482 ? emit_move_insn (dest, src)
2483 : emit_move_insn_1 (dest, src));
2484 }
2485
2486 /* Apply UNOPTAB to OP and store the result in DEST. */
2487
2488 static void
2489 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2490 {
2491 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2492 if (dest != tmp)
2493 emit_move_insn (dest, tmp);
2494 }
2495
2496 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2497
2498 static void
2499 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2500 {
2501 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2502 OPTAB_DIRECT);
2503 if (dest != tmp)
2504 emit_move_insn (dest, tmp);
2505 }
2506
2507 /* Split a 128-bit move operation into two 64-bit move operations,
2508 taking care to handle partial overlap of register to register
2509 copies. Special cases are needed when moving between GP regs and
2510 FP regs. SRC can be a register, constant or memory; DST a register
2511 or memory. If either operand is memory it must not have any side
2512 effects. */
2513 void
2514 aarch64_split_128bit_move (rtx dst, rtx src)
2515 {
2516 rtx dst_lo, dst_hi;
2517 rtx src_lo, src_hi;
2518
2519 machine_mode mode = GET_MODE (dst);
2520
2521 gcc_assert (mode == TImode || mode == TFmode);
2522 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2523 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2524
2525 if (REG_P (dst) && REG_P (src))
2526 {
2527 int src_regno = REGNO (src);
2528 int dst_regno = REGNO (dst);
2529
2530 /* Handle FP <-> GP regs. */
2531 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2532 {
2533 src_lo = gen_lowpart (word_mode, src);
2534 src_hi = gen_highpart (word_mode, src);
2535
2536 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2537 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2538 return;
2539 }
2540 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2541 {
2542 dst_lo = gen_lowpart (word_mode, dst);
2543 dst_hi = gen_highpart (word_mode, dst);
2544
2545 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2546 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2547 return;
2548 }
2549 }
2550
2551 dst_lo = gen_lowpart (word_mode, dst);
2552 dst_hi = gen_highpart (word_mode, dst);
2553 src_lo = gen_lowpart (word_mode, src);
2554 src_hi = gen_highpart_mode (word_mode, mode, src);
2555
2556 /* At most one pairing may overlap. */
2557 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2558 {
2559 aarch64_emit_move (dst_hi, src_hi);
2560 aarch64_emit_move (dst_lo, src_lo);
2561 }
2562 else
2563 {
2564 aarch64_emit_move (dst_lo, src_lo);
2565 aarch64_emit_move (dst_hi, src_hi);
2566 }
2567 }
2568
2569 bool
2570 aarch64_split_128bit_move_p (rtx dst, rtx src)
2571 {
2572 return (! REG_P (src)
2573 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2574 }
2575
2576 /* Split a complex SIMD combine. */
2577
2578 void
2579 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2580 {
2581 machine_mode src_mode = GET_MODE (src1);
2582 machine_mode dst_mode = GET_MODE (dst);
2583
2584 gcc_assert (VECTOR_MODE_P (dst_mode));
2585 gcc_assert (register_operand (dst, dst_mode)
2586 && register_operand (src1, src_mode)
2587 && register_operand (src2, src_mode));
2588
2589 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2590 return;
2591 }
2592
2593 /* Split a complex SIMD move. */
2594
2595 void
2596 aarch64_split_simd_move (rtx dst, rtx src)
2597 {
2598 machine_mode src_mode = GET_MODE (src);
2599 machine_mode dst_mode = GET_MODE (dst);
2600
2601 gcc_assert (VECTOR_MODE_P (dst_mode));
2602
2603 if (REG_P (dst) && REG_P (src))
2604 {
2605 gcc_assert (VECTOR_MODE_P (src_mode));
2606 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2607 }
2608 }
2609
2610 bool
2611 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2612 machine_mode ymode, rtx y)
2613 {
2614 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2615 gcc_assert (r != NULL);
2616 return rtx_equal_p (x, r);
2617 }
2618
2619 /* Return TARGET if it is nonnull and a register of mode MODE.
2620 Otherwise, return a fresh register of mode MODE if we can,
2621 or TARGET reinterpreted as MODE if we can't. */
2622
2623 static rtx
2624 aarch64_target_reg (rtx target, machine_mode mode)
2625 {
2626 if (target && REG_P (target) && GET_MODE (target) == mode)
2627 return target;
2628 if (!can_create_pseudo_p ())
2629 {
2630 gcc_assert (target);
2631 return gen_lowpart (mode, target);
2632 }
2633 return gen_reg_rtx (mode);
2634 }
2635
2636 /* Return a register that contains the constant in BUILDER, given that
2637 the constant is a legitimate move operand. Use TARGET as the register
2638 if it is nonnull and convenient. */
2639
2640 static rtx
2641 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2642 {
2643 rtx src = builder.build ();
2644 target = aarch64_target_reg (target, GET_MODE (src));
2645 emit_insn (gen_rtx_SET (target, src));
2646 return target;
2647 }
2648
2649 static rtx
2650 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2651 {
2652 if (can_create_pseudo_p ())
2653 return force_reg (mode, value);
2654 else
2655 {
2656 gcc_assert (x);
2657 aarch64_emit_move (x, value);
2658 return x;
2659 }
2660 }
2661
2662 /* Return true if predicate value X is a constant in which every element
2663 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2664 value, i.e. as a predicate in which all bits are significant. */
2665
2666 static bool
2667 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2668 {
2669 if (GET_CODE (x) != CONST_VECTOR)
2670 return false;
2671
2672 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2673 GET_MODE_NUNITS (GET_MODE (x)));
2674 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2675 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2676 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2677
2678 unsigned int nelts = const_vector_encoded_nelts (x);
2679 for (unsigned int i = 0; i < nelts; ++i)
2680 {
2681 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2682 if (!CONST_INT_P (elt))
2683 return false;
2684
2685 builder.quick_push (elt);
2686 for (unsigned int j = 1; j < factor; ++j)
2687 builder.quick_push (const0_rtx);
2688 }
2689 builder.finalize ();
2690 return true;
2691 }
2692
2693 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2694 widest predicate element size it can have (that is, the largest size
2695 for which each element would still be 0 or 1). */
2696
2697 unsigned int
2698 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2699 {
2700 /* Start with the most optimistic assumption: that we only need
2701 one bit per pattern. This is what we will use if only the first
2702 bit in each pattern is ever set. */
2703 unsigned int mask = GET_MODE_SIZE (DImode);
2704 mask |= builder.npatterns ();
2705
2706 /* Look for set bits. */
2707 unsigned int nelts = builder.encoded_nelts ();
2708 for (unsigned int i = 1; i < nelts; ++i)
2709 if (INTVAL (builder.elt (i)) != 0)
2710 {
2711 if (i & 1)
2712 return 1;
2713 mask |= i;
2714 }
2715 return mask & -mask;
2716 }
2717
2718 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2719 that the constant would have with predicate element size ELT_SIZE
2720 (ignoring the upper bits in each element) and return:
2721
2722 * -1 if all bits are set
2723 * N if the predicate has N leading set bits followed by all clear bits
2724 * 0 if the predicate does not have any of these forms. */
2725
2726 int
2727 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2728 unsigned int elt_size)
2729 {
2730 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2731 followed by set bits. */
2732 if (builder.nelts_per_pattern () == 3)
2733 return 0;
2734
2735 /* Skip over leading set bits. */
2736 unsigned int nelts = builder.encoded_nelts ();
2737 unsigned int i = 0;
2738 for (; i < nelts; i += elt_size)
2739 if (INTVAL (builder.elt (i)) == 0)
2740 break;
2741 unsigned int vl = i / elt_size;
2742
2743 /* Check for the all-true case. */
2744 if (i == nelts)
2745 return -1;
2746
2747 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2748 repeating pattern of set bits followed by clear bits. */
2749 if (builder.nelts_per_pattern () != 2)
2750 return 0;
2751
2752 /* We have a "foreground" value and a duplicated "background" value.
2753 If the background might repeat and the last set bit belongs to it,
2754 we might have set bits followed by clear bits followed by set bits. */
2755 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2756 return 0;
2757
2758 /* Make sure that the rest are all clear. */
2759 for (; i < nelts; i += elt_size)
2760 if (INTVAL (builder.elt (i)) != 0)
2761 return 0;
2762
2763 return vl;
2764 }
2765
2766 /* See if there is an svpattern that encodes an SVE predicate of mode
2767 PRED_MODE in which the first VL bits are set and the rest are clear.
2768 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2769 A VL of -1 indicates an all-true vector. */
2770
2771 aarch64_svpattern
2772 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2773 {
2774 if (vl < 0)
2775 return AARCH64_SV_ALL;
2776
2777 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2778 return AARCH64_NUM_SVPATTERNS;
2779
2780 if (vl >= 1 && vl <= 8)
2781 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2782
2783 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2784 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2785
2786 int max_vl;
2787 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2788 {
2789 if (vl == (max_vl / 3) * 3)
2790 return AARCH64_SV_MUL3;
2791 /* These would only trigger for non-power-of-2 lengths. */
2792 if (vl == (max_vl & -4))
2793 return AARCH64_SV_MUL4;
2794 if (vl == (1 << floor_log2 (max_vl)))
2795 return AARCH64_SV_POW2;
2796 if (vl == max_vl)
2797 return AARCH64_SV_ALL;
2798 }
2799 return AARCH64_NUM_SVPATTERNS;
2800 }
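
/* For example, a VL of -1 maps to AARCH64_SV_ALL.  With PRED_MODE ==
   VNx16BImode, a VL of 6 maps to AARCH64_SV_VL6 and a VL of 16 maps to
   AARCH64_SV_VL16, whereas a VL of 9 has no single pattern when the runtime
   vector length is not fixed, so AARCH64_NUM_SVPATTERNS is returned.  */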
2801
2802 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2803 bits has the lowest bit set and the upper bits clear. This is the
2804 VNx16BImode equivalent of a PTRUE for controlling elements of
2805 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2806 all bits are significant, even the upper zeros. */
2807
2808 rtx
2809 aarch64_ptrue_all (unsigned int elt_size)
2810 {
2811 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2812 builder.quick_push (const1_rtx);
2813 for (unsigned int i = 1; i < elt_size; ++i)
2814 builder.quick_push (const0_rtx);
2815 return builder.build ();
2816 }
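
/* For example, aarch64_ptrue_all (2) builds the VNx16BImode constant
   { 1, 0, 1, 0, ... }, i.e. the predicate that selects every .H element,
   while aarch64_ptrue_all (8) builds { 1, 0, 0, 0, 0, 0, 0, 0, ... },
   the predicate that selects every .D element.  */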
2817
2818 /* Return an all-true predicate register of mode MODE. */
2819
2820 rtx
2821 aarch64_ptrue_reg (machine_mode mode)
2822 {
2823 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2824 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2825 return gen_lowpart (mode, reg);
2826 }
2827
2828 /* Return an all-false predicate register of mode MODE. */
2829
2830 rtx
2831 aarch64_pfalse_reg (machine_mode mode)
2832 {
2833 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2834 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2835 return gen_lowpart (mode, reg);
2836 }
2837
2838 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2839 true, or alternatively if we know that the operation predicated by
2840 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2841 aarch64_sve_gp_strictness operand that describes the operation
2842 predicated by PRED1[0]. */
2843
2844 bool
2845 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2846 {
2847 machine_mode mode = GET_MODE (pred2);
2848 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2849 && mode == GET_MODE (pred1[0])
2850 && aarch64_sve_gp_strictness (pred1[1], SImode));
2851 return (pred1[0] == CONSTM1_RTX (mode)
2852 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2853 || rtx_equal_p (pred1[0], pred2));
2854 }
2855
2856 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2857 for it. PRED2[0] is the predicate for the instruction whose result
2858 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2859 for it. Return true if we can prove that the two predicates are
2860 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2861 with PRED1[0] without changing behavior. */
2862
2863 bool
2864 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2865 {
2866 machine_mode mode = GET_MODE (pred1[0]);
2867 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2868 && mode == GET_MODE (pred2[0])
2869 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2870 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2871
2872 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2873 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2874 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2875 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2876 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2877 }
2878
2879 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
2880 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2881 Use TARGET as the target register if nonnull and convenient. */
2882
2883 static rtx
2884 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2885 machine_mode data_mode, rtx op1, rtx op2)
2886 {
2887 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2888 expand_operand ops[5];
2889 create_output_operand (&ops[0], target, pred_mode);
2890 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2891 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2892 create_input_operand (&ops[3], op1, data_mode);
2893 create_input_operand (&ops[4], op2, data_mode);
2894 expand_insn (icode, 5, ops);
2895 return ops[0].value;
2896 }
2897
2898 /* Use a comparison to convert integer vector SRC into MODE, which is
2899 the corresponding SVE predicate mode. Use TARGET for the result
2900 if it's nonnull and convenient. */
2901
2902 static rtx
2903 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2904 {
2905 machine_mode src_mode = GET_MODE (src);
2906 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2907 src, CONST0_RTX (src_mode));
2908 }
2909
2910 /* Return true if we can move VALUE into a register using a single
2911 CNT[BHWD] instruction. */
2912
2913 static bool
2914 aarch64_sve_cnt_immediate_p (poly_int64 value)
2915 {
2916 HOST_WIDE_INT factor = value.coeffs[0];
2917 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2918 return (value.coeffs[1] == factor
2919 && IN_RANGE (factor, 2, 16 * 16)
2920 && (factor & 1) == 0
2921 && factor <= 16 * (factor & -factor));
2922 }
2923
2924 /* Likewise for rtx X. */
2925
2926 bool
2927 aarch64_sve_cnt_immediate_p (rtx x)
2928 {
2929 poly_int64 value;
2930 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2931 }
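
/* Some examples: (16, 16) (the byte count of one SVE vector) is valid and
   corresponds to CNTB; (4, 4) corresponds to CNTW; (6, 6) is also valid,
   being CNTD with MUL #3.  (3, 3) is rejected because it is odd, (34, 34)
   because it would need a multiplier greater than 16, and (2, 1) because its
   two coefficients differ and so it is not a fixed multiple of the vector
   length.  */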
2932
2933 /* Return the asm string for an instruction with a CNT-like vector size
2934 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2935 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2936 first part of the operands template (the part that comes before the
2937 vector size itself). PATTERN is the pattern to use. FACTOR is the
2938 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2939 in each quadword. If it is zero, we can use any element size. */
2940
2941 static char *
2942 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2943 aarch64_svpattern pattern,
2944 unsigned int factor,
2945 unsigned int nelts_per_vq)
2946 {
2947 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2948
2949 if (nelts_per_vq == 0)
2950 /* There is some overlap in the ranges of the four CNT instructions.
2951 Here we always use the smallest possible element size, so that the
2952 multiplier is 1 wherever possible. */
2953 nelts_per_vq = factor & -factor;
2954 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2955 gcc_assert (IN_RANGE (shift, 1, 4));
2956 char suffix = "dwhb"[shift - 1];
2957
2958 factor >>= shift;
2959 unsigned int written;
2960 if (pattern == AARCH64_SV_ALL && factor == 1)
2961 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2962 prefix, suffix, operands);
2963 else if (factor == 1)
2964 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2965 prefix, suffix, operands, svpattern_token (pattern));
2966 else
2967 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2968 prefix, suffix, operands, svpattern_token (pattern),
2969 factor);
2970 gcc_assert (written < sizeof (buffer));
2971 return buffer;
2972 }
2973
2974 /* Return the asm string for an instruction with a CNT-like vector size
2975 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2976 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2977 first part of the operands template (the part that comes before the
2978 vector size itself). X is the value of the vector size operand,
2979 as a polynomial integer rtx; we need to convert this into an "all"
2980 pattern with a multiplier. */
2981
2982 char *
2983 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2984 rtx x)
2985 {
2986 poly_int64 value = rtx_to_poly_int64 (x);
2987 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2988 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2989 value.coeffs[1], 0);
2990 }
2991
2992 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2993
2994 bool
2995 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2996 {
2997 poly_int64 value;
2998 return (poly_int_rtx_p (x, &value)
2999 && (aarch64_sve_cnt_immediate_p (value)
3000 || aarch64_sve_cnt_immediate_p (-value)));
3001 }
3002
3003 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3004 operand 0. */
3005
3006 char *
3007 aarch64_output_sve_scalar_inc_dec (rtx offset)
3008 {
3009 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3010 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3011 if (offset_value.coeffs[1] > 0)
3012 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3013 offset_value.coeffs[1], 0);
3014 else
3015 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3016 -offset_value.coeffs[1], 0);
3017 }
3018
3019 /* Return true if we can add VALUE to a register using a single ADDVL
3020 or ADDPL instruction. */
3021
3022 static bool
3023 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3024 {
3025 HOST_WIDE_INT factor = value.coeffs[0];
3026 if (factor == 0 || value.coeffs[1] != factor)
3027 return false;
3028 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3029 and a value of 16 is one vector width. */
3030 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3031 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3032 }
3033
3034 /* Likewise for rtx X. */
3035
3036 bool
3037 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3038 {
3039 poly_int64 value;
3040 return (poly_int_rtx_p (x, &value)
3041 && aarch64_sve_addvl_addpl_immediate_p (value));
3042 }
3043
3044 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3045 to operand 1 and storing the result in operand 0. */
3046
3047 char *
3048 aarch64_output_sve_addvl_addpl (rtx offset)
3049 {
3050 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3051 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3052 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3053
3054 int factor = offset_value.coeffs[1];
3055 if ((factor & 15) == 0)
3056 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3057 else
3058 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3059 return buffer;
3060 }
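
/* Example outputs: an offset of (16, 16) bytes (one full vector) gives
   "addvl\t%x0, %x1, #1", an offset of (-32, -32) gives
   "addvl\t%x0, %x1, #-2", and an offset of (6, 6) (three predicate widths)
   gives "addpl\t%x0, %x1, #3".  */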
3061
3062 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3063 instruction. If it is, store the number of elements in each vector
3064 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3065 factor in *FACTOR_OUT (if nonnull). */
3066
3067 bool
3068 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3069 unsigned int *nelts_per_vq_out)
3070 {
3071 rtx elt;
3072 poly_int64 value;
3073
3074 if (!const_vec_duplicate_p (x, &elt)
3075 || !poly_int_rtx_p (elt, &value))
3076 return false;
3077
3078 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3079 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3080 /* There's no vector INCB. */
3081 return false;
3082
3083 HOST_WIDE_INT factor = value.coeffs[0];
3084 if (value.coeffs[1] != factor)
3085 return false;
3086
3087 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3088 if ((factor % nelts_per_vq) != 0
3089 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3090 return false;
3091
3092 if (factor_out)
3093 *factor_out = factor;
3094 if (nelts_per_vq_out)
3095 *nelts_per_vq_out = nelts_per_vq;
3096 return true;
3097 }
3098
3099 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3100 instruction. */
3101
3102 bool
3103 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3104 {
3105 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3106 }
3107
3108 /* Return the asm template for an SVE vector INC or DEC instruction.
3109 OPERANDS gives the operands before the vector count and X is the
3110 value of the vector count operand itself. */
3111
3112 char *
3113 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3114 {
3115 int factor;
3116 unsigned int nelts_per_vq;
3117 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3118 gcc_unreachable ();
3119 if (factor < 0)
3120 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3121 -factor, nelts_per_vq);
3122 else
3123 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3124 factor, nelts_per_vq);
3125 }
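
/* Example outputs, assuming OPERANDS is "%0.s" or "%0.h" respectively:
   a VNx4SImode duplicate of (4, 4) (one vector's worth of .S elements)
   gives "incw\t%0.s", while a VNx8HImode duplicate of (-8, -8) gives
   "dech\t%0.h".  */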
3126
3127 static int
3128 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3129 scalar_int_mode mode)
3130 {
3131 int i;
3132 unsigned HOST_WIDE_INT val, val2, mask;
3133 int one_match, zero_match;
3134 int num_insns;
3135
3136 val = INTVAL (imm);
3137
3138 if (aarch64_move_imm (val, mode))
3139 {
3140 if (generate)
3141 emit_insn (gen_rtx_SET (dest, imm));
3142 return 1;
3143 }
3144
3145 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3146 (with XXXX non-zero). In that case check to see if the move can be done in
3147 a smaller mode. */
3148 val2 = val & 0xffffffff;
3149 if (mode == DImode
3150 && aarch64_move_imm (val2, SImode)
3151 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3152 {
3153 if (generate)
3154 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3155
3156 /* Check if we have to emit a second instruction by checking to see
3157 if any of the upper 32 bits of the original DI mode value is set. */
3158 if (val == val2)
3159 return 1;
3160
3161 i = (val >> 48) ? 48 : 32;
3162
3163 if (generate)
3164 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3165 GEN_INT ((val >> i) & 0xffff)));
3166
3167 return 2;
3168 }
3169
3170 if ((val >> 32) == 0 || mode == SImode)
3171 {
3172 if (generate)
3173 {
3174 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3175 if (mode == SImode)
3176 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3177 GEN_INT ((val >> 16) & 0xffff)));
3178 else
3179 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3180 GEN_INT ((val >> 16) & 0xffff)));
3181 }
3182 return 2;
3183 }
3184
3185 /* Remaining cases are all for DImode. */
3186
3187 mask = 0xffff;
3188 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3189 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3190 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3191 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3192
3193 if (zero_match != 2 && one_match != 2)
3194 {
3195 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3196 For a 64-bit bitmask try whether changing 16 bits to all ones or
3197 zeroes creates a valid bitmask. To check any repeated bitmask,
3198 try using 16 bits from the other 32-bit half of val. */
3199
3200 for (i = 0; i < 64; i += 16, mask <<= 16)
3201 {
3202 val2 = val & ~mask;
3203 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3204 break;
3205 val2 = val | mask;
3206 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3207 break;
3208 val2 = val2 & ~mask;
3209 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3210 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3211 break;
3212 }
3213 if (i != 64)
3214 {
3215 if (generate)
3216 {
3217 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3218 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3219 GEN_INT ((val >> i) & 0xffff)));
3220 }
3221 return 2;
3222 }
3223 }
3224
3225 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3226 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3227 otherwise skip zero bits. */
3228
3229 num_insns = 1;
3230 mask = 0xffff;
3231 val2 = one_match > zero_match ? ~val : val;
3232 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3233
3234 if (generate)
3235 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3236 ? (val | ~(mask << i))
3237 : (val & (mask << i)))));
3238 for (i += 16; i < 64; i += 16)
3239 {
3240 if ((val2 & (mask << i)) == 0)
3241 continue;
3242 if (generate)
3243 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3244 GEN_INT ((val >> i) & 0xffff)));
3245 num_insns ++;
3246 }
3247
3248 return num_insns;
3249 }
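
/* A few examples of the DImode costs this computes: 0x1234 takes a single
   MOVZ and 0xffffffffffff1234 a single MOVN, so both return 1; 0x12345678
   has zero upper bits, so it is built with MOVZ #0x5678 followed by
   MOVK #0x1234, LSL #16 and returns 2.  */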
3250
3251 /* Return whether imm is a 128-bit immediate which is simple enough to
3252 expand inline. */
3253 bool
3254 aarch64_mov128_immediate (rtx imm)
3255 {
3256 if (GET_CODE (imm) == CONST_INT)
3257 return true;
3258
3259 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3260
3261 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3262 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3263
3264 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3265 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3266 }
3267
3268
3269 /* Return the number of temporary registers that aarch64_add_offset_1
3270 would need to add OFFSET to a register. */
3271
3272 static unsigned int
3273 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3274 {
3275 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3276 }
3277
3278 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3279 a non-polynomial OFFSET. MODE is the mode of the addition.
3280 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3281 be set and CFA adjustments added to the generated instructions.
3282
3283 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3284 temporary if register allocation is already complete. This temporary
3285 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3286 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3287 the immediate again.
3288
3289 Since this function may be used to adjust the stack pointer, we must
3290 ensure that it cannot cause transient stack deallocation (for example
3291 by first incrementing SP and then decrementing when adjusting by a
3292 large immediate). */
3293
3294 static void
3295 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3296 rtx src, HOST_WIDE_INT offset, rtx temp1,
3297 bool frame_related_p, bool emit_move_imm)
3298 {
3299 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3300 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3301
3302 HOST_WIDE_INT moffset = abs_hwi (offset);
3303 rtx_insn *insn;
3304
3305 if (!moffset)
3306 {
3307 if (!rtx_equal_p (dest, src))
3308 {
3309 insn = emit_insn (gen_rtx_SET (dest, src));
3310 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3311 }
3312 return;
3313 }
3314
3315 /* Single instruction adjustment. */
3316 if (aarch64_uimm12_shift (moffset))
3317 {
3318 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3319 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3320 return;
3321 }
3322
3323 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3324 and either:
3325
3326 a) the offset cannot be loaded by a 16-bit move or
3327 b) there is no spare register into which we can move it. */
3328 if (moffset < 0x1000000
3329 && ((!temp1 && !can_create_pseudo_p ())
3330 || !aarch64_move_imm (moffset, mode)))
3331 {
3332 HOST_WIDE_INT low_off = moffset & 0xfff;
3333
3334 low_off = offset < 0 ? -low_off : low_off;
3335 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3336 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3337 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3338 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3339 return;
3340 }
3341
3342 /* Emit a move immediate if required and an addition/subtraction. */
3343 if (emit_move_imm)
3344 {
3345 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3346 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3347 }
3348 insn = emit_insn (offset < 0
3349 ? gen_sub3_insn (dest, src, temp1)
3350 : gen_add3_insn (dest, src, temp1));
3351 if (frame_related_p)
3352 {
3353 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3354 rtx adj = plus_constant (mode, src, offset);
3355 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3356 }
3357 }
3358
3359 /* Return the number of temporary registers that aarch64_add_offset
3360 would need to move OFFSET into a register or add OFFSET to a register;
3361 ADD_P is true if we want the latter rather than the former. */
3362
3363 static unsigned int
3364 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3365 {
3366 /* This follows the same structure as aarch64_add_offset. */
3367 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3368 return 0;
3369
3370 unsigned int count = 0;
3371 HOST_WIDE_INT factor = offset.coeffs[1];
3372 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3373 poly_int64 poly_offset (factor, factor);
3374 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3375 /* Need one register for the ADDVL/ADDPL result. */
3376 count += 1;
3377 else if (factor != 0)
3378 {
3379 factor = abs (factor);
3380 if (factor > 16 * (factor & -factor))
3381 /* Need one register for the CNT result and one for the multiplication
3382 factor. If necessary, the second temporary can be reused for the
3383 constant part of the offset. */
3384 return 2;
3385 /* Need one register for the CNT result (which might then
3386 be shifted). */
3387 count += 1;
3388 }
3389 return count + aarch64_add_offset_1_temporaries (constant);
3390 }
3391
3392 /* If X can be represented as a poly_int64, return the number
3393 of temporaries that are required to add it to a register.
3394 Return -1 otherwise. */
3395
3396 int
3397 aarch64_add_offset_temporaries (rtx x)
3398 {
3399 poly_int64 offset;
3400 if (!poly_int_rtx_p (x, &offset))
3401 return -1;
3402 return aarch64_offset_temporaries (true, offset);
3403 }
3404
3405 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3406 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3407 be set and CFA adjustments added to the generated instructions.
3408
3409 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3410 temporary if register allocation is already complete. This temporary
3411 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3412 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3413 false to avoid emitting the immediate again.
3414
3415 TEMP2, if nonnull, is a second temporary register that doesn't
3416 overlap either DEST or SRC.
3417
3418 Since this function may be used to adjust the stack pointer, we must
3419 ensure that it cannot cause transient stack deallocation (for example
3420 by first incrementing SP and then decrementing when adjusting by a
3421 large immediate). */
3422
3423 static void
3424 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3425 poly_int64 offset, rtx temp1, rtx temp2,
3426 bool frame_related_p, bool emit_move_imm = true)
3427 {
3428 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3429 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3430 gcc_assert (temp1 == NULL_RTX
3431 || !frame_related_p
3432 || !reg_overlap_mentioned_p (temp1, dest));
3433 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3434
3435 /* Try using ADDVL or ADDPL to add the whole value. */
3436 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3437 {
3438 rtx offset_rtx = gen_int_mode (offset, mode);
3439 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3440 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3441 return;
3442 }
3443
3444 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3445 SVE vector register, over and above the minimum size of 128 bits.
3446 This is equivalent to half the value returned by CNTD with a
3447 vector shape of ALL. */
3448 HOST_WIDE_INT factor = offset.coeffs[1];
3449 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3450
3451 /* Try using ADDVL or ADDPL to add the VG-based part. */
3452 poly_int64 poly_offset (factor, factor);
3453 if (src != const0_rtx
3454 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3455 {
3456 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3457 if (frame_related_p)
3458 {
3459 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3460 RTX_FRAME_RELATED_P (insn) = true;
3461 src = dest;
3462 }
3463 else
3464 {
3465 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3466 src = aarch64_force_temporary (mode, temp1, addr);
3467 temp1 = temp2;
3468 temp2 = NULL_RTX;
3469 }
3470 }
3471 /* Otherwise use a CNT-based sequence. */
3472 else if (factor != 0)
3473 {
3474 /* Use a subtraction if we have a negative factor. */
3475 rtx_code code = PLUS;
3476 if (factor < 0)
3477 {
3478 factor = -factor;
3479 code = MINUS;
3480 }
3481
3482 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3483 into the multiplication. */
3484 rtx val;
3485 int shift = 0;
3486 if (factor & 1)
3487 /* Use a right shift by 1. */
3488 shift = -1;
3489 else
3490 factor /= 2;
3491 HOST_WIDE_INT low_bit = factor & -factor;
3492 if (factor <= 16 * low_bit)
3493 {
3494 if (factor > 16 * 8)
3495 {
3496 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3497 the value with the minimum multiplier and shift it into
3498 position. */
3499 int extra_shift = exact_log2 (low_bit);
3500 shift += extra_shift;
3501 factor >>= extra_shift;
3502 }
3503 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3504 }
3505 else
3506 {
3507 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3508 directly, since that should increase the chances of being
3509 able to use a shift and add sequence. If LOW_BIT itself
3510 is out of range, just use CNTD. */
3511 if (low_bit <= 16 * 8)
3512 factor /= low_bit;
3513 else
3514 low_bit = 1;
3515
3516 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3517 val = aarch64_force_temporary (mode, temp1, val);
3518
3519 if (can_create_pseudo_p ())
3520 {
3521 rtx coeff1 = gen_int_mode (factor, mode);
3522 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3523 }
3524 else
3525 {
3526 /* Go back to using a negative multiplication factor if we have
3527 no register from which to subtract. */
3528 if (code == MINUS && src == const0_rtx)
3529 {
3530 factor = -factor;
3531 code = PLUS;
3532 }
3533 rtx coeff1 = gen_int_mode (factor, mode);
3534 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3535 val = gen_rtx_MULT (mode, val, coeff1);
3536 }
3537 }
3538
3539 if (shift > 0)
3540 {
3541 /* Multiply by 1 << SHIFT. */
3542 val = aarch64_force_temporary (mode, temp1, val);
3543 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3544 }
3545 else if (shift == -1)
3546 {
3547 /* Divide by 2. */
3548 val = aarch64_force_temporary (mode, temp1, val);
3549 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3550 }
3551
3552 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3553 if (src != const0_rtx)
3554 {
3555 val = aarch64_force_temporary (mode, temp1, val);
3556 val = gen_rtx_fmt_ee (code, mode, src, val);
3557 }
3558 else if (code == MINUS)
3559 {
3560 val = aarch64_force_temporary (mode, temp1, val);
3561 val = gen_rtx_NEG (mode, val);
3562 }
3563
3564 if (constant == 0 || frame_related_p)
3565 {
3566 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3567 if (frame_related_p)
3568 {
3569 RTX_FRAME_RELATED_P (insn) = true;
3570 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3571 gen_rtx_SET (dest, plus_constant (Pmode, src,
3572 poly_offset)));
3573 }
3574 src = dest;
3575 if (constant == 0)
3576 return;
3577 }
3578 else
3579 {
3580 src = aarch64_force_temporary (mode, temp1, val);
3581 temp1 = temp2;
3582 temp2 = NULL_RTX;
3583 }
3584
3585 emit_move_imm = true;
3586 }
3587
3588 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3589 frame_related_p, emit_move_imm);
3590 }
3591
3592 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3593 than a poly_int64. */
3594
3595 void
3596 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3597 rtx offset_rtx, rtx temp1, rtx temp2)
3598 {
3599 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3600 temp1, temp2, false);
3601 }
3602
3603 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3604 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3605 if TEMP1 already contains abs (DELTA). */
3606
3607 static inline void
3608 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3609 {
3610 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3611 temp1, temp2, true, emit_move_imm);
3612 }
3613
3614 /* Subtract DELTA from the stack pointer, marking the instructions
3615 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3616 if nonnull. */
3617
3618 static inline void
3619 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3620 bool emit_move_imm = true)
3621 {
3622 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3623 temp1, temp2, frame_related_p, emit_move_imm);
3624 }
3625
3626 /* Set DEST to (vec_series BASE STEP). */
3627
3628 static void
3629 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3630 {
3631 machine_mode mode = GET_MODE (dest);
3632 scalar_mode inner = GET_MODE_INNER (mode);
3633
3634 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3635 if (!aarch64_sve_index_immediate_p (base))
3636 base = force_reg (inner, base);
3637 if (!aarch64_sve_index_immediate_p (step))
3638 step = force_reg (inner, step);
3639
3640 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3641 }
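/* For example (an editor's sketch, not part of the original sources):
   calling aarch64_expand_vec_series with BASE == 0 and STEP == 1
   creates a VEC_SERIES rtx whose operands are within the immediate
   range above, which the SVE move patterns would typically emit as an
   INDEX instruction such as "index z0.s, #0, #1"; values outside
   [-16, 15] are forced into scalar registers first and use the
   register forms of INDEX instead.  */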
3642
3643 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3644 register of mode MODE. Use TARGET for the result if it's nonnull
3645 and convenient.
3646
3647 The two vector modes must have the same element mode. The behavior
3648 is to duplicate architectural lane N of SRC into architectural lanes
3649 N + I * STEP of the result. On big-endian targets, architectural
3650 lane 0 of an Advanced SIMD vector is the last element of the vector
3651 in memory layout, so for big-endian targets this operation has the
3652 effect of reversing SRC before duplicating it. Callers need to
3653 account for this. */
3654
3655 rtx
3656 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3657 {
3658 machine_mode src_mode = GET_MODE (src);
3659 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3660 insn_code icode = (BYTES_BIG_ENDIAN
3661 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3662 : code_for_aarch64_vec_duplicate_vq_le (mode));
3663
3664 unsigned int i = 0;
3665 expand_operand ops[3];
3666 create_output_operand (&ops[i++], target, mode);
3667   create_input_operand (&ops[i++], src, src_mode);
3668 if (BYTES_BIG_ENDIAN)
3669 {
3670 /* Create a PARALLEL describing the reversal of SRC. */
3671 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3672 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3673 nelts_per_vq - 1, -1);
3674 create_fixed_operand (&ops[i++], sel);
3675 }
3676 expand_insn (icode, i, ops);
3677 return ops[0].value;
3678 }
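/* Editor's illustration: for MODE == VNx4SImode (.S elements),
   NELTS_PER_VQ above is 128 / 32 == 4, so on big-endian targets the
   extra selector operand is the stepped parallel { 3, 2, 1, 0 },
   i.e. the quadword is reversed before being duplicated, as described
   in the function comment.  */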
3679
3680 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3681 the memory image into DEST. Return true on success. */
3682
3683 static bool
3684 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3685 {
3686 src = force_const_mem (GET_MODE (src), src);
3687 if (!src)
3688 return false;
3689
3690 /* Make sure that the address is legitimate. */
3691 if (!aarch64_sve_ld1rq_operand_p (src))
3692 {
3693 rtx addr = force_reg (Pmode, XEXP (src, 0));
3694 src = replace_equiv_address (src, addr);
3695 }
3696
3697 machine_mode mode = GET_MODE (dest);
3698 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3699 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3700 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3701 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3702 return true;
3703 }
3704
3705 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3706 SVE data mode and isn't a legitimate constant. Use TARGET for the
3707 result if convenient.
3708
3709 The returned register can have whatever mode seems most natural
3710 given the contents of SRC. */
3711
3712 static rtx
3713 aarch64_expand_sve_const_vector (rtx target, rtx src)
3714 {
3715 machine_mode mode = GET_MODE (src);
3716 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3717 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3718 scalar_mode elt_mode = GET_MODE_INNER (mode);
3719 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3720 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3721
3722 if (nelts_per_pattern == 1 && encoded_bits == 128)
3723 {
3724 /* The constant is a duplicated quadword but can't be narrowed
3725 beyond a quadword. Get the memory image of the first quadword
3726 as a 128-bit vector and try using LD1RQ to load it from memory.
3727
3728 The effect for both endiannesses is to load memory lane N into
3729 architectural lanes N + I * STEP of the result. On big-endian
3730 targets, the layout of the 128-bit vector in an Advanced SIMD
3731 register would be different from its layout in an SVE register,
3732 but this 128-bit vector is a memory value only. */
3733 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3734 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3735 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3736 return target;
3737 }
3738
3739 if (nelts_per_pattern == 1 && encoded_bits < 128)
3740 {
3741 /* The vector is a repeating sequence of 64 bits or fewer.
3742 See if we can load them using an Advanced SIMD move and then
3743 duplicate it to fill a vector. This is better than using a GPR
3744 move because it keeps everything in the same register file. */
3745 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3746 rtx_vector_builder builder (vq_mode, npatterns, 1);
3747 for (unsigned int i = 0; i < npatterns; ++i)
3748 {
3749 /* We want memory lane N to go into architectural lane N,
3750 so reverse for big-endian targets. The DUP .Q pattern
3751 has a compensating reverse built-in. */
3752 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3753 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3754 }
3755 rtx vq_src = builder.build ();
3756 if (aarch64_simd_valid_immediate (vq_src, NULL))
3757 {
3758 vq_src = force_reg (vq_mode, vq_src);
3759 return aarch64_expand_sve_dupq (target, mode, vq_src);
3760 }
3761
3762 /* Get an integer representation of the repeating part of Advanced
3763 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3764 which for big-endian targets is lane-swapped wrt a normal
3765 Advanced SIMD vector. This means that for both endiannesses,
3766 memory lane N of SVE vector SRC corresponds to architectural
3767 lane N of a register holding VQ_SRC. This in turn means that
3768 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3769 as a single 128-bit value) and thus that memory lane 0 of SRC is
3770 in the lsb of the integer. Duplicating the integer therefore
3771 ensures that memory lane N of SRC goes into architectural lane
3772 	 N + I * STEP of the SVE register.  */
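      /* Editor's example of the path below: four single-element QImode
	 patterns give ENCODED_BITS == 32, so INT_MODE is SImode and MODE
	 becomes the SVE mode with .S elements; if the resulting 32-bit
	 constant is a single-instruction move immediate it is loaded into
	 a general register and broadcast, duplicating the original 4-byte
	 sequence across the whole vector.  */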
3773 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3774 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3775 if (elt_value)
3776 {
3777 /* Pretend that we had a vector of INT_MODE to start with. */
3778 elt_mode = int_mode;
3779 mode = aarch64_full_sve_mode (int_mode).require ();
3780
3781 /* If the integer can be moved into a general register by a
3782 single instruction, do that and duplicate the result. */
3783 if (CONST_INT_P (elt_value)
3784 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3785 {
3786 elt_value = force_reg (elt_mode, elt_value);
3787 return expand_vector_broadcast (mode, elt_value);
3788 }
3789 }
3790 else if (npatterns == 1)
3791 /* We're duplicating a single value, but can't do better than
3792 force it to memory and load from there. This handles things
3793 like symbolic constants. */
3794 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3795
3796 if (elt_value)
3797 {
3798 /* Load the element from memory if we can, otherwise move it into
3799 a register and use a DUP. */
3800 rtx op = force_const_mem (elt_mode, elt_value);
3801 if (!op)
3802 op = force_reg (elt_mode, elt_value);
3803 return expand_vector_broadcast (mode, op);
3804 }
3805 }
3806
3807 /* Try using INDEX. */
3808 rtx base, step;
3809 if (const_vec_series_p (src, &base, &step))
3810 {
3811 aarch64_expand_vec_series (target, base, step);
3812 return target;
3813 }
3814
3815 /* From here on, it's better to force the whole constant to memory
3816 if we can. */
3817 if (GET_MODE_NUNITS (mode).is_constant ())
3818 return NULL_RTX;
3819
3820 /* Expand each pattern individually. */
3821 gcc_assert (npatterns > 1);
3822 rtx_vector_builder builder;
3823 auto_vec<rtx, 16> vectors (npatterns);
3824 for (unsigned int i = 0; i < npatterns; ++i)
3825 {
3826 builder.new_vector (mode, 1, nelts_per_pattern);
3827 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3828 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3829 vectors.quick_push (force_reg (mode, builder.build ()));
3830 }
3831
3832 /* Use permutes to interleave the separate vectors. */
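  /* Editor's illustration: with NPATTERNS == 4 and inputs V0..V3, the
     first pass produces ZIP1 (V0, V2) and ZIP1 (V1, V3), and the second
     pass combines those with a final ZIP1 into TARGET, so that the
     elements conceptually come back in the order V0[0], V1[0], V2[0],
     V3[0], V0[1], ...  */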
3833 while (npatterns > 1)
3834 {
3835 npatterns /= 2;
3836 for (unsigned int i = 0; i < npatterns; ++i)
3837 {
3838 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3839 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3840 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3841 vectors[i] = tmp;
3842 }
3843 }
3844 gcc_assert (vectors[0] == target);
3845 return target;
3846 }
3847
3848 /* Use WHILE to set a predicate register of mode MODE in which the first
3849 VL bits are set and the rest are clear. Use TARGET for the register
3850 if it's nonnull and convenient. */
3851
3852 static rtx
3853 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3854 unsigned int vl)
3855 {
3856 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3857 target = aarch64_target_reg (target, mode);
3858 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3859 return target;
3860 }
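/* For instance (editor's sketch): a VNx8BImode constant with only its
   first three .H elements set can be loaded by passing VL == 3, which
   would typically assemble to something like
     mov	x0, 3
     whilelo	p0.h, xzr, x0
   with whatever registers the allocator chooses.  */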
3861
3862 static rtx
3863 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3864
3865 /* BUILDER is a constant predicate in which the index of every set bit
3866 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3867 by inverting every element at a multiple of ELT_SIZE and EORing the
3868 result with an ELT_SIZE PTRUE.
3869
3870 Return a register that contains the constant on success, otherwise
3871 return null. Use TARGET as the register if it is nonnull and
3872 convenient. */
3873
3874 static rtx
3875 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3876 unsigned int elt_size)
3877 {
3878 /* Invert every element at a multiple of ELT_SIZE, keeping the
3879 other bits zero. */
3880 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3881 builder.nelts_per_pattern ());
3882 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3883 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3884 inv_builder.quick_push (const1_rtx);
3885 else
3886 inv_builder.quick_push (const0_rtx);
3887 inv_builder.finalize ();
3888
3889 /* See if we can load the constant cheaply. */
3890 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3891 if (!inv)
3892 return NULL_RTX;
3893
3894 /* EOR the result with an ELT_SIZE PTRUE. */
3895 rtx mask = aarch64_ptrue_all (elt_size);
3896 mask = force_reg (VNx16BImode, mask);
3897 target = aarch64_target_reg (target, VNx16BImode);
3898 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3899 return target;
3900 }
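/* Editor's illustration of the EOR approach above: for the .H predicate
   in which every halfword element except the first is set, the inverted
   constant has only element 0 set and can be loaded with a single
   PTRUE (pattern VL1); EORing that with a .H PTRUE ALL under the same
   mask recreates the original constant.  */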
3901
3902 /* BUILDER is a constant predicate in which the index of every set bit
3903 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3904 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3905 register on success, otherwise return null. Use TARGET as the register
3906 if nonnull and convenient. */
3907
3908 static rtx
3909 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3910 unsigned int elt_size,
3911 unsigned int permute_size)
3912 {
3913 /* We're going to split the constant into two new constants A and B,
3914 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3915 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3916
3917 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3918 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3919
3920 where _ indicates elements that will be discarded by the permute.
3921
3922 First calculate the ELT_SIZEs for A and B. */
3923 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3924 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3925 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3926 if (INTVAL (builder.elt (i)) != 0)
3927 {
3928 if (i & permute_size)
3929 b_elt_size |= i - permute_size;
3930 else
3931 a_elt_size |= i;
3932 }
3933 a_elt_size &= -a_elt_size;
3934 b_elt_size &= -b_elt_size;
3935
3936 /* Now construct the vectors themselves. */
3937 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3938 builder.nelts_per_pattern ());
3939 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3940 builder.nelts_per_pattern ());
3941 unsigned int nelts = builder.encoded_nelts ();
3942 for (unsigned int i = 0; i < nelts; ++i)
3943 if (i & (elt_size - 1))
3944 {
3945 a_builder.quick_push (const0_rtx);
3946 b_builder.quick_push (const0_rtx);
3947 }
3948 else if ((i & permute_size) == 0)
3949 {
3950 /* The A and B elements are significant. */
3951 a_builder.quick_push (builder.elt (i));
3952 b_builder.quick_push (builder.elt (i + permute_size));
3953 }
3954 else
3955 {
3956 /* The A and B elements are going to be discarded, so pick whatever
3957 is likely to give a nice constant. We are targeting element
3958 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3959 with the aim of each being a sequence of ones followed by
3960 a sequence of zeros. So:
3961
3962 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3963 duplicate the last X_ELT_SIZE element, to extend the
3964 current sequence of ones or zeros.
3965
3966 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3967 zero, so that the constant really does have X_ELT_SIZE and
3968 not a smaller size. */
3969 if (a_elt_size > permute_size)
3970 a_builder.quick_push (const0_rtx);
3971 else
3972 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3973 if (b_elt_size > permute_size)
3974 b_builder.quick_push (const0_rtx);
3975 else
3976 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3977 }
3978 a_builder.finalize ();
3979 b_builder.finalize ();
3980
3981 /* Try loading A into a register. */
3982 rtx_insn *last = get_last_insn ();
3983 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3984 if (!a)
3985 return NULL_RTX;
3986
3987 /* Try loading B into a register. */
3988 rtx b = a;
3989 if (a_builder != b_builder)
3990 {
3991 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3992 if (!b)
3993 {
3994 delete_insns_since (last);
3995 return NULL_RTX;
3996 }
3997 }
3998
3999 /* Emit the TRN1 itself. */
4000 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4001 target = aarch64_target_reg (target, mode);
4002 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4003 gen_lowpart (mode, a),
4004 gen_lowpart (mode, b)));
4005 return target;
4006 }
4007
4008 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4009 constant in BUILDER into an SVE predicate register. Return the register
4010 on success, otherwise return null. Use TARGET for the register if
4011 nonnull and convenient.
4012
4013 ALLOW_RECURSE_P is true if we can use methods that would call this
4014 function recursively. */
4015
4016 static rtx
4017 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4018 bool allow_recurse_p)
4019 {
4020 if (builder.encoded_nelts () == 1)
4021 /* A PFALSE or a PTRUE .B ALL. */
4022 return aarch64_emit_set_immediate (target, builder);
4023
4024 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4025 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4026 {
4027 /* If we can load the constant using PTRUE, use it as-is. */
4028 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4029 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4030 return aarch64_emit_set_immediate (target, builder);
4031
4032 /* Otherwise use WHILE to set the first VL bits. */
4033 return aarch64_sve_move_pred_via_while (target, mode, vl);
4034 }
4035
4036 if (!allow_recurse_p)
4037 return NULL_RTX;
4038
4039 /* Try inverting the vector in element size ELT_SIZE and then EORing
4040 the result with an ELT_SIZE PTRUE. */
4041 if (INTVAL (builder.elt (0)) == 0)
4042 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4043 elt_size))
4044 return res;
4045
4046 /* Try using TRN1 to permute two simpler constants. */
4047 for (unsigned int i = elt_size; i <= 8; i *= 2)
4048 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4049 elt_size, i))
4050 return res;
4051
4052 return NULL_RTX;
4053 }
4054
4055 /* Return an SVE predicate register that contains the VNx16BImode
4056 constant in BUILDER, without going through the move expanders.
4057
4058 The returned register can have whatever mode seems most natural
4059 given the contents of BUILDER. Use TARGET for the result if
4060 convenient. */
4061
4062 static rtx
4063 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4064 {
4065 /* Try loading the constant using pure predicate operations. */
4066 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4067 return res;
4068
4069 /* Try forcing the constant to memory. */
4070 if (builder.full_nelts ().is_constant ())
4071 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4072 {
4073 target = aarch64_target_reg (target, VNx16BImode);
4074 emit_move_insn (target, mem);
4075 return target;
4076 }
4077
4078 /* The last resort is to load the constant as an integer and then
4079 compare it against zero. Use -1 for set bits in order to increase
4080      the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
4081 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4082 builder.nelts_per_pattern ());
4083 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4084 int_builder.quick_push (INTVAL (builder.elt (i))
4085 ? constm1_rtx : const0_rtx);
4086 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4087 int_builder.build ());
4088 }
4089
4090 /* Set DEST to immediate IMM. */
4091
4092 void
4093 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4094 {
4095 machine_mode mode = GET_MODE (dest);
4096
4097 /* Check on what type of symbol it is. */
4098 scalar_int_mode int_mode;
4099 if ((GET_CODE (imm) == SYMBOL_REF
4100 || GET_CODE (imm) == LABEL_REF
4101 || GET_CODE (imm) == CONST
4102 || GET_CODE (imm) == CONST_POLY_INT)
4103 && is_a <scalar_int_mode> (mode, &int_mode))
4104 {
4105 rtx mem;
4106 poly_int64 offset;
4107 HOST_WIDE_INT const_offset;
4108 enum aarch64_symbol_type sty;
4109
4110 /* If we have (const (plus symbol offset)), separate out the offset
4111 before we start classifying the symbol. */
4112 rtx base = strip_offset (imm, &offset);
4113
4114 /* We must always add an offset involving VL separately, rather than
4115 folding it into the relocation. */
4116 if (!offset.is_constant (&const_offset))
4117 {
4118 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4119 emit_insn (gen_rtx_SET (dest, imm));
4120 else
4121 {
4122 /* Do arithmetic on 32-bit values if the result is smaller
4123 than that. */
4124 if (partial_subreg_p (int_mode, SImode))
4125 {
4126 /* It is invalid to do symbol calculations in modes
4127 narrower than SImode. */
4128 gcc_assert (base == const0_rtx);
4129 dest = gen_lowpart (SImode, dest);
4130 int_mode = SImode;
4131 }
4132 if (base != const0_rtx)
4133 {
4134 base = aarch64_force_temporary (int_mode, dest, base);
4135 aarch64_add_offset (int_mode, dest, base, offset,
4136 NULL_RTX, NULL_RTX, false);
4137 }
4138 else
4139 aarch64_add_offset (int_mode, dest, base, offset,
4140 dest, NULL_RTX, false);
4141 }
4142 return;
4143 }
4144
4145 sty = aarch64_classify_symbol (base, const_offset);
4146 switch (sty)
4147 {
4148 case SYMBOL_FORCE_TO_MEM:
4149 if (const_offset != 0
4150 && targetm.cannot_force_const_mem (int_mode, imm))
4151 {
4152 gcc_assert (can_create_pseudo_p ());
4153 base = aarch64_force_temporary (int_mode, dest, base);
4154 aarch64_add_offset (int_mode, dest, base, const_offset,
4155 NULL_RTX, NULL_RTX, false);
4156 return;
4157 }
4158
4159 mem = force_const_mem (ptr_mode, imm);
4160 gcc_assert (mem);
4161
4162 /* If we aren't generating PC relative literals, then
4163 we need to expand the literal pool access carefully.
4164 This is something that needs to be done in a number
4165 of places, so could well live as a separate function. */
4166 if (!aarch64_pcrelative_literal_loads)
4167 {
4168 gcc_assert (can_create_pseudo_p ());
4169 base = gen_reg_rtx (ptr_mode);
4170 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4171 if (ptr_mode != Pmode)
4172 base = convert_memory_address (Pmode, base);
4173 mem = gen_rtx_MEM (ptr_mode, base);
4174 }
4175
4176 if (int_mode != ptr_mode)
4177 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4178
4179 emit_insn (gen_rtx_SET (dest, mem));
4180
4181 return;
4182
4183 case SYMBOL_SMALL_TLSGD:
4184 case SYMBOL_SMALL_TLSDESC:
4185 case SYMBOL_SMALL_TLSIE:
4186 case SYMBOL_SMALL_GOT_28K:
4187 case SYMBOL_SMALL_GOT_4G:
4188 case SYMBOL_TINY_GOT:
4189 case SYMBOL_TINY_TLSIE:
4190 if (const_offset != 0)
4191 {
4192 	    gcc_assert (can_create_pseudo_p ());
4193 base = aarch64_force_temporary (int_mode, dest, base);
4194 aarch64_add_offset (int_mode, dest, base, const_offset,
4195 NULL_RTX, NULL_RTX, false);
4196 return;
4197 }
4198 /* FALLTHRU */
4199
4200 case SYMBOL_SMALL_ABSOLUTE:
4201 case SYMBOL_TINY_ABSOLUTE:
4202 case SYMBOL_TLSLE12:
4203 case SYMBOL_TLSLE24:
4204 case SYMBOL_TLSLE32:
4205 case SYMBOL_TLSLE48:
4206 aarch64_load_symref_appropriately (dest, imm, sty);
4207 return;
4208
4209 default:
4210 gcc_unreachable ();
4211 }
4212 }
4213
4214 if (!CONST_INT_P (imm))
4215 {
4216 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4217 {
4218 /* Only the low bit of each .H, .S and .D element is defined,
4219 so we can set the upper bits to whatever we like. If the
4220 predicate is all-true in MODE, prefer to set all the undefined
4221 bits as well, so that we can share a single .B predicate for
4222 all modes. */
4223 if (imm == CONSTM1_RTX (mode))
4224 imm = CONSTM1_RTX (VNx16BImode);
4225
4226 /* All methods for constructing predicate modes wider than VNx16BI
4227 will set the upper bits of each element to zero. Expose this
4228 by moving such constants as a VNx16BI, so that all bits are
4229 significant and so that constants for different modes can be
4230 shared. The wider constant will still be available as a
4231 REG_EQUAL note. */
4232 rtx_vector_builder builder;
4233 if (aarch64_get_sve_pred_bits (builder, imm))
4234 {
4235 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4236 if (dest != res)
4237 emit_move_insn (dest, gen_lowpart (mode, res));
4238 return;
4239 }
4240 }
4241
4242 if (GET_CODE (imm) == HIGH
4243 || aarch64_simd_valid_immediate (imm, NULL))
4244 {
4245 emit_insn (gen_rtx_SET (dest, imm));
4246 return;
4247 }
4248
4249 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4250 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4251 {
4252 if (dest != res)
4253 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4254 return;
4255 }
4256
4257 rtx mem = force_const_mem (mode, imm);
4258 gcc_assert (mem);
4259 emit_move_insn (dest, mem);
4260 return;
4261 }
4262
4263 aarch64_internal_mov_immediate (dest, imm, true,
4264 as_a <scalar_int_mode> (mode));
4265 }
4266
4267 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4268 that is known to contain PTRUE. */
4269
4270 void
4271 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4272 {
4273 expand_operand ops[3];
4274 machine_mode mode = GET_MODE (dest);
4275 create_output_operand (&ops[0], dest, mode);
4276   create_input_operand (&ops[1], pred, GET_MODE (pred));
4277 create_input_operand (&ops[2], src, mode);
4278 temporary_volatile_ok v (true);
4279 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4280 }
4281
4282 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4283 operand is in memory. In this case we need to use the predicated LD1
4284 and ST1 instead of LDR and STR, both for correctness on big-endian
4285 targets and because LD1 and ST1 support a wider range of addressing modes.
4286 PRED_MODE is the mode of the predicate.
4287
4288 See the comment at the head of aarch64-sve.md for details about the
4289 big-endian handling. */
4290
4291 void
4292 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4293 {
4294 machine_mode mode = GET_MODE (dest);
4295 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4296 if (!register_operand (src, mode)
4297 && !register_operand (dest, mode))
4298 {
4299 rtx tmp = gen_reg_rtx (mode);
4300 if (MEM_P (src))
4301 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4302 else
4303 emit_move_insn (tmp, src);
4304 src = tmp;
4305 }
4306 aarch64_emit_sve_pred_move (dest, ptrue, src);
4307 }
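/* Editor's sketch of the effect: a memory-to-memory copy of a
   VNx2DImode value goes through a temporary register and, once the
   predicated move patterns are matched, would typically assemble to
   something like
     ptrue	p0.d
     ld1d	z0.d, p0/z, [x1]
     st1d	z0.d, p0, [x0]
   (register numbers are for illustration only).  */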
4308
4309 /* Called only on big-endian targets. See whether an SVE vector move
4310 from SRC to DEST is effectively a REV[BHW] instruction, because at
4311 least one operand is a subreg of an SVE vector that has wider or
4312 narrower elements. Return true and emit the instruction if so.
4313
4314 For example:
4315
4316 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4317
4318 represents a VIEW_CONVERT between the following vectors, viewed
4319 in memory order:
4320
4321 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4322 R1: { [0], [1], [2], [3], ... }
4323
4324 The high part of lane X in R2 should therefore correspond to lane X*2
4325 of R1, but the register representations are:
4326
4327 msb lsb
4328 R2: ...... [1].high [1].low [0].high [0].low
4329 R1: ...... [3] [2] [1] [0]
4330
4331 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4332 We therefore need a reverse operation to swap the high and low values
4333 around.
4334
4335 This is purely an optimization. Without it we would spill the
4336 subreg operand to the stack in one mode and reload it in the
4337 other mode, which has the same effect as the REV. */
4338
4339 bool
4340 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4341 {
4342 gcc_assert (BYTES_BIG_ENDIAN);
4343 if (GET_CODE (dest) == SUBREG)
4344 dest = SUBREG_REG (dest);
4345 if (GET_CODE (src) == SUBREG)
4346 src = SUBREG_REG (src);
4347
4348 /* The optimization handles two single SVE REGs with different element
4349 sizes. */
4350 if (!REG_P (dest)
4351 || !REG_P (src)
4352 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4353 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4354 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4355 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4356 return false;
4357
4358 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4359 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4360 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4361 UNSPEC_REV_SUBREG);
4362 emit_insn (gen_rtx_SET (dest, unspec));
4363 return true;
4364 }
4365
4366 /* Return a copy of X with mode MODE, without changing its other
4367 attributes. Unlike gen_lowpart, this doesn't care whether the
4368 mode change is valid. */
4369
4370 static rtx
4371 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4372 {
4373 if (GET_MODE (x) == mode)
4374 return x;
4375
4376 x = shallow_copy_rtx (x);
4377 set_mode_and_regno (x, mode, REGNO (x));
4378 return x;
4379 }
4380
4381 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4382 stored in wider integer containers. */
4383
4384 static unsigned int
4385 aarch64_sve_rev_unspec (machine_mode mode)
4386 {
4387 switch (GET_MODE_UNIT_SIZE (mode))
4388 {
4389 case 1: return UNSPEC_REVB;
4390 case 2: return UNSPEC_REVH;
4391 case 4: return UNSPEC_REVW;
4392 }
4393 gcc_unreachable ();
4394 }
4395
4396 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4397 operands. */
4398
4399 void
4400 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4401 {
4402 /* Decide which REV operation we need. The mode with wider elements
4403 determines the mode of the operands and the mode with the narrower
4404 elements determines the reverse width. */
4405 machine_mode mode_with_wider_elts = GET_MODE (dest);
4406 machine_mode mode_with_narrower_elts = GET_MODE (src);
4407 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4408 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4409 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4410
4411 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4412 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4413 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4414
4415 /* Get the operands in the appropriate modes and emit the instruction. */
4416 ptrue = gen_lowpart (pred_mode, ptrue);
4417 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4418 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4419 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4420 dest, ptrue, src));
4421 }
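/* Editor's example: for the VNx8HI <-> VNx16QI case described before
   aarch64_maybe_expand_sve_subreg_move, the wider-element mode is
   VNx8HImode and the narrower is VNx16QImode, so the unspec is
   UNSPEC_REVB and the split form is effectively a predicated REVB on
   .H elements (roughly "revb z0.h, p0/m, z1.h").  */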
4422
4423 static bool
4424 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4425 tree exp ATTRIBUTE_UNUSED)
4426 {
4427 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4428 return false;
4429
4430 return true;
4431 }
4432
4433 /* Implement TARGET_PASS_BY_REFERENCE. */
4434
4435 static bool
4436 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4437 {
4438 HOST_WIDE_INT size;
4439 machine_mode dummymode;
4440 int nregs;
4441
4442 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4443 if (arg.mode == BLKmode && arg.type)
4444 size = int_size_in_bytes (arg.type);
4445 else
4446 /* No frontends can create types with variable-sized modes, so we
4447 shouldn't be asked to pass or return them. */
4448 size = GET_MODE_SIZE (arg.mode).to_constant ();
4449
4450 /* Aggregates are passed by reference based on their size. */
4451 if (arg.aggregate_type_p ())
4452 size = int_size_in_bytes (arg.type);
4453
4454   /* Variable-sized arguments are always passed by reference.  */
4455 if (size < 0)
4456 return true;
4457
4458 /* Can this be a candidate to be passed in fp/simd register(s)? */
4459 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4460 &dummymode, &nregs,
4461 NULL))
4462 return false;
4463
4464   /* Arguments that are variable-sized or larger than 2 registers are
4465      passed by reference unless they are a homogeneous floating-point
4466      aggregate.  */
4467 return size > 2 * UNITS_PER_WORD;
4468 }
4469
4470 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4471 static bool
4472 aarch64_return_in_msb (const_tree valtype)
4473 {
4474 machine_mode dummy_mode;
4475 int dummy_int;
4476
4477 /* Never happens in little-endian mode. */
4478 if (!BYTES_BIG_ENDIAN)
4479 return false;
4480
4481 /* Only composite types smaller than or equal to 16 bytes can
4482 be potentially returned in registers. */
4483 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4484 || int_size_in_bytes (valtype) <= 0
4485 || int_size_in_bytes (valtype) > 16)
4486 return false;
4487
4488 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4489 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4490 is always passed/returned in the least significant bits of fp/simd
4491 register(s). */
4492 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4493 &dummy_mode, &dummy_int, NULL))
4494 return false;
4495
4496 return true;
4497 }
4498
4499 /* Implement TARGET_FUNCTION_VALUE.
4500 Define how to find the value returned by a function. */
4501
4502 static rtx
4503 aarch64_function_value (const_tree type, const_tree func,
4504 bool outgoing ATTRIBUTE_UNUSED)
4505 {
4506 machine_mode mode;
4507 int unsignedp;
4508 int count;
4509 machine_mode ag_mode;
4510
4511 mode = TYPE_MODE (type);
4512 if (INTEGRAL_TYPE_P (type))
4513 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4514
4515 if (aarch64_return_in_msb (type))
4516 {
4517 HOST_WIDE_INT size = int_size_in_bytes (type);
4518
4519 if (size % UNITS_PER_WORD != 0)
4520 {
4521 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4522 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4523 }
4524 }
4525
4526 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4527 &ag_mode, &count, NULL))
4528 {
4529 if (!aarch64_composite_type_p (type, mode))
4530 {
4531 gcc_assert (count == 1 && mode == ag_mode);
4532 return gen_rtx_REG (mode, V0_REGNUM);
4533 }
4534 else
4535 {
4536 int i;
4537 rtx par;
4538
4539 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4540 for (i = 0; i < count; i++)
4541 {
4542 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4543 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4544 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4545 XVECEXP (par, 0, i) = tmp;
4546 }
4547 return par;
4548 }
4549 }
4550 else
4551 return gen_rtx_REG (mode, R0_REGNUM);
4552 }
4553
4554 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4555    Return true if REGNO is the number of a hard register in which the value
4556    of a called function may come back.  */
4557
4558 static bool
4559 aarch64_function_value_regno_p (const unsigned int regno)
4560 {
4561   /* A maximum of 16 bytes can be returned in the general registers.  Examples
4562 of 16-byte return values are: 128-bit integers and 16-byte small
4563 structures (excluding homogeneous floating-point aggregates). */
4564 if (regno == R0_REGNUM || regno == R1_REGNUM)
4565 return true;
4566
4567 /* Up to four fp/simd registers can return a function value, e.g. a
4568 homogeneous floating-point aggregate having four members. */
4569 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4570 return TARGET_FLOAT;
4571
4572 return false;
4573 }
4574
4575 /* Implement TARGET_RETURN_IN_MEMORY.
4576
4577 If the type T of the result of a function is such that
4578 void func (T arg)
4579 would require that arg be passed as a value in a register (or set of
4580 registers) according to the parameter passing rules, then the result
4581 is returned in the same registers as would be used for such an
4582 argument. */
4583
4584 static bool
4585 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4586 {
4587 HOST_WIDE_INT size;
4588 machine_mode ag_mode;
4589 int count;
4590
4591 if (!AGGREGATE_TYPE_P (type)
4592 && TREE_CODE (type) != COMPLEX_TYPE
4593 && TREE_CODE (type) != VECTOR_TYPE)
4594     /* Simple scalar types are always returned in registers.  */
4595 return false;
4596
4597 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4598 type,
4599 &ag_mode,
4600 &count,
4601 NULL))
4602 return false;
4603
4604   /* Types larger than 2 registers are returned in memory.  */
4605 size = int_size_in_bytes (type);
4606 return (size < 0 || size > 2 * UNITS_PER_WORD);
4607 }
4608
4609 static bool
4610 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4611 const_tree type, int *nregs)
4612 {
4613 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4614 return aarch64_vfp_is_call_or_return_candidate (mode,
4615 type,
4616 &pcum->aapcs_vfp_rmode,
4617 nregs,
4618 NULL);
4619 }
4620
4621 /* Given MODE and TYPE of a function argument, return the alignment in
4622 bits. The idea is to suppress any stronger alignment requested by
4623 the user and opt for the natural alignment (specified in AAPCS64 \S
4624 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4625 calculated in versions of GCC prior to GCC-9. This is a helper
4626 function for local use only. */
4627
4628 static unsigned int
4629 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4630 bool *abi_break)
4631 {
4632 *abi_break = false;
4633 if (!type)
4634 return GET_MODE_ALIGNMENT (mode);
4635
4636 if (integer_zerop (TYPE_SIZE (type)))
4637 return 0;
4638
4639 gcc_assert (TYPE_MODE (type) == mode);
4640
4641 if (!AGGREGATE_TYPE_P (type))
4642 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4643
4644 if (TREE_CODE (type) == ARRAY_TYPE)
4645 return TYPE_ALIGN (TREE_TYPE (type));
4646
4647 unsigned int alignment = 0;
4648 unsigned int bitfield_alignment = 0;
4649 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4650 if (TREE_CODE (field) == FIELD_DECL)
4651 {
4652 alignment = std::max (alignment, DECL_ALIGN (field));
4653 if (DECL_BIT_FIELD_TYPE (field))
4654 bitfield_alignment
4655 = std::max (bitfield_alignment,
4656 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4657 }
4658
4659 if (bitfield_alignment > alignment)
4660 {
4661 *abi_break = true;
4662 return bitfield_alignment;
4663 }
4664
4665 return alignment;
4666 }
4667
4668 /* Layout a function argument according to the AAPCS64 rules. The rule
4669 numbers refer to the rule numbers in the AAPCS64. */
4670
4671 static void
4672 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4673 const_tree type,
4674 bool named ATTRIBUTE_UNUSED)
4675 {
4676 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4677 int ncrn, nvrn, nregs;
4678 bool allocate_ncrn, allocate_nvrn;
4679 HOST_WIDE_INT size;
4680 bool abi_break;
4681
4682 /* We need to do this once per argument. */
4683 if (pcum->aapcs_arg_processed)
4684 return;
4685
4686 pcum->aapcs_arg_processed = true;
4687
4688 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4689 if (type)
4690 size = int_size_in_bytes (type);
4691 else
4692 /* No frontends can create types with variable-sized modes, so we
4693 shouldn't be asked to pass or return them. */
4694 size = GET_MODE_SIZE (mode).to_constant ();
4695 size = ROUND_UP (size, UNITS_PER_WORD);
4696
4697 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4698 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4699 mode,
4700 type,
4701 &nregs);
4702
4703   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4704 The following code thus handles passing by SIMD/FP registers first. */
4705
4706 nvrn = pcum->aapcs_nvrn;
4707
4708   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4709      and homogeneous short-vector aggregates (HVA).  */
4710 if (allocate_nvrn)
4711 {
4712 if (!TARGET_FLOAT)
4713 aarch64_err_no_fpadvsimd (mode);
4714
4715 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4716 {
4717 pcum->aapcs_nextnvrn = nvrn + nregs;
4718 if (!aarch64_composite_type_p (type, mode))
4719 {
4720 gcc_assert (nregs == 1);
4721 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4722 }
4723 else
4724 {
4725 rtx par;
4726 int i;
4727 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4728 for (i = 0; i < nregs; i++)
4729 {
4730 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4731 V0_REGNUM + nvrn + i);
4732 rtx offset = gen_int_mode
4733 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4734 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4735 XVECEXP (par, 0, i) = tmp;
4736 }
4737 pcum->aapcs_reg = par;
4738 }
4739 return;
4740 }
4741 else
4742 {
4743 /* C.3 NSRN is set to 8. */
4744 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4745 goto on_stack;
4746 }
4747 }
4748
4749 ncrn = pcum->aapcs_ncrn;
4750 nregs = size / UNITS_PER_WORD;
4751
4752   /* C6 - C9, though the sign and zero extension semantics are
4753      handled elsewhere.  This is the case where the argument fits
4754      entirely in general registers.  */
4755 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4756 {
4757 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4758
4759 /* C.8 if the argument has an alignment of 16 then the NGRN is
4760 rounded up to the next even number. */
4761 if (nregs == 2
4762 && ncrn % 2
4763 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4764 comparison is there because for > 16 * BITS_PER_UNIT
4765 alignment nregs should be > 2 and therefore it should be
4766 passed by reference rather than value. */
4767 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4768 == 16 * BITS_PER_UNIT))
4769 {
4770 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4771 inform (input_location, "parameter passing for argument of type "
4772 "%qT changed in GCC 9.1", type);
4773 ++ncrn;
4774 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4775 }
4776
4777 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4778 A reg is still generated for it, but the caller should be smart
4779 enough not to use it. */
4780 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4781 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4782 else
4783 {
4784 rtx par;
4785 int i;
4786
4787 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4788 for (i = 0; i < nregs; i++)
4789 {
4790 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4791 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4792 GEN_INT (i * UNITS_PER_WORD));
4793 XVECEXP (par, 0, i) = tmp;
4794 }
4795 pcum->aapcs_reg = par;
4796 }
4797
4798 pcum->aapcs_nextncrn = ncrn + nregs;
4799 return;
4800 }
4801
4802 /* C.11 */
4803 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4804
4805 /* The argument is passed on stack; record the needed number of words for
4806 this argument and align the total size if necessary. */
4807 on_stack:
4808 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4809
4810 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4811 == 16 * BITS_PER_UNIT)
4812 {
4813 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4814 if (pcum->aapcs_stack_size != new_size)
4815 {
4816 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4817 inform (input_location, "parameter passing for argument of type "
4818 "%qT changed in GCC 9.1", type);
4819 pcum->aapcs_stack_size = new_size;
4820 }
4821 }
4822 return;
4823 }
4824
4825 /* Implement TARGET_FUNCTION_ARG. */
4826
4827 static rtx
4828 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4829 {
4830 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4831 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
4832 || pcum->pcs_variant == ARM_PCS_SIMD);
4833
4834 if (arg.end_marker_p ())
4835 return gen_int_mode (pcum->pcs_variant, DImode);
4836
4837 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4838 return pcum->aapcs_reg;
4839 }
4840
4841 void
4842 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4843 const_tree fntype,
4844 rtx libname ATTRIBUTE_UNUSED,
4845 const_tree fndecl ATTRIBUTE_UNUSED,
4846 unsigned n_named ATTRIBUTE_UNUSED)
4847 {
4848 pcum->aapcs_ncrn = 0;
4849 pcum->aapcs_nvrn = 0;
4850 pcum->aapcs_nextncrn = 0;
4851 pcum->aapcs_nextnvrn = 0;
4852 if (fntype)
4853 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
4854 else
4855 pcum->pcs_variant = ARM_PCS_AAPCS64;
4856 pcum->aapcs_reg = NULL_RTX;
4857 pcum->aapcs_arg_processed = false;
4858 pcum->aapcs_stack_words = 0;
4859 pcum->aapcs_stack_size = 0;
4860
4861 if (!TARGET_FLOAT
4862 && fndecl && TREE_PUBLIC (fndecl)
4863 && fntype && fntype != error_mark_node)
4864 {
4865 const_tree type = TREE_TYPE (fntype);
4866 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4867 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4868 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4869 &mode, &nregs, NULL))
4870 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4871 }
4872 return;
4873 }
4874
4875 static void
4876 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4877 const function_arg_info &arg)
4878 {
4879 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4880 if (pcum->pcs_variant == ARM_PCS_AAPCS64
4881 || pcum->pcs_variant == ARM_PCS_SIMD)
4882 {
4883 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4884 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4885 != (pcum->aapcs_stack_words != 0));
4886 pcum->aapcs_arg_processed = false;
4887 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4888 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4889 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4890 pcum->aapcs_stack_words = 0;
4891 pcum->aapcs_reg = NULL_RTX;
4892 }
4893 }
4894
4895 bool
4896 aarch64_function_arg_regno_p (unsigned regno)
4897 {
4898 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4899 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4900 }
4901
4902 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4903 PARM_BOUNDARY bits of alignment, but will be given anything up
4904 to STACK_BOUNDARY bits if the type requires it. This makes sure
4905 that both before and after the layout of each argument, the Next
4906 Stacked Argument Address (NSAA) will have a minimum alignment of
4907 8 bytes. */
4908
4909 static unsigned int
4910 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4911 {
4912 bool abi_break;
4913 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4914 &abi_break);
4915   if (abi_break && warn_psabi)
4916 inform (input_location, "parameter passing for argument of type "
4917 "%qT changed in GCC 9.1", type);
4918
4919 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4920 }
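/* For example (editor's note, assuming the usual AArch64 values
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): an argument whose
   natural alignment is only 8 bits still gets 64 bits of stack
   alignment, while an over-aligned 256-bit type is capped at
   128 bits.  */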
4921
4922 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4923
4924 static fixed_size_mode
4925 aarch64_get_reg_raw_mode (int regno)
4926 {
4927 if (TARGET_SVE && FP_REGNUM_P (regno))
4928 /* Don't use the SVE part of the register for __builtin_apply and
4929 __builtin_return. The SVE registers aren't used by the normal PCS,
4930 so using them there would be a waste of time. The PCS extensions
4931 for SVE types are fundamentally incompatible with the
4932 __builtin_return/__builtin_apply interface. */
4933 return as_a <fixed_size_mode> (V16QImode);
4934 return default_get_reg_raw_mode (regno);
4935 }
4936
4937 /* Implement TARGET_FUNCTION_ARG_PADDING.
4938
4939 Small aggregate types are placed in the lowest memory address.
4940
4941 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4942
4943 static pad_direction
4944 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4945 {
4946 /* On little-endian targets, the least significant byte of every stack
4947 argument is passed at the lowest byte address of the stack slot. */
4948 if (!BYTES_BIG_ENDIAN)
4949 return PAD_UPWARD;
4950
4951 /* Otherwise, integral, floating-point and pointer types are padded downward:
4952 the least significant byte of a stack argument is passed at the highest
4953 byte address of the stack slot. */
4954 if (type
4955 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4956 || POINTER_TYPE_P (type))
4957 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4958 return PAD_DOWNWARD;
4959
4960 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4961 return PAD_UPWARD;
4962 }
4963
4964 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4965
4966    It specifies padding for the last (and possibly the only)
4967    element of a block move between registers and memory.  Viewing
4968    the block as it sits in memory, padding upward means that the
4969    last element is padded after its most significant byte, while
4970    with downward padding the last element is padded on its least
4971    significant byte side.
4972
4973 Small aggregates and small complex types are always padded
4974 upwards.
4975
4976 We don't need to worry about homogeneous floating-point or
4977 short-vector aggregates; their move is not affected by the
4978 padding direction determined here. Regardless of endianness,
4979 each element of such an aggregate is put in the least
4980 significant bits of a fp/simd register.
4981
4982 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4983 register has useful data, and return the opposite if the most
4984 significant byte does. */
4985
4986 bool
4987 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4988 bool first ATTRIBUTE_UNUSED)
4989 {
4990
4991 /* Small composite types are always padded upward. */
4992 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4993 {
4994 HOST_WIDE_INT size;
4995 if (type)
4996 size = int_size_in_bytes (type);
4997 else
4998 /* No frontends can create types with variable-sized modes, so we
4999 shouldn't be asked to pass or return them. */
5000 size = GET_MODE_SIZE (mode).to_constant ();
5001 if (size < 2 * UNITS_PER_WORD)
5002 return true;
5003 }
5004
5005 /* Otherwise, use the default padding. */
5006 return !BYTES_BIG_ENDIAN;
5007 }
5008
5009 static scalar_int_mode
5010 aarch64_libgcc_cmp_return_mode (void)
5011 {
5012 return SImode;
5013 }
5014
5015 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5016
5017 /* We use the 12-bit shifted immediate arithmetic instructions so values
5018    must be a multiple of (1 << 12), i.e. 4096.  */
5019 #define ARITH_FACTOR 4096
5020
5021 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5022 #error Cannot use simple address calculation for stack probing
5023 #endif
5024
5025 /* The pair of scratch registers used for stack probing. */
5026 #define PROBE_STACK_FIRST_REG R9_REGNUM
5027 #define PROBE_STACK_SECOND_REG R10_REGNUM
5028
5029 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5030 inclusive. These are offsets from the current stack pointer. */
5031
5032 static void
5033 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5034 {
5035 HOST_WIDE_INT size;
5036 if (!poly_size.is_constant (&size))
5037 {
5038 sorry ("stack probes for SVE frames");
5039 return;
5040 }
5041
5042 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5043
5044 /* See the same assertion on PROBE_INTERVAL above. */
5045 gcc_assert ((first % ARITH_FACTOR) == 0);
5046
5047 /* See if we have a constant small number of probes to generate. If so,
5048 that's the easy case. */
5049 if (size <= PROBE_INTERVAL)
5050 {
5051 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5052
5053 emit_set_insn (reg1,
5054 plus_constant (Pmode,
5055 stack_pointer_rtx, -(first + base)));
5056 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5057 }
5058
5059 /* The run-time loop is made up of 8 insns in the generic case while the
5060 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5061 else if (size <= 4 * PROBE_INTERVAL)
5062 {
5063 HOST_WIDE_INT i, rem;
5064
5065 emit_set_insn (reg1,
5066 plus_constant (Pmode,
5067 stack_pointer_rtx,
5068 -(first + PROBE_INTERVAL)));
5069 emit_stack_probe (reg1);
5070
5071 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5072 it exceeds SIZE. If only two probes are needed, this will not
5073 generate any code. Then probe at FIRST + SIZE. */
5074 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5075 {
5076 emit_set_insn (reg1,
5077 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5078 emit_stack_probe (reg1);
5079 }
5080
5081 rem = size - (i - PROBE_INTERVAL);
5082 if (rem > 256)
5083 {
5084 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5085
5086 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5087 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5088 }
5089 else
5090 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5091 }
5092
5093 /* Otherwise, do the same as above, but in a loop. Note that we must be
5094 extra careful with variables wrapping around because we might be at
5095 the very top (or the very bottom) of the address space and we have
5096 to be able to handle this case properly; in particular, we use an
5097 equality test for the loop condition. */
5098 else
5099 {
5100 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5101
5102 /* Step 1: round SIZE to the previous multiple of the interval. */
5103
5104 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5105
5106
5107 /* Step 2: compute initial and final value of the loop counter. */
5108
5109 /* TEST_ADDR = SP + FIRST. */
5110 emit_set_insn (reg1,
5111 plus_constant (Pmode, stack_pointer_rtx, -first));
5112
5113 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5114 HOST_WIDE_INT adjustment = - (first + rounded_size);
5115 if (! aarch64_uimm12_shift (adjustment))
5116 {
5117 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5118 true, Pmode);
5119 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5120 }
5121 else
5122 emit_set_insn (reg2,
5123 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5124
5125 /* Step 3: the loop
5126
5127 do
5128 {
5129 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5130 probe at TEST_ADDR
5131 }
5132 while (TEST_ADDR != LAST_ADDR)
5133
5134 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5135 until it is equal to ROUNDED_SIZE. */
5136
5137 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5138
5139
5140 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5141 that SIZE is equal to ROUNDED_SIZE. */
5142
5143 if (size != rounded_size)
5144 {
5145 HOST_WIDE_INT rem = size - rounded_size;
5146
5147 if (rem > 256)
5148 {
5149 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5150
5151 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5152 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5153 }
5154 else
5155 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5156 }
5157 }
5158
5159 /* Make sure nothing is scheduled before we are done. */
5160 emit_insn (gen_blockage ());
5161 }
5162
5163 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5164 absolute addresses. */
5165
5166 const char *
5167 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5168 {
5169 static int labelno = 0;
5170 char loop_lab[32];
5171 rtx xops[2];
5172
5173 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5174
5175 /* Loop. */
5176 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5177
5178 HOST_WIDE_INT stack_clash_probe_interval
5179 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5180
5181 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5182 xops[0] = reg1;
5183 HOST_WIDE_INT interval;
5184 if (flag_stack_clash_protection)
5185 interval = stack_clash_probe_interval;
5186 else
5187 interval = PROBE_INTERVAL;
5188
5189 gcc_assert (aarch64_uimm12_shift (interval));
5190 xops[1] = GEN_INT (interval);
5191
5192 output_asm_insn ("sub\t%0, %0, %1", xops);
5193
5194 /* If doing stack clash protection then we probe up by the ABI specified
5195 amount. We do this because we're dropping full pages at a time in the
5196      loop.  But for plain (non-stack-clash) probing, probe at offset 0 instead.  */
5197 if (flag_stack_clash_protection)
5198 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5199 else
5200 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5201
5202 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5203 by this amount for each iteration. */
5204 output_asm_insn ("str\txzr, [%0, %1]", xops);
5205
5206 /* Test if TEST_ADDR == LAST_ADDR. */
5207 xops[1] = reg2;
5208 output_asm_insn ("cmp\t%0, %1", xops);
5209
5210 /* Branch. */
5211 fputs ("\tb.ne\t", asm_out_file);
5212 assemble_name_raw (asm_out_file, loop_lab);
5213 fputc ('\n', asm_out_file);
5214
5215 return "";
5216 }
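/* Editor's sketch of the text emitted above for the default
   (non-stack-clash) case, with a 4096-byte probe interval and the
   usual scratch registers:
     .LPSRL0:
     sub	x9, x9, 4096
     str	xzr, [x9, 0]
     cmp	x9, x10
     b.ne	.LPSRL0
   The exact label name and immediates depend on the configuration.  */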
5217
5218 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5219 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5220 of GUARD_SIZE. When a probe is emitted it is done at most
5221 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5222 at most MIN_PROBE_THRESHOLD. By the end of this function
5223 BASE = BASE - ADJUSTMENT. */
5224
5225 const char *
5226 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5227 rtx min_probe_threshold, rtx guard_size)
5228 {
5229 /* This function is not allowed to use any instruction generation function
5230 like gen_ and friends. If you do, you'll likely ICE during CFG validation,
5231 so instead emit the code you want using output_asm_insn. */
5232 gcc_assert (flag_stack_clash_protection);
5233 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5234 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5235
5236 /* The minimum required allocation before the residual requires probing. */
5237 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5238
5239 /* Clamp the value down to the nearest value that can be used with a cmp. */
5240 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5241 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5242
5243 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5244 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5245
5246 static int labelno = 0;
5247 char loop_start_lab[32];
5248 char loop_end_lab[32];
5249 rtx xops[2];
5250
5251 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5252 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5253
5254 /* Emit loop start label. */
5255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5256
5257 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5258 xops[0] = adjustment;
5259 xops[1] = probe_offset_value_rtx;
5260 output_asm_insn ("cmp\t%0, %1", xops);
5261
5262 /* Branch to end if not enough adjustment to probe. */
5263 fputs ("\tb.lt\t", asm_out_file);
5264 assemble_name_raw (asm_out_file, loop_end_lab);
5265 fputc ('\n', asm_out_file);
5266
5267 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5268 xops[0] = base;
5269 xops[1] = probe_offset_value_rtx;
5270 output_asm_insn ("sub\t%0, %0, %1", xops);
5271
5272 /* Probe at BASE. */
5273 xops[1] = const0_rtx;
5274 output_asm_insn ("str\txzr, [%0, %1]", xops);
5275
5276 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5277 xops[0] = adjustment;
5278 xops[1] = probe_offset_value_rtx;
5279 output_asm_insn ("sub\t%0, %0, %1", xops);
5280
5281 /* Branch to start if still more bytes to allocate. */
5282 fputs ("\tb\t", asm_out_file);
5283 assemble_name_raw (asm_out_file, loop_start_lab);
5284 fputc ('\n', asm_out_file);
5285
5286 /* No probe needed for the remaining adjustment; fall through. */
5287 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5288
5289 /* BASE = BASE - ADJUSTMENT. */
5290 xops[0] = base;
5291 xops[1] = adjustment;
5292 output_asm_insn ("sub\t%0, %0, %1", xops);
5293 return "";
5294 }
5295
5296 /* Determine whether a frame chain needs to be generated. */
5297 static bool
5298 aarch64_needs_frame_chain (void)
5299 {
5300 /* Force a frame chain for EH returns so the return address is at FP+8. */
5301 if (frame_pointer_needed || crtl->calls_eh_return)
5302 return true;
5303
5304 /* A leaf function cannot have calls or write LR. */
5305 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5306
5307 /* Don't use a frame chain in leaf functions if leaf frame pointers
5308 are disabled. */
5309 if (flag_omit_leaf_frame_pointer && is_leaf)
5310 return false;
5311
5312 return aarch64_use_frame_pointer;
5313 }
5314
5315 /* Mark the registers that need to be saved by the callee and calculate
5316 the size of the callee-saved registers area and frame record (both FP
5317 and LR may be omitted). */
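/* For instance (an illustrative case, assuming a frame chain is emitted and
   no other callee-saved registers are live): R29 is placed at offset 0 and
   R30 at offset UNITS_PER_WORD, giving a saved register area of
   2 * UNITS_PER_WORD, and they become the two write-back candidates.  */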
5318 static void
5319 aarch64_layout_frame (void)
5320 {
5321 HOST_WIDE_INT offset = 0;
5322 int regno, last_fp_reg = INVALID_REGNUM;
5323 bool simd_function = (crtl->abi->id () == ARM_PCS_SIMD);
5324
5325 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5326
5327 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5328 the mid-end is doing. */
5329 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5330
5331 #define SLOT_NOT_REQUIRED (-2)
5332 #define SLOT_REQUIRED (-1)
5333
5334 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5335 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5336
5337 /* First mark all the registers that really need to be saved... */
5338 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5339 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5340
5341 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5342 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5343
5344 /* ... that includes the eh data registers (if needed)... */
5345 if (crtl->calls_eh_return)
5346 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5347 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5348 = SLOT_REQUIRED;
5349
5350 /* ... and any callee saved register that dataflow says is live. */
5351 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5352 if (df_regs_ever_live_p (regno)
5353 && !fixed_regs[regno]
5354 && (regno == R30_REGNUM
5355 || !crtl->abi->clobbers_full_reg_p (regno)))
5356 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5357
5358 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5359 if (df_regs_ever_live_p (regno)
5360 && !fixed_regs[regno]
5361 && !crtl->abi->clobbers_full_reg_p (regno))
5362 {
5363 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5364 last_fp_reg = regno;
5365 }
5366
5367 if (cfun->machine->frame.emit_frame_chain)
5368 {
5369 /* FP and LR are placed in the linkage record. */
5370 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5371 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5372 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5373 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5374 offset = 2 * UNITS_PER_WORD;
5375 }
5376
5377 /* With stack-clash, LR must be saved in non-leaf functions. */
5378 gcc_assert (crtl->is_leaf
5379 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5380 != SLOT_NOT_REQUIRED));
5381
5382 /* Now assign stack slots for them. */
5383 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5384 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5385 {
5386 cfun->machine->frame.reg_offset[regno] = offset;
5387 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5388 cfun->machine->frame.wb_candidate1 = regno;
5389 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5390 cfun->machine->frame.wb_candidate2 = regno;
5391 offset += UNITS_PER_WORD;
5392 }
5393
5394 HOST_WIDE_INT max_int_offset = offset;
5395 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5396 bool has_align_gap = offset != max_int_offset;
5397
5398 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5399 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5400 {
5401 /* If there is an alignment gap between integer and fp callee-saves,
5402 allocate the last fp register to it if possible. */
5403 if (regno == last_fp_reg
5404 && has_align_gap
5405 && !simd_function
5406 && (offset & 8) == 0)
5407 {
5408 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5409 break;
5410 }
5411
5412 cfun->machine->frame.reg_offset[regno] = offset;
5413 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5414 cfun->machine->frame.wb_candidate1 = regno;
5415 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5416 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5417 cfun->machine->frame.wb_candidate2 = regno;
5418 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5419 }
5420
5421 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5422
5423 cfun->machine->frame.saved_regs_size = offset;
5424
5425 HOST_WIDE_INT varargs_and_saved_regs_size
5426 = offset + cfun->machine->frame.saved_varargs_size;
5427
5428 cfun->machine->frame.hard_fp_offset
5429 = aligned_upper_bound (varargs_and_saved_regs_size
5430 + get_frame_size (),
5431 STACK_BOUNDARY / BITS_PER_UNIT);
5432
5433 /* Both these values are already aligned. */
5434 gcc_assert (multiple_p (crtl->outgoing_args_size,
5435 STACK_BOUNDARY / BITS_PER_UNIT));
5436 cfun->machine->frame.frame_size
5437 = (cfun->machine->frame.hard_fp_offset
5438 + crtl->outgoing_args_size);
5439
5440 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5441
5442 cfun->machine->frame.initial_adjust = 0;
5443 cfun->machine->frame.final_adjust = 0;
5444 cfun->machine->frame.callee_adjust = 0;
5445 cfun->machine->frame.callee_offset = 0;
5446
5447 HOST_WIDE_INT max_push_offset = 0;
5448 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5449 max_push_offset = 512;
5450 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5451 max_push_offset = 256;
5452
5453 HOST_WIDE_INT const_size, const_fp_offset;
5454 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5455 && const_size < max_push_offset
5456 && known_eq (crtl->outgoing_args_size, 0))
5457 {
5458 /* Simple, small frame with no outgoing arguments:
5459 stp reg1, reg2, [sp, -frame_size]!
5460 stp reg3, reg4, [sp, 16] */
5461 cfun->machine->frame.callee_adjust = const_size;
5462 }
5463 else if (known_lt (crtl->outgoing_args_size
5464 + cfun->machine->frame.saved_regs_size, 512)
5465 && !(cfun->calls_alloca
5466 && known_lt (cfun->machine->frame.hard_fp_offset,
5467 max_push_offset)))
5468 {
5469 /* Frame with small outgoing arguments:
5470 sub sp, sp, frame_size
5471 stp reg1, reg2, [sp, outgoing_args_size]
5472 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5473 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5474 cfun->machine->frame.callee_offset
5475 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5476 }
5477 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5478 && const_fp_offset < max_push_offset)
5479 {
5480 /* Frame with large outgoing arguments but a small local area:
5481 stp reg1, reg2, [sp, -hard_fp_offset]!
5482 stp reg3, reg4, [sp, 16]
5483 sub sp, sp, outgoing_args_size */
5484 cfun->machine->frame.callee_adjust = const_fp_offset;
5485 cfun->machine->frame.final_adjust
5486 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5487 }
5488 else
5489 {
5490 /* Frame with large local area and outgoing arguments using frame pointer:
5491 sub sp, sp, hard_fp_offset
5492 stp x29, x30, [sp, 0]
5493 add x29, sp, 0
5494 stp reg3, reg4, [sp, 16]
5495 sub sp, sp, outgoing_args_size */
5496 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5497 cfun->machine->frame.final_adjust
5498 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5499 }
5500
5501 cfun->machine->frame.laid_out = true;
5502 }
5503
5504 /* Return true if the register REGNO is saved on entry to
5505 the current function. */
5506
5507 static bool
5508 aarch64_register_saved_on_entry (int regno)
5509 {
5510 return cfun->machine->frame.reg_offset[regno] >= 0;
5511 }
5512
5513 /* Return the next register up from REGNO up to LIMIT for the callee
5514 to save. */
5515
5516 static unsigned
5517 aarch64_next_callee_save (unsigned regno, unsigned limit)
5518 {
5519 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5520 regno ++;
5521 return regno;
5522 }
5523
5524 /* Push the register number REGNO of mode MODE to the stack with write-back
5525 adjusting the stack by ADJUSTMENT. */
5526
5527 static void
5528 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5529 HOST_WIDE_INT adjustment)
5530 {
5531 rtx base_rtx = stack_pointer_rtx;
5532 rtx insn, reg, mem;
5533
5534 reg = gen_rtx_REG (mode, regno);
5535 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5536 plus_constant (Pmode, base_rtx, -adjustment));
5537 mem = gen_frame_mem (mode, mem);
5538
5539 insn = emit_move_insn (mem, reg);
5540 RTX_FRAME_RELATED_P (insn) = 1;
5541 }
5542
5543 /* Generate and return an instruction to store the pair of registers
5544 REG and REG2 of mode MODE to location BASE with write-back adjusting
5545 the stack location BASE by ADJUSTMENT. */
5546
5547 static rtx
5548 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5549 HOST_WIDE_INT adjustment)
5550 {
5551 switch (mode)
5552 {
5553 case E_DImode:
5554 return gen_storewb_pairdi_di (base, base, reg, reg2,
5555 GEN_INT (-adjustment),
5556 GEN_INT (UNITS_PER_WORD - adjustment));
5557 case E_DFmode:
5558 return gen_storewb_pairdf_di (base, base, reg, reg2,
5559 GEN_INT (-adjustment),
5560 GEN_INT (UNITS_PER_WORD - adjustment));
5561 case E_TFmode:
5562 return gen_storewb_pairtf_di (base, base, reg, reg2,
5563 GEN_INT (-adjustment),
5564 GEN_INT (UNITS_PER_VREG - adjustment));
5565 default:
5566 gcc_unreachable ();
5567 }
5568 }
5569
5570 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5571 stack pointer by ADJUSTMENT. */
5572
5573 static void
5574 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5575 {
5576 rtx_insn *insn;
5577 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5578
5579 if (regno2 == INVALID_REGNUM)
5580 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5581
5582 rtx reg1 = gen_rtx_REG (mode, regno1);
5583 rtx reg2 = gen_rtx_REG (mode, regno2);
5584
5585 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5586 reg2, adjustment));
5587 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5588 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5589 RTX_FRAME_RELATED_P (insn) = 1;
5590 }
5591
5592 /* Load the pair of registers REG and REG2 of mode MODE from stack location
5593 BASE, adjusting it by ADJUSTMENT afterwards. */
5594
5595 static rtx
5596 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5597 HOST_WIDE_INT adjustment)
5598 {
5599 switch (mode)
5600 {
5601 case E_DImode:
5602 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5603 GEN_INT (UNITS_PER_WORD));
5604 case E_DFmode:
5605 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5606 GEN_INT (UNITS_PER_WORD));
5607 case E_TFmode:
5608 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5609 GEN_INT (UNITS_PER_VREG));
5610 default:
5611 gcc_unreachable ();
5612 }
5613 }
5614
5615 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5616 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5617 into CFI_OPS. */
5618
5619 static void
5620 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5621 rtx *cfi_ops)
5622 {
5623 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5624 rtx reg1 = gen_rtx_REG (mode, regno1);
5625
5626 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5627
5628 if (regno2 == INVALID_REGNUM)
5629 {
5630 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5631 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5632 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5633 }
5634 else
5635 {
5636 rtx reg2 = gen_rtx_REG (mode, regno2);
5637 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5638 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5639 reg2, adjustment));
5640 }
5641 }
5642
5643 /* Generate and return a store pair instruction of mode MODE to store
5644 register REG1 to MEM1 and register REG2 to MEM2. */
5645
5646 static rtx
5647 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5648 rtx reg2)
5649 {
5650 switch (mode)
5651 {
5652 case E_DImode:
5653 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5654
5655 case E_DFmode:
5656 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5657
5658 case E_TFmode:
5659 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5660
5661 default:
5662 gcc_unreachable ();
5663 }
5664 }
5665
5666 /* Generate and return a load pair instruction of mode MODE to load register
5667 REG1 from MEM1 and register REG2 from MEM2. */
5668
5669 static rtx
5670 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5671 rtx mem2)
5672 {
5673 switch (mode)
5674 {
5675 case E_DImode:
5676 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5677
5678 case E_DFmode:
5679 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5680
5681 case E_TFmode:
5682 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5683
5684 default:
5685 gcc_unreachable ();
5686 }
5687 }
5688
5689 /* Return TRUE if return address signing should be enabled for the current
5690 function, otherwise return FALSE. */
5691
5692 bool
5693 aarch64_return_address_signing_enabled (void)
5694 {
5695 /* This function should only be called after the frame has been laid out. */
5696 gcc_assert (cfun->machine->frame.laid_out);
5697
5698 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5699 if its LR is pushed onto the stack. */
5700 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5701 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5702 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5703 }
5704
5705 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5706 bool
5707 aarch64_bti_enabled (void)
5708 {
5709 return (aarch64_enable_bti == 1);
5710 }
5711
5712 /* Emit code to save the callee-saved registers from register number START
5713 to LIMIT to the stack at the location starting at offset START_OFFSET,
5714 skipping any write-back candidates if SKIP_WB is true. */
5715
5716 static void
5717 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5718 unsigned start, unsigned limit, bool skip_wb)
5719 {
5720 rtx_insn *insn;
5721 unsigned regno;
5722 unsigned regno2;
5723
5724 for (regno = aarch64_next_callee_save (start, limit);
5725 regno <= limit;
5726 regno = aarch64_next_callee_save (regno + 1, limit))
5727 {
5728 rtx reg, mem;
5729 poly_int64 offset;
5730 int offset_diff;
5731
5732 if (skip_wb
5733 && (regno == cfun->machine->frame.wb_candidate1
5734 || regno == cfun->machine->frame.wb_candidate2))
5735 continue;
5736
5737 if (cfun->machine->reg_is_wrapped_separately[regno])
5738 continue;
5739
5740 reg = gen_rtx_REG (mode, regno);
5741 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5742 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5743 offset));
5744
5745 regno2 = aarch64_next_callee_save (regno + 1, limit);
5746 offset_diff = cfun->machine->frame.reg_offset[regno2]
5747 - cfun->machine->frame.reg_offset[regno];
5748
5749 if (regno2 <= limit
5750 && !cfun->machine->reg_is_wrapped_separately[regno2]
5751 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5752 {
5753 rtx reg2 = gen_rtx_REG (mode, regno2);
5754 rtx mem2;
5755
5756 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5757 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5758 offset));
5759 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5760 reg2));
5761
5762 /* The first part of a frame-related parallel insn is
5763 always assumed to be relevant to the frame
5764 calculations; subsequent parts are only
5765 frame-related if explicitly marked. */
5766 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5767 regno = regno2;
5768 }
5769 else
5770 insn = emit_move_insn (mem, reg);
5771
5772 RTX_FRAME_RELATED_P (insn) = 1;
5773 }
5774 }
5775
5776 /* Emit code to restore the callee registers of mode MODE from register
5777 number START up to and including LIMIT. Restore from the stack offset
5778 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5779 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5780
5781 static void
5782 aarch64_restore_callee_saves (machine_mode mode,
5783 poly_int64 start_offset, unsigned start,
5784 unsigned limit, bool skip_wb, rtx *cfi_ops)
5785 {
5786 rtx base_rtx = stack_pointer_rtx;
5787 unsigned regno;
5788 unsigned regno2;
5789 poly_int64 offset;
5790
5791 for (regno = aarch64_next_callee_save (start, limit);
5792 regno <= limit;
5793 regno = aarch64_next_callee_save (regno + 1, limit))
5794 {
5795 if (cfun->machine->reg_is_wrapped_separately[regno])
5796 continue;
5797
5798 rtx reg, mem;
5799 int offset_diff;
5800
5801 if (skip_wb
5802 && (regno == cfun->machine->frame.wb_candidate1
5803 || regno == cfun->machine->frame.wb_candidate2))
5804 continue;
5805
5806 reg = gen_rtx_REG (mode, regno);
5807 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5808 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5809
5810 regno2 = aarch64_next_callee_save (regno + 1, limit);
5811 offset_diff = cfun->machine->frame.reg_offset[regno2]
5812 - cfun->machine->frame.reg_offset[regno];
5813
5814 if (regno2 <= limit
5815 && !cfun->machine->reg_is_wrapped_separately[regno2]
5816 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5817 {
5818 rtx reg2 = gen_rtx_REG (mode, regno2);
5819 rtx mem2;
5820
5821 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5822 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5823 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5824
5825 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5826 regno = regno2;
5827 }
5828 else
5829 emit_move_insn (reg, mem);
5830 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5831 }
5832 }
5833
5834 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5835 of MODE. */
5836
5837 static inline bool
5838 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5839 {
5840 HOST_WIDE_INT multiple;
5841 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5842 && IN_RANGE (multiple, -8, 7));
5843 }
5844
5845 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5846 of MODE. */
5847
5848 static inline bool
5849 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5850 {
5851 HOST_WIDE_INT multiple;
5852 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5853 && IN_RANGE (multiple, 0, 63));
5854 }
5855
5856 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5857 of MODE. */
5858
5859 bool
5860 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5861 {
5862 HOST_WIDE_INT multiple;
5863 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5864 && IN_RANGE (multiple, -64, 63));
5865 }
5866
5867 /* Return true if OFFSET is a signed 9-bit value. */
5868
5869 bool
5870 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5871 poly_int64 offset)
5872 {
5873 HOST_WIDE_INT const_offset;
5874 return (offset.is_constant (&const_offset)
5875 && IN_RANGE (const_offset, -256, 255));
5876 }
5877
5878 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5879 of MODE. */
5880
5881 static inline bool
5882 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5883 {
5884 HOST_WIDE_INT multiple;
5885 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5886 && IN_RANGE (multiple, -256, 255));
5887 }
5888
5889 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5890 of MODE. */
5891
5892 static inline bool
5893 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5894 {
5895 HOST_WIDE_INT multiple;
5896 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5897 && IN_RANGE (multiple, 0, 4095));
5898 }
5899
5900 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5901
5902 static sbitmap
5903 aarch64_get_separate_components (void)
5904 {
5905 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5906 bitmap_clear (components);
5907
5908 /* The registers we need saved to the frame. */
5909 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5910 if (aarch64_register_saved_on_entry (regno))
5911 {
5912 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5913 if (!frame_pointer_needed)
5914 offset += cfun->machine->frame.frame_size
5915 - cfun->machine->frame.hard_fp_offset;
5916 /* Check that we can access the stack slot of the register with one
5917 direct load with no adjustments needed. */
5918 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5919 bitmap_set_bit (components, regno);
5920 }
5921
5922 /* Don't mess with the hard frame pointer. */
5923 if (frame_pointer_needed)
5924 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5925
5926 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5927 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5928 /* If registers have been chosen to be stored/restored with
5929 writeback, don't interfere with them, to avoid having to output explicit
5930 stack adjustment instructions. */
5931 if (reg2 != INVALID_REGNUM)
5932 bitmap_clear_bit (components, reg2);
5933 if (reg1 != INVALID_REGNUM)
5934 bitmap_clear_bit (components, reg1);
5935
5936 bitmap_clear_bit (components, LR_REGNUM);
5937 bitmap_clear_bit (components, SP_REGNUM);
5938
5939 return components;
5940 }
5941
5942 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5943
5944 static sbitmap
5945 aarch64_components_for_bb (basic_block bb)
5946 {
5947 bitmap in = DF_LIVE_IN (bb);
5948 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5949 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5950
5951 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5952 bitmap_clear (components);
5953
5954 /* Clobbered registers don't generate values in any meaningful sense,
5955 since nothing after the clobber can rely on their value. And we can't
5956 say that partially-clobbered registers are unconditionally killed,
5957 because whether they're killed or not depends on the mode of the
5958 value they're holding. Thus partially call-clobbered registers
5959 appear in neither the kill set nor the gen set.
5960
5961 Check manually for any calls that clobber more of a register than the
5962 current function can. */
5963 function_abi_aggregator callee_abis;
5964 rtx_insn *insn;
5965 FOR_BB_INSNS (bb, insn)
5966 if (CALL_P (insn))
5967 callee_abis.note_callee_abi (insn_callee_abi (insn));
5968 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
5969
5970 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5971 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5972 if (!fixed_regs[regno]
5973 && !crtl->abi->clobbers_full_reg_p (regno)
5974 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
5975 || bitmap_bit_p (in, regno)
5976 || bitmap_bit_p (gen, regno)
5977 || bitmap_bit_p (kill, regno)))
5978 {
5979 unsigned regno2, offset, offset2;
5980 bitmap_set_bit (components, regno);
5981
5982 /* If there is a callee-save at an adjacent offset, add it too
5983 to increase the use of LDP/STP. */
5984 offset = cfun->machine->frame.reg_offset[regno];
5985 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5986
5987 if (regno2 <= LAST_SAVED_REGNUM)
5988 {
5989 offset2 = cfun->machine->frame.reg_offset[regno2];
5990 if ((offset & ~8) == (offset2 & ~8))
5991 bitmap_set_bit (components, regno2);
5992 }
5993 }
5994
5995 return components;
5996 }
5997
5998 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5999 Nothing to do for aarch64. */
6000
6001 static void
6002 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6003 {
6004 }
6005
6006 /* Return the next set bit in BMP from START onwards. Return the total number
6007 of bits in BMP if no set bit is found at or after START. */
6008
6009 static unsigned int
6010 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6011 {
6012 unsigned int nbits = SBITMAP_SIZE (bmp);
6013 if (start == nbits)
6014 return start;
6015
6016 gcc_assert (start < nbits);
6017 for (unsigned int i = start; i < nbits; i++)
6018 if (bitmap_bit_p (bmp, i))
6019 return i;
6020
6021 return nbits;
6022 }
6023
6024 /* Do the work for aarch64_emit_prologue_components and
6025 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6026 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6027 for these components or the epilogue sequence. That is, it determines
6028 whether we should emit stores or loads and what kind of CFA notes to attach
6029 to the insns. Otherwise the logic for the two sequences is very
6030 similar. */
6031
6032 static void
6033 aarch64_process_components (sbitmap components, bool prologue_p)
6034 {
6035 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6036 ? HARD_FRAME_POINTER_REGNUM
6037 : STACK_POINTER_REGNUM);
6038
6039 unsigned last_regno = SBITMAP_SIZE (components);
6040 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6041 rtx_insn *insn = NULL;
6042
6043 while (regno != last_regno)
6044 {
6045 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6046 so DFmode for the vector registers is enough. For simd functions
6047 we want to save the low 128 bits. */
6048 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6049
6050 rtx reg = gen_rtx_REG (mode, regno);
6051 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6052 if (!frame_pointer_needed)
6053 offset += cfun->machine->frame.frame_size
6054 - cfun->machine->frame.hard_fp_offset;
6055 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6056 rtx mem = gen_frame_mem (mode, addr);
6057
6058 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6059 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6060 /* No more registers to handle after REGNO.
6061 Emit a single save/restore and exit. */
6062 if (regno2 == last_regno)
6063 {
6064 insn = emit_insn (set);
6065 RTX_FRAME_RELATED_P (insn) = 1;
6066 if (prologue_p)
6067 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6068 else
6069 add_reg_note (insn, REG_CFA_RESTORE, reg);
6070 break;
6071 }
6072
6073 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6074 /* The next register is not of the same class or its offset is not
6075 mergeable with the current one into a pair. */
6076 if (!satisfies_constraint_Ump (mem)
6077 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6078 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6079 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6080 GET_MODE_SIZE (mode)))
6081 {
6082 insn = emit_insn (set);
6083 RTX_FRAME_RELATED_P (insn) = 1;
6084 if (prologue_p)
6085 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6086 else
6087 add_reg_note (insn, REG_CFA_RESTORE, reg);
6088
6089 regno = regno2;
6090 continue;
6091 }
6092
6093 /* REGNO2 can be saved/restored in a pair with REGNO. */
6094 rtx reg2 = gen_rtx_REG (mode, regno2);
6095 if (!frame_pointer_needed)
6096 offset2 += cfun->machine->frame.frame_size
6097 - cfun->machine->frame.hard_fp_offset;
6098 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6099 rtx mem2 = gen_frame_mem (mode, addr2);
6100 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6101 : gen_rtx_SET (reg2, mem2);
6102
6103 if (prologue_p)
6104 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6105 else
6106 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6107
6108 RTX_FRAME_RELATED_P (insn) = 1;
6109 if (prologue_p)
6110 {
6111 add_reg_note (insn, REG_CFA_OFFSET, set);
6112 add_reg_note (insn, REG_CFA_OFFSET, set2);
6113 }
6114 else
6115 {
6116 add_reg_note (insn, REG_CFA_RESTORE, reg);
6117 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6118 }
6119
6120 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6121 }
6122 }
6123
6124 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6125
6126 static void
6127 aarch64_emit_prologue_components (sbitmap components)
6128 {
6129 aarch64_process_components (components, true);
6130 }
6131
6132 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6133
6134 static void
6135 aarch64_emit_epilogue_components (sbitmap components)
6136 {
6137 aarch64_process_components (components, false);
6138 }
6139
6140 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6141
6142 static void
6143 aarch64_set_handled_components (sbitmap components)
6144 {
6145 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6146 if (bitmap_bit_p (components, regno))
6147 cfun->machine->reg_is_wrapped_separately[regno] = true;
6148 }
6149
6150 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6151 determine the probe offset for alloca. */
6152
6153 static HOST_WIDE_INT
6154 aarch64_stack_clash_protection_alloca_probe_range (void)
6155 {
6156 return STACK_CLASH_CALLER_GUARD;
6157 }
6158
6159
6160 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6161 registers. If POLY_SIZE is not large enough to require a probe, this function
6162 will only adjust the stack. When allocating the stack space
6163 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6164 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6165 arguments. If we are, then we ensure that any allocation larger than the ABI
6166 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6167 maintained.
6168
6169 We emit barriers after each stack adjustment to prevent optimizations from
6170 breaking the invariant that we never drop the stack more than a page. This
6171 invariant is needed to make it easier to correctly handle asynchronous
6172 events: e.g. if we were to allow the stack to be dropped by more than a page
6173 and then emit the probes afterwards, and a signal were taken somewhere in
6174 between, then the signal handler would not know the state of the stack and
6175 could make no assumptions about which pages have been probed. */
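/* As an illustrative example of the constant-size path below (assuming the
   default 64kB guard and the 1kB STACK_CLASH_CALLER_GUARD): a non-final
   allocation of 3 * guard_size + 512 bytes is split into three guard_size
   allocations, each followed by a probe at offset STACK_CLASH_CALLER_GUARD
   (unrolled inline or via the probe loop, depending on
   STACK_CLASH_MAX_UNROLL_PAGES), plus a final 512 byte adjustment that is
   left unprobed because it is below the probing threshold.  */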
6176
6177 static void
6178 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6179 poly_int64 poly_size,
6180 bool frame_related_p,
6181 bool final_adjustment_p)
6182 {
6183 HOST_WIDE_INT guard_size
6184 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6185 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6186 /* When doing the final adjustment for the outgoing argument size we can't
6187 assume that LR was saved at position 0. So subtract its offset from the
6188 ABI safe buffer so that we don't accidentally allow an adjustment that
6189 would result in an allocation larger than the ABI buffer without
6190 probing. */
6191 HOST_WIDE_INT min_probe_threshold
6192 = final_adjustment_p
6193 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6194 : guard_size - guard_used_by_caller;
6195
6196 poly_int64 frame_size = cfun->machine->frame.frame_size;
6197
6198 /* We should always have a positive probe threshold. */
6199 gcc_assert (min_probe_threshold > 0);
6200
6201 if (flag_stack_clash_protection && !final_adjustment_p)
6202 {
6203 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6204 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6205
6206 if (known_eq (frame_size, 0))
6207 {
6208 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6209 }
6210 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6211 && known_lt (final_adjust, guard_used_by_caller))
6212 {
6213 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6214 }
6215 }
6216
6217 /* If SIZE is not large enough to require probing, just adjust the stack and
6218 exit. */
6219 if (known_lt (poly_size, min_probe_threshold)
6220 || !flag_stack_clash_protection)
6221 {
6222 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6223 return;
6224 }
6225
6226 HOST_WIDE_INT size;
6227 /* Handle the SVE non-constant case first. */
6228 if (!poly_size.is_constant (&size))
6229 {
6230 if (dump_file)
6231 {
6232 fprintf (dump_file, "Stack clash SVE prologue: ");
6233 print_dec (poly_size, dump_file);
6234 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6235 }
6236
6237 /* First calculate the amount of bytes we're actually spilling. */
6238 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6239 poly_size, temp1, temp2, false, true);
6240
6241 rtx_insn *insn = get_last_insn ();
6242
6243 if (frame_related_p)
6244 {
6245 /* This is done to provide unwinding information for the stack
6246 adjustments we're about to do, however to prevent the optimizers
6247 from removing the R11 move and leaving the CFA note (which would be
6248 very wrong) we tie the old and new stack pointer together.
6249 The tie will expand to nothing but the optimizers will not touch
6250 the instruction. */
6251 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6252 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6253 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6254
6255 /* We want the CFA independent of the stack pointer for the
6256 duration of the loop. */
6257 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6258 RTX_FRAME_RELATED_P (insn) = 1;
6259 }
6260
6261 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6262 rtx guard_const = gen_int_mode (guard_size, Pmode);
6263
6264 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6265 stack_pointer_rtx, temp1,
6266 probe_const, guard_const));
6267
6268 /* Now reset the CFA register if needed. */
6269 if (frame_related_p)
6270 {
6271 add_reg_note (insn, REG_CFA_DEF_CFA,
6272 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6273 gen_int_mode (poly_size, Pmode)));
6274 RTX_FRAME_RELATED_P (insn) = 1;
6275 }
6276
6277 return;
6278 }
6279
6280 if (dump_file)
6281 fprintf (dump_file,
6282 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6283 " bytes, probing will be required.\n", size);
6284
6285 /* Round size to the nearest multiple of guard_size, and calculate the
6286 residual as the difference between the original size and the rounded
6287 size. */
6288 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6289 HOST_WIDE_INT residual = size - rounded_size;
6290
6291 /* We can handle a small number of allocations/probes inline. Otherwise
6292 punt to a loop. */
6293 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6294 {
6295 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6296 {
6297 aarch64_sub_sp (NULL, temp2, guard_size, true);
6298 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6299 guard_used_by_caller));
6300 emit_insn (gen_blockage ());
6301 }
6302 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6303 }
6304 else
6305 {
6306 /* Compute the ending address. */
6307 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6308 temp1, NULL, false, true);
6309 rtx_insn *insn = get_last_insn ();
6310
6311 /* For the initial allocation, we don't have a frame pointer
6312 set up, so we always need CFI notes. If we're doing the
6313 final allocation, then we may have a frame pointer, in which
6314 case it is the CFA, otherwise we need CFI notes.
6315
6316 We can determine which allocation we are doing by looking at
6317 the value of FRAME_RELATED_P since the final allocations are not
6318 frame related. */
6319 if (frame_related_p)
6320 {
6321 /* We want the CFA independent of the stack pointer for the
6322 duration of the loop. */
6323 add_reg_note (insn, REG_CFA_DEF_CFA,
6324 plus_constant (Pmode, temp1, rounded_size));
6325 RTX_FRAME_RELATED_P (insn) = 1;
6326 }
6327
6328 /* This allocates and probes the stack. Note that this re-uses some of
6329 the existing Ada stack protection code. However we are guaranteed not
6330 to enter the non-loop or residual branches of that code.
6331
6332 The non-loop part won't be entered because if our allocation amount
6333 doesn't require a loop, the case above would handle it.
6334
6335 The residual amount won't be entered because TEMP1 is a multiple of
6336 the allocation size. The residual will always be 0. As such, the only
6337 part we are actually using from that code is the loop setup. The
6338 actual probing is done in aarch64_output_probe_stack_range. */
6339 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6340 stack_pointer_rtx, temp1));
6341
6342 /* Now reset the CFA register if needed. */
6343 if (frame_related_p)
6344 {
6345 add_reg_note (insn, REG_CFA_DEF_CFA,
6346 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6347 RTX_FRAME_RELATED_P (insn) = 1;
6348 }
6349
6350 emit_insn (gen_blockage ());
6351 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6352 }
6353
6354 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6355 be probed. This maintains the requirement that each page is probed at
6356 least once. For initial probing we probe only if the allocation is
6357 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6358 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6359 GUARD_SIZE. This works that for any allocation that is large enough to
6360 trigger a probe here, we'll have at least one, and if they're not large
6361 enough for this code to emit anything for them, The page would have been
6362 probed by the saving of FP/LR either by this function or any callees. If
6363 we don't have any callees then we won't have more stack adjustments and so
6364 are still safe. */
6365 if (residual)
6366 {
6367 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6368 /* If we're doing final adjustments, and we've done any full page
6369 allocations then any residual needs to be probed. */
6370 if (final_adjustment_p && rounded_size != 0)
6371 min_probe_threshold = 0;
6372 /* If doing a small final adjustment, we always probe at offset 0.
6373 This is done to avoid issues when LR is not at position 0 or when
6374 the final adjustment is smaller than the probing offset. */
6375 else if (final_adjustment_p && rounded_size == 0)
6376 residual_probe_offset = 0;
6377
6378 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6379 if (residual >= min_probe_threshold)
6380 {
6381 if (dump_file)
6382 fprintf (dump_file,
6383 "Stack clash AArch64 prologue residuals: "
6384 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6385 "\n", residual);
6386
6387 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6388 residual_probe_offset));
6389 emit_insn (gen_blockage ());
6390 }
6391 }
6392 }
6393
6394 /* Return 1 if the register is used by the epilogue. We need to say the
6395 return register is used, but only after epilogue generation is complete.
6396 Note that in the case of sibcalls, the values "used by the epilogue" are
6397 considered live at the start of the called function.
6398
6399 For SIMD functions we need to return 1 for FP registers that are saved and
6400 restored by a function but are not zero in call_used_regs. If we do not do
6401 this, optimizations may remove the restore of the register. */
6402
6403 int
6404 aarch64_epilogue_uses (int regno)
6405 {
6406 if (epilogue_completed)
6407 {
6408 if (regno == LR_REGNUM)
6409 return 1;
6410 }
6411 return 0;
6412 }
6413
6414 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6415 is saved at BASE + OFFSET. */
6416
6417 static void
6418 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6419 rtx base, poly_int64 offset)
6420 {
6421 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6422 add_reg_note (insn, REG_CFA_EXPRESSION,
6423 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6424 }
6425
6426 /* AArch64 stack frames generated by this compiler look like:
6427
6428 +-------------------------------+
6429 | |
6430 | incoming stack arguments |
6431 | |
6432 +-------------------------------+
6433 | | <-- incoming stack pointer (aligned)
6434 | callee-allocated save area |
6435 | for register varargs |
6436 | |
6437 +-------------------------------+
6438 | local variables | <-- frame_pointer_rtx
6439 | |
6440 +-------------------------------+
6441 | padding | \
6442 +-------------------------------+ |
6443 | callee-saved registers | | frame.saved_regs_size
6444 +-------------------------------+ |
6445 | LR' | |
6446 +-------------------------------+ |
6447 | FP' | / <- hard_frame_pointer_rtx (aligned)
6448 +-------------------------------+
6449 | dynamic allocation |
6450 +-------------------------------+
6451 | padding |
6452 +-------------------------------+
6453 | outgoing stack arguments | <-- arg_pointer
6454 | |
6455 +-------------------------------+
6456 | | <-- stack_pointer_rtx (aligned)
6457
6458 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6459 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6460 unchanged.
6461
6462 By default for stack-clash we assume the guard is at least 64KB, but this
6463 value is configurable to either 4KB or 64KB. We also force the guard size to
6464 be the same as the probing interval and both values are kept in sync.
6465
6466 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6467 on the guard size) of stack space without probing.
6468
6469 When probing is needed, we emit a probe at the start of the prologue
6470 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6471
6472 We have to track how much space has been allocated, and the only stores
6473 to the stack that we track as implicit probes are the FP/LR stores.
6474
6475 For outgoing arguments we probe if the size is larger than 1KB, such that
6476 the ABI specified buffer is maintained for the next callee.
6477
6478 The following registers are reserved during frame layout and should not be
6479 used for any other purpose:
6480
6481 - r11: Used by stack clash protection when SVE is enabled.
6482 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6483 - r14 and r15: Used for speculation tracking.
6484 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6485 - r30(LR), r29(FP): Used by standard frame layout.
6486
6487 These registers must be avoided in frame layout related code unless the
6488 explicit intention is to interact with one of the features listed above. */
6489
6490 /* Generate the prologue instructions for entry into a function.
6491 Establish the stack frame by decreasing the stack pointer with a
6492 properly calculated size and, if necessary, create a frame record
6493 filled with the values of LR and previous frame pointer. The
6494 current FP is also set up if it is in use. */
6495
6496 void
6497 aarch64_expand_prologue (void)
6498 {
6499 poly_int64 frame_size = cfun->machine->frame.frame_size;
6500 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6501 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6502 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6503 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6504 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6505 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6506 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6507 rtx_insn *insn;
6508
6509 /* Sign return address for functions. */
6510 if (aarch64_return_address_signing_enabled ())
6511 {
6512 switch (aarch64_ra_sign_key)
6513 {
6514 case AARCH64_KEY_A:
6515 insn = emit_insn (gen_paciasp ());
6516 break;
6517 case AARCH64_KEY_B:
6518 insn = emit_insn (gen_pacibsp ());
6519 break;
6520 default:
6521 gcc_unreachable ();
6522 }
6523 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6524 RTX_FRAME_RELATED_P (insn) = 1;
6525 }
6526
6527 if (flag_stack_usage_info)
6528 current_function_static_stack_size = constant_lower_bound (frame_size);
6529
6530 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6531 {
6532 if (crtl->is_leaf && !cfun->calls_alloca)
6533 {
6534 if (maybe_gt (frame_size, PROBE_INTERVAL)
6535 && maybe_gt (frame_size, get_stack_check_protect ()))
6536 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6537 (frame_size
6538 - get_stack_check_protect ()));
6539 }
6540 else if (maybe_gt (frame_size, 0))
6541 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6542 }
6543
6544 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6545 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6546
6547 /* In theory we should never have both an initial adjustment
6548 and a callee save adjustment. Verify that is the case since the
6549 code below does not handle it for -fstack-clash-protection. */
6550 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6551
6552 /* Will only probe if the initial adjustment is larger than the guard
6553 less the amount of the guard reserved for use by the caller's
6554 outgoing args. */
6555 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6556 true, false);
6557
6558 if (callee_adjust != 0)
6559 aarch64_push_regs (reg1, reg2, callee_adjust);
6560
6561 if (emit_frame_chain)
6562 {
6563 poly_int64 reg_offset = callee_adjust;
6564 if (callee_adjust == 0)
6565 {
6566 reg1 = R29_REGNUM;
6567 reg2 = R30_REGNUM;
6568 reg_offset = callee_offset;
6569 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6570 }
6571 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6572 stack_pointer_rtx, callee_offset,
6573 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6574 if (frame_pointer_needed && !frame_size.is_constant ())
6575 {
6576 /* Variable-sized frames need to describe the save slot
6577 address using DW_CFA_expression rather than DW_CFA_offset.
6578 This means that, without taking further action, the
6579 locations of the registers that we've already saved would
6580 remain based on the stack pointer even after we redefine
6581 the CFA based on the frame pointer. We therefore need new
6582 DW_CFA_expressions to re-express the save slots with addresses
6583 based on the frame pointer. */
6584 rtx_insn *insn = get_last_insn ();
6585 gcc_assert (RTX_FRAME_RELATED_P (insn));
6586
6587 /* Add an explicit CFA definition if this was previously
6588 implicit. */
6589 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6590 {
6591 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6592 callee_offset);
6593 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6594 gen_rtx_SET (hard_frame_pointer_rtx, src));
6595 }
6596
6597 /* Change the save slot expressions for the registers that
6598 we've already saved. */
6599 reg_offset -= callee_offset;
6600 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6601 reg_offset + UNITS_PER_WORD);
6602 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6603 reg_offset);
6604 }
6605 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6606 }
6607
6608 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6609 callee_adjust != 0 || emit_frame_chain);
6610 if (crtl->abi->id () == ARM_PCS_SIMD)
6611 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6612 callee_adjust != 0 || emit_frame_chain);
6613 else
6614 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6615 callee_adjust != 0 || emit_frame_chain);
6616
6617 /* We may need to probe the final adjustment if it is larger than the guard
6618 that is assumed by the callee. */
6619 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6620 !frame_pointer_needed, true);
6621 }
6622
6623 /* Return TRUE if we can use a simple_return insn.
6624
6625 This function checks whether the callee saved stack is empty, which
6626 means no restore actions are needed. The pro_and_epilogue will use
6627 this to check whether shrink-wrapping opt is feasible. */
6628
6629 bool
6630 aarch64_use_return_insn_p (void)
6631 {
6632 if (!reload_completed)
6633 return false;
6634
6635 if (crtl->profile)
6636 return false;
6637
6638 return known_eq (cfun->machine->frame.frame_size, 0);
6639 }
6640
6641 /* Generate the epilogue instructions for returning from a function.
6642 This is almost exactly the reverse of the prolog sequence, except
6643 that we need to insert barriers to avoid scheduling loads that read
6644 from a deallocated stack, and we optimize the unwind records by
6645 emitting them all together if possible. */
6646 void
6647 aarch64_expand_epilogue (bool for_sibcall)
6648 {
6649 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6650 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6651 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6652 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6653 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6654 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6655 rtx cfi_ops = NULL;
6656 rtx_insn *insn;
6657 /* A stack clash protection prologue may not have left EP0_REGNUM or
6658 EP1_REGNUM in a usable state. The same is true for allocations
6659 with an SVE component, since we then need both temporary registers
6660 for each allocation. For stack clash we are in a usable state if
6661 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6662 HOST_WIDE_INT guard_size
6663 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6664 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6665
6666 /* We can re-use the registers when the allocation amount is smaller than
6667 guard_size - guard_used_by_caller because we won't be doing any probes
6668 then. In such situations the register should remain live with the correct
6669 value. */
6670 bool can_inherit_p = (initial_adjust.is_constant ()
6671 && final_adjust.is_constant ())
6672 && (!flag_stack_clash_protection
6673 || known_lt (initial_adjust,
6674 guard_size - guard_used_by_caller));
6675
6676 /* We need to add memory barrier to prevent read from deallocated stack. */
6677 bool need_barrier_p
6678 = maybe_ne (get_frame_size ()
6679 + cfun->machine->frame.saved_varargs_size, 0);
6680
6681 /* Emit a barrier to prevent loads from a deallocated stack. */
6682 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6683 || cfun->calls_alloca
6684 || crtl->calls_eh_return)
6685 {
6686 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6687 need_barrier_p = false;
6688 }
6689
6690 /* Restore the stack pointer from the frame pointer if it may not
6691 be the same as the stack pointer. */
6692 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6693 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6694 if (frame_pointer_needed
6695 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6696 /* If writeback is used when restoring callee-saves, the CFA
6697 is restored on the instruction doing the writeback. */
6698 aarch64_add_offset (Pmode, stack_pointer_rtx,
6699 hard_frame_pointer_rtx, -callee_offset,
6700 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6701 else
6702 /* The case where we need to re-use the register here is very rare, so
6703 avoid the complicated condition and just always emit a move if the
6704 immediate doesn't fit. */
6705 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6706
6707 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6708 callee_adjust != 0, &cfi_ops);
6709 if (crtl->abi->id () == ARM_PCS_SIMD)
6710 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6711 callee_adjust != 0, &cfi_ops);
6712 else
6713 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6714 callee_adjust != 0, &cfi_ops);
6715
6716 if (need_barrier_p)
6717 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6718
6719 if (callee_adjust != 0)
6720 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6721
6722 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6723 {
6724 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6725 insn = get_last_insn ();
6726 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6727 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6728 RTX_FRAME_RELATED_P (insn) = 1;
6729 cfi_ops = NULL;
6730 }
6731
6732 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6733 restrict the emit_move optimization to leaf functions. */
6734 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6735 (!can_inherit_p || !crtl->is_leaf
6736 || df_regs_ever_live_p (EP0_REGNUM)));
6737
6738 if (cfi_ops)
6739 {
6740 /* Emit delayed restores and reset the CFA to be SP. */
6741 insn = get_last_insn ();
6742 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6743 REG_NOTES (insn) = cfi_ops;
6744 RTX_FRAME_RELATED_P (insn) = 1;
6745 }
6746
6747 /* We prefer to emit the combined return/authenticate instruction RETAA,
6748 however there are three cases in which we must instead emit an explicit
6749 authentication instruction.
6750
6751 1) Sibcalls don't return in a normal way, so if we're about to call one
6752 we must authenticate.
6753
6754 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6755 generating code for !TARGET_ARMV8_3 we can't use it and must
6756 explicitly authenticate.
6757
6758 3) On an eh_return path we make extra stack adjustments to update the
6759 canonical frame address to be the exception handler's CFA. We want
6760 to authenticate using the CFA of the function which calls eh_return.
6761 */
6762 if (aarch64_return_address_signing_enabled ()
6763 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6764 {
6765 switch (aarch64_ra_sign_key)
6766 {
6767 case AARCH64_KEY_A:
6768 insn = emit_insn (gen_autiasp ());
6769 break;
6770 case AARCH64_KEY_B:
6771 insn = emit_insn (gen_autibsp ());
6772 break;
6773 default:
6774 gcc_unreachable ();
6775 }
6776 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6777 RTX_FRAME_RELATED_P (insn) = 1;
6778 }
6779
6780 /* Stack adjustment for exception handler. */
6781 if (crtl->calls_eh_return && !for_sibcall)
6782 {
6783 /* We need to unwind the stack by the offset computed by
6784 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6785 to be SP; letting the CFA move during this adjustment
6786 is just as correct as retaining the CFA from the body
6787 of the function. Therefore, do nothing special. */
6788 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6789 }
6790
6791 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6792 if (!for_sibcall)
6793 emit_jump_insn (ret_rtx);
6794 }
6795
6796 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6797 normally or return to a previous frame after unwinding.
6798
6799 An EH return uses a single shared return sequence. The epilogue is
6800 exactly like a normal epilogue except that it has an extra input
6801 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6802 that must be applied after the frame has been destroyed. An extra label
6803 is inserted before the epilogue which initializes this register to zero,
6804 and this is the entry point for a normal return.
6805
6806 An actual EH return updates the return address, initializes the stack
6807 adjustment and jumps directly into the epilogue (bypassing the zeroing
6808 of the adjustment). Since the return address is typically saved on the
6809 stack when a function makes a call, the saved LR must be updated outside
6810 the epilogue.
6811
6812 This poses problems as the store is generated well before the epilogue,
6813 so the offset of LR is not known yet. Also optimizations will remove the
6814 store as it appears dead, even after the epilogue is generated (as the
6815 base or offset for loading LR is different in many cases).
6816
6817 To avoid these problems, this implementation forces the frame pointer
6818 in eh_return functions so that the location of LR is fixed and known early.
6819 It also marks the store volatile, so no optimization is permitted to
6820 remove the store. */
6821 rtx
6822 aarch64_eh_return_handler_rtx (void)
6823 {
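/* Descriptive note (added): with the frame pointer forced as described
   above, the FP/LR pair is normally saved at the hard frame pointer,
   so the MEM built below is the LR save slot at [x29, #8]. */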
6824 rtx tmp = gen_frame_mem (Pmode,
6825 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6826
6827 /* Mark the store volatile, so no optimization is permitted to remove it. */
6828 MEM_VOLATILE_P (tmp) = true;
6829 return tmp;
6830 }
6831
6832 /* Output code to add DELTA to the first argument, and then jump
6833 to FUNCTION. Used for C++ multiple inheritance. */
6834 static void
6835 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6836 HOST_WIDE_INT delta,
6837 HOST_WIDE_INT vcall_offset,
6838 tree function)
6839 {
6840 /* The this pointer is always in x0. Note that this differs from
6841 Arm where the this pointer may be bumped to r1 if r0 is required
6842 to return a pointer to an aggregate. On AArch64 a result value
6843 pointer will be in x8. */
6844 int this_regno = R0_REGNUM;
6845 rtx this_rtx, temp0, temp1, addr, funexp;
6846 rtx_insn *insn;
6847 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6848
6849 if (aarch64_bti_enabled ())
6850 emit_insn (gen_bti_c());
6851
6852 reload_completed = 1;
6853 emit_note (NOTE_INSN_PROLOGUE_END);
6854
6855 this_rtx = gen_rtx_REG (Pmode, this_regno);
6856 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6857 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6858
6859 if (vcall_offset == 0)
6860 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6861 else
6862 {
6863 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6864
6865 addr = this_rtx;
6866 if (delta != 0)
6867 {
6868 if (delta >= -256 && delta < 256)
6869 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6870 plus_constant (Pmode, this_rtx, delta));
6871 else
6872 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6873 temp1, temp0, false);
6874 }
6875
6876 if (Pmode == ptr_mode)
6877 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6878 else
6879 aarch64_emit_move (temp0,
6880 gen_rtx_ZERO_EXTEND (Pmode,
6881 gen_rtx_MEM (ptr_mode, addr)));
6882
6883 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6884 addr = plus_constant (Pmode, temp0, vcall_offset);
6885 else
6886 {
6887 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6888 Pmode);
6889 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6890 }
6891
6892 if (Pmode == ptr_mode)
6893 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6894 else
6895 aarch64_emit_move (temp1,
6896 gen_rtx_SIGN_EXTEND (Pmode,
6897 gen_rtx_MEM (ptr_mode, addr)));
6898
6899 emit_insn (gen_add2_insn (this_rtx, temp1));
6900 }
6901
6902 /* Generate a tail call to the target function. */
6903 if (!TREE_USED (function))
6904 {
6905 assemble_external (function);
6906 TREE_USED (function) = 1;
6907 }
6908 funexp = XEXP (DECL_RTL (function), 0);
6909 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6910 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
6911 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
6912 SIBLING_CALL_P (insn) = 1;
6913
6914 insn = get_insns ();
6915 shorten_branches (insn);
6916
6917 assemble_start_function (thunk, fnname);
6918 final_start_function (insn, file, 1);
6919 final (insn, file, 1);
6920 final_end_function ();
6921 assemble_end_function (thunk, fnname);
6922
6923 /* Stop pretending to be a post-reload pass. */
6924 reload_completed = 0;
6925 }
6926
6927 static bool
6928 aarch64_tls_referenced_p (rtx x)
6929 {
6930 if (!TARGET_HAVE_TLS)
6931 return false;
6932 subrtx_iterator::array_type array;
6933 FOR_EACH_SUBRTX (iter, array, x, ALL)
6934 {
6935 const_rtx x = *iter;
6936 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6937 return true;
6938 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6939 TLS offsets, not real symbol references. */
6940 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6941 iter.skip_subrtxes ();
6942 }
6943 return false;
6944 }
6945
6946
6947 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6948 a left shift of 0 or 12 bits. */
6949 bool
6950 aarch64_uimm12_shift (HOST_WIDE_INT val)
6951 {
6952 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6953 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6954 );
6955 }
6956
6957 /* Return the largest value no greater than VAL that can be encoded as a
6958 12-bit unsigned immediate with a left shift of 0 or 12. */
6959 static HOST_WIDE_INT
6960 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6961 {
6962 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6963 handle correctly. */
6964 gcc_assert ((val & 0xffffff) == val);
6965
6966 if (((val & 0xfff) << 0) == val)
6967 return val;
6968
6969 return val & (0xfff << 12);
6970 }
6971
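/* Illustrative sketch, not part of GCC: a standalone example of how a
   24-bit constant is split into the two forms accepted by
   aarch64_uimm12_shift above. The name example_split_uimm24 and its
   interface are hypothetical. For VAL = 0x123456 the split is
   0x123000 + 0x456, i.e. "add x0, x0, #0x123, lsl #12" followed by
   "add x0, x0, #0x456". */
static unsigned long long
example_split_uimm24 (unsigned long long val, unsigned long long *low)
{
  /* VAL is assumed to fit in 24 bits. */
  unsigned long long high = val & (0xfffULL << 12);
  *low = val - high;   /* Fits in the unshifted 12-bit form. */
  return high;         /* Fits in the 12-bit form shifted left by 12. */
}
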
6972 /* Return true if val is an immediate that can be loaded into a
6973 register by a MOVZ instruction. */
6974 static bool
6975 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6976 {
6977 if (GET_MODE_SIZE (mode) > 4)
6978 {
6979 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6980 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6981 return 1;
6982 }
6983 else
6984 {
6985 /* Ignore sign extension. */
6986 val &= (HOST_WIDE_INT) 0xffffffff;
6987 }
6988 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6989 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6990 }
6991
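/* Illustrative sketch, not part of GCC: a standalone restatement of the
   DImode behaviour of aarch64_movw_imm above. MOVZ can materialize any
   value whose set bits fall inside a single aligned 16-bit chunk, e.g.
   0x12340000 is "movz w0, #0x1234, lsl #16". The name
   example_is_movz_imm64 is hypothetical. */
static int
example_is_movz_imm64 (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffULL << shift)) == 0)
      return 1;
  return 0;
}
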
6992 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6993 64-bit (DImode) integer. */
6994
6995 static unsigned HOST_WIDE_INT
6996 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6997 {
6998 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6999 while (size < 64)
7000 {
7001 val &= (HOST_WIDE_INT_1U << size) - 1;
7002 val |= val << size;
7003 size *= 2;
7004 }
7005 return val;
7006 }
7007
7008 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7009
7010 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7011 {
7012 0x0000000100000001ull,
7013 0x0001000100010001ull,
7014 0x0101010101010101ull,
7015 0x1111111111111111ull,
7016 0x5555555555555555ull,
7017 };
7018
7019
7020 /* Return true if val is a valid bitmask immediate. */
7021
7022 bool
7023 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7024 {
7025 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7026 int bits;
7027
7028 /* Check for a single sequence of one bits and return quickly if so.
7029 The special cases of all ones and all zeroes return false. */
7030 val = aarch64_replicate_bitmask_imm (val_in, mode);
7031 tmp = val + (val & -val);
7032
7033 if (tmp == (tmp & -tmp))
7034 return (val + 1) > 1;
7035
7036 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7037 if (mode == SImode)
7038 val = (val << 32) | (val & 0xffffffff);
7039
7040 /* Invert if the immediate doesn't start with a zero bit - this means we
7041 only need to search for sequences of one bits. */
7042 if (val & 1)
7043 val = ~val;
7044
7045 /* Find the first set bit and set tmp to val with the first sequence of one
7046 bits removed. Return success if there is a single sequence of ones. */
7047 first_one = val & -val;
7048 tmp = val & (val + first_one);
7049
7050 if (tmp == 0)
7051 return true;
7052
7053 /* Find the next set bit and compute the difference in bit position. */
7054 next_one = tmp & -tmp;
7055 bits = clz_hwi (first_one) - clz_hwi (next_one);
7056 mask = val ^ tmp;
7057
7058 /* Check that the bit position difference is a power of 2, and that the first
7059 sequence of one bits fits within 'bits' bits. */
7060 if ((mask >> bits) != 0 || bits != (bits & -bits))
7061 return false;
7062
7063 /* Check that the sequence of one bits is repeated 64/bits times. */
7064 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7065 }
7066
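/* Illustrative sketch, not part of GCC: a brute-force reference
   definition of the bitmask immediates recognized above, assuming the
   architectural encoding (a run of ones rotated within an element of
   2/4/8/16/32/64 bits and replicated to fill 64 bits; all-zeros and
   all-ones are excluded). It expects the already-replicated 64-bit
   value and is only meant for cross-checking the faster test above on
   a host; the name example_is_bitmask_imm64 is hypothetical. */
static int
example_is_bitmask_imm64 (unsigned long long val)
{
  for (int esize = 2; esize <= 64; esize *= 2)
    for (int ones = 1; ones < esize; ones++)
      for (int rot = 0; rot < esize; rot++)
        {
          unsigned long long emask
            = esize == 64 ? ~0ULL : (1ULL << esize) - 1;
          /* A run of ONES low bits, rotated right by ROT within the
             element. */
          unsigned long long elt = (1ULL << ones) - 1;
          if (rot != 0)
            elt = ((elt >> rot) | (elt << (esize - rot))) & emask;
          /* Replicate the element to fill 64 bits. */
          unsigned long long rep = 0;
          for (int i = 0; i < 64; i += esize)
            rep |= elt << i;
          if (rep == val)
            return 1;
        }
  return 0;
}
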
7067 /* Create a mask of ones covering the range from the lowest set bit to the
7068 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
7069
7070 unsigned HOST_WIDE_INT
7071 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7072 {
7073 int lowest_bit_set = ctz_hwi (val_in);
7074 int highest_bit_set = floor_log2 (val_in);
7075 gcc_assert (val_in != 0);
7076
7077 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7078 (HOST_WIDE_INT_1U << lowest_bit_set));
7079 }
7080
7081 /* Create a constant in which all bits outside the range from the lowest
7082 set bit to the highest set bit of VAL_IN are set to 1. */
7083
7084 unsigned HOST_WIDE_INT
7085 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7086 {
7087 return val_in | ~aarch64_and_split_imm1 (val_in);
7088 }
7089
7090 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7091
7092 bool
7093 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7094 {
7095 scalar_int_mode int_mode;
7096 if (!is_a <scalar_int_mode> (mode, &int_mode))
7097 return false;
7098
7099 if (aarch64_bitmask_imm (val_in, int_mode))
7100 return false;
7101
7102 if (aarch64_move_imm (val_in, int_mode))
7103 return false;
7104
7105 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7106
7107 return aarch64_bitmask_imm (imm2, int_mode);
7108 }
7109
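/* Illustrative sketch, not part of GCC: a worked example of the split
   enabled by the three helpers above. VAL = 0x00ff00ff is neither a
   bitmask immediate nor a MOV immediate, but aarch64_and_split_imm1
   gives 0x00ffffff (ones over the set span), aarch64_and_split_imm2
   gives 0xffffffffffff00ff (bits outside the span forced to one), both
   valid bitmask immediates, and VAL == (imm1 & imm2). The name
   example_and_split is hypothetical. */
static unsigned long long
example_and_split (unsigned long long x)
{
  /* Equivalent to x & 0x00ff00ff, but doable as two AND-immediate
     instructions. */
  return (x & 0x00ffffffULL) & 0xffffffffffff00ffULL;
}
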
7110 /* Return true if val is an immediate that can be loaded into a
7111 register in a single instruction. */
7112 bool
7113 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7114 {
7115 scalar_int_mode int_mode;
7116 if (!is_a <scalar_int_mode> (mode, &int_mode))
7117 return false;
7118
7119 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7120 return 1;
7121 return aarch64_bitmask_imm (val, int_mode);
7122 }
7123
7124 static bool
7125 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7126 {
7127 rtx base, offset;
7128
7129 if (GET_CODE (x) == HIGH)
7130 return true;
7131
7132 /* There's no way to calculate VL-based values using relocations. */
7133 subrtx_iterator::array_type array;
7134 FOR_EACH_SUBRTX (iter, array, x, ALL)
7135 if (GET_CODE (*iter) == CONST_POLY_INT)
7136 return true;
7137
7138 split_const (x, &base, &offset);
7139 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7140 {
7141 if (aarch64_classify_symbol (base, INTVAL (offset))
7142 != SYMBOL_FORCE_TO_MEM)
7143 return true;
7144 else
7145 /* Avoid generating a 64-bit relocation in ILP32; leave
7146 to aarch64_expand_mov_immediate to handle it properly. */
7147 return mode != ptr_mode;
7148 }
7149
7150 return aarch64_tls_referenced_p (x);
7151 }
7152
7153 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7154 The expansion for a table switch is quite expensive due to the number
7155 of instructions, the table lookup and the hard-to-predict indirect jump.
7156 When optimizing for speed at -O3, use the per-core tuning if it is set;
7157 otherwise use tables for more than 16 cases as a tradeoff between size
7158 and performance. When optimizing for size, use the default setting. */
7159
7160 static unsigned int
7161 aarch64_case_values_threshold (void)
7162 {
7163 /* Use the specified limit for the number of cases before using jump
7164 tables at higher optimization levels. */
7165 if (optimize > 2
7166 && selected_cpu->tune->max_case_values != 0)
7167 return selected_cpu->tune->max_case_values;
7168 else
7169 return optimize_size ? default_case_values_threshold () : 17;
7170 }
7171
7172 /* Return true if register REGNO is a valid index register.
7173 STRICT_P is true if REG_OK_STRICT is in effect. */
7174
7175 bool
7176 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7177 {
7178 if (!HARD_REGISTER_NUM_P (regno))
7179 {
7180 if (!strict_p)
7181 return true;
7182
7183 if (!reg_renumber)
7184 return false;
7185
7186 regno = reg_renumber[regno];
7187 }
7188 return GP_REGNUM_P (regno);
7189 }
7190
7191 /* Return true if register REGNO is a valid base register.
7192 STRICT_P is true if REG_OK_STRICT is in effect. */
7193
7194 bool
7195 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7196 {
7197 if (!HARD_REGISTER_NUM_P (regno))
7198 {
7199 if (!strict_p)
7200 return true;
7201
7202 if (!reg_renumber)
7203 return false;
7204
7205 regno = reg_renumber[regno];
7206 }
7207
7208 /* The fake registers will be eliminated to either the stack or
7209 hard frame pointer, both of which are usually valid base registers.
7210 Reload deals with the cases where the eliminated form isn't valid. */
7211 return (GP_REGNUM_P (regno)
7212 || regno == SP_REGNUM
7213 || regno == FRAME_POINTER_REGNUM
7214 || regno == ARG_POINTER_REGNUM);
7215 }
7216
7217 /* Return true if X is a valid base register.
7218 STRICT_P is true if REG_OK_STRICT is in effect. */
7219
7220 static bool
7221 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7222 {
7223 if (!strict_p
7224 && GET_CODE (x) == SUBREG
7225 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7226 x = SUBREG_REG (x);
7227
7228 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7229 }
7230
7231 /* Return true if address offset is a valid index. If it is, fill in INFO
7232 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7233
7234 static bool
7235 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7236 machine_mode mode, bool strict_p)
7237 {
7238 enum aarch64_address_type type;
7239 rtx index;
7240 int shift;
7241
7242 /* (reg:P) */
7243 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7244 && GET_MODE (x) == Pmode)
7245 {
7246 type = ADDRESS_REG_REG;
7247 index = x;
7248 shift = 0;
7249 }
7250 /* (sign_extend:DI (reg:SI)) */
7251 else if ((GET_CODE (x) == SIGN_EXTEND
7252 || GET_CODE (x) == ZERO_EXTEND)
7253 && GET_MODE (x) == DImode
7254 && GET_MODE (XEXP (x, 0)) == SImode)
7255 {
7256 type = (GET_CODE (x) == SIGN_EXTEND)
7257 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7258 index = XEXP (x, 0);
7259 shift = 0;
7260 }
7261 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7262 else if (GET_CODE (x) == MULT
7263 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7264 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7265 && GET_MODE (XEXP (x, 0)) == DImode
7266 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7267 && CONST_INT_P (XEXP (x, 1)))
7268 {
7269 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7270 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7271 index = XEXP (XEXP (x, 0), 0);
7272 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7273 }
7274 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7275 else if (GET_CODE (x) == ASHIFT
7276 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7277 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7278 && GET_MODE (XEXP (x, 0)) == DImode
7279 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7280 && CONST_INT_P (XEXP (x, 1)))
7281 {
7282 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7283 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7284 index = XEXP (XEXP (x, 0), 0);
7285 shift = INTVAL (XEXP (x, 1));
7286 }
7287 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7288 else if ((GET_CODE (x) == SIGN_EXTRACT
7289 || GET_CODE (x) == ZERO_EXTRACT)
7290 && GET_MODE (x) == DImode
7291 && GET_CODE (XEXP (x, 0)) == MULT
7292 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7293 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7294 {
7295 type = (GET_CODE (x) == SIGN_EXTRACT)
7296 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7297 index = XEXP (XEXP (x, 0), 0);
7298 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7299 if (INTVAL (XEXP (x, 1)) != 32 + shift
7300 || INTVAL (XEXP (x, 2)) != 0)
7301 shift = -1;
7302 }
7303 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7304 (const_int 0xffffffff<<shift)) */
7305 else if (GET_CODE (x) == AND
7306 && GET_MODE (x) == DImode
7307 && GET_CODE (XEXP (x, 0)) == MULT
7308 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7309 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7310 && CONST_INT_P (XEXP (x, 1)))
7311 {
7312 type = ADDRESS_REG_UXTW;
7313 index = XEXP (XEXP (x, 0), 0);
7314 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7315 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7316 shift = -1;
7317 }
7318 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7319 else if ((GET_CODE (x) == SIGN_EXTRACT
7320 || GET_CODE (x) == ZERO_EXTRACT)
7321 && GET_MODE (x) == DImode
7322 && GET_CODE (XEXP (x, 0)) == ASHIFT
7323 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7324 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7325 {
7326 type = (GET_CODE (x) == SIGN_EXTRACT)
7327 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7328 index = XEXP (XEXP (x, 0), 0);
7329 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7330 if (INTVAL (XEXP (x, 1)) != 32 + shift
7331 || INTVAL (XEXP (x, 2)) != 0)
7332 shift = -1;
7333 }
7334 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7335 (const_int 0xffffffff<<shift)) */
7336 else if (GET_CODE (x) == AND
7337 && GET_MODE (x) == DImode
7338 && GET_CODE (XEXP (x, 0)) == ASHIFT
7339 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7340 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7341 && CONST_INT_P (XEXP (x, 1)))
7342 {
7343 type = ADDRESS_REG_UXTW;
7344 index = XEXP (XEXP (x, 0), 0);
7345 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7346 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7347 shift = -1;
7348 }
7349 /* (mult:P (reg:P) (const_int scale)) */
7350 else if (GET_CODE (x) == MULT
7351 && GET_MODE (x) == Pmode
7352 && GET_MODE (XEXP (x, 0)) == Pmode
7353 && CONST_INT_P (XEXP (x, 1)))
7354 {
7355 type = ADDRESS_REG_REG;
7356 index = XEXP (x, 0);
7357 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7358 }
7359 /* (ashift:P (reg:P) (const_int shift)) */
7360 else if (GET_CODE (x) == ASHIFT
7361 && GET_MODE (x) == Pmode
7362 && GET_MODE (XEXP (x, 0)) == Pmode
7363 && CONST_INT_P (XEXP (x, 1)))
7364 {
7365 type = ADDRESS_REG_REG;
7366 index = XEXP (x, 0);
7367 shift = INTVAL (XEXP (x, 1));
7368 }
7369 else
7370 return false;
7371
7372 if (!strict_p
7373 && GET_CODE (index) == SUBREG
7374 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7375 index = SUBREG_REG (index);
7376
7377 if (aarch64_sve_data_mode_p (mode))
7378 {
7379 if (type != ADDRESS_REG_REG
7380 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7381 return false;
7382 }
7383 else
7384 {
7385 if (shift != 0
7386 && !(IN_RANGE (shift, 1, 3)
7387 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7388 return false;
7389 }
7390
7391 if (REG_P (index)
7392 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7393 {
7394 info->type = type;
7395 info->offset = index;
7396 info->shift = shift;
7397 return true;
7398 }
7399
7400 return false;
7401 }
7402
7403 /* Return true if MODE is one of the modes for which we
7404 support LDP/STP operations. */
7405
7406 static bool
7407 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7408 {
7409 return mode == SImode || mode == DImode
7410 || mode == SFmode || mode == DFmode
7411 || (aarch64_vector_mode_supported_p (mode)
7412 && (known_eq (GET_MODE_SIZE (mode), 8)
7413 || (known_eq (GET_MODE_SIZE (mode), 16)
7414 && (aarch64_tune_params.extra_tuning_flags
7415 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7416 }
7417
7418 /* Return true if REGNO is a virtual pointer register, or an eliminable
7419 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7420 include stack_pointer or hard_frame_pointer. */
7421 static bool
7422 virt_or_elim_regno_p (unsigned regno)
7423 {
7424 return ((regno >= FIRST_VIRTUAL_REGISTER
7425 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7426 || regno == FRAME_POINTER_REGNUM
7427 || regno == ARG_POINTER_REGNUM);
7428 }
7429
7430 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7431 If it is, fill in INFO appropriately. STRICT_P is true if
7432 REG_OK_STRICT is in effect. */
7433
7434 bool
7435 aarch64_classify_address (struct aarch64_address_info *info,
7436 rtx x, machine_mode mode, bool strict_p,
7437 aarch64_addr_query_type type)
7438 {
7439 enum rtx_code code = GET_CODE (x);
7440 rtx op0, op1;
7441 poly_int64 offset;
7442
7443 HOST_WIDE_INT const_size;
7444
7445 /* On BE, we use load/store pair for all large int mode load/stores.
7446 TI/TFmode may also use a load/store pair. */
7447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7448 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7449 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7450 || type == ADDR_QUERY_LDP_STP_N
7451 || mode == TImode
7452 || mode == TFmode
7453 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7454
7455 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
7456 size of the memory being loaded/stored, while the mode used for the
7457 address calculation is half of that. */
7458 if (type == ADDR_QUERY_LDP_STP_N
7459 && known_eq (GET_MODE_SIZE (mode), 16))
7460 mode = DFmode;
7461
7462 bool allow_reg_index_p = (!load_store_pair_p
7463 && (known_lt (GET_MODE_SIZE (mode), 16)
7464 || vec_flags == VEC_ADVSIMD
7465 || vec_flags & VEC_SVE_DATA));
7466
7467 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7468 [Rn, #offset, MUL VL]. */
7469 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7470 && (code != REG && code != PLUS))
7471 return false;
7472
7473 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7474 REG addressing. */
7475 if (advsimd_struct_p
7476 && !BYTES_BIG_ENDIAN
7477 && (code != POST_INC && code != REG))
7478 return false;
7479
7480 gcc_checking_assert (GET_MODE (x) == VOIDmode
7481 || SCALAR_INT_MODE_P (GET_MODE (x)));
7482
7483 switch (code)
7484 {
7485 case REG:
7486 case SUBREG:
7487 info->type = ADDRESS_REG_IMM;
7488 info->base = x;
7489 info->offset = const0_rtx;
7490 info->const_offset = 0;
7491 return aarch64_base_register_rtx_p (x, strict_p);
7492
7493 case PLUS:
7494 op0 = XEXP (x, 0);
7495 op1 = XEXP (x, 1);
7496
7497 if (! strict_p
7498 && REG_P (op0)
7499 && virt_or_elim_regno_p (REGNO (op0))
7500 && poly_int_rtx_p (op1, &offset))
7501 {
7502 info->type = ADDRESS_REG_IMM;
7503 info->base = op0;
7504 info->offset = op1;
7505 info->const_offset = offset;
7506
7507 return true;
7508 }
7509
7510 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7511 && aarch64_base_register_rtx_p (op0, strict_p)
7512 && poly_int_rtx_p (op1, &offset))
7513 {
7514 info->type = ADDRESS_REG_IMM;
7515 info->base = op0;
7516 info->offset = op1;
7517 info->const_offset = offset;
7518
7519 /* TImode and TFmode values are allowed in both pairs of X
7520 registers and individual Q registers. The available
7521 address modes are:
7522 X,X: 7-bit signed scaled offset
7523 Q: 9-bit signed offset
7524 We conservatively require an offset representable in either mode.
7525 When performing the check for pairs of X registers i.e. LDP/STP
7526 pass down DImode since that is the natural size of the LDP/STP
7527 instruction memory accesses. */
7528 if (mode == TImode || mode == TFmode)
7529 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7530 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7531 || offset_12bit_unsigned_scaled_p (mode, offset)));
7532
7533 /* A 7-bit offset check because OImode will emit an ldp/stp
7534 instruction (only big endian will get here).
7535 For ldp/stp instructions, the offset is scaled for the size of a
7536 single element of the pair. */
7537 if (mode == OImode)
7538 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7539
7540 /* Three 9/12-bit offset checks because CImode will emit three
7541 ldr/str instructions (only big endian will get here). */
7542 if (mode == CImode)
7543 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7544 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7545 offset + 32)
7546 || offset_12bit_unsigned_scaled_p (V16QImode,
7547 offset + 32)));
7548
7549 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7550 instructions (only big endian will get here). */
7551 if (mode == XImode)
7552 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7553 && aarch64_offset_7bit_signed_scaled_p (TImode,
7554 offset + 32));
7555
7556 /* Make "m" use the LD1 offset range for SVE data modes, so
7557 that pre-RTL optimizers like ivopts will work to that
7558 instead of the wider LDR/STR range. */
7559 if (vec_flags == VEC_SVE_DATA)
7560 return (type == ADDR_QUERY_M
7561 ? offset_4bit_signed_scaled_p (mode, offset)
7562 : offset_9bit_signed_scaled_p (mode, offset));
7563
7564 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7565 {
7566 poly_int64 end_offset = (offset
7567 + GET_MODE_SIZE (mode)
7568 - BYTES_PER_SVE_VECTOR);
7569 return (type == ADDR_QUERY_M
7570 ? offset_4bit_signed_scaled_p (mode, offset)
7571 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7572 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7573 end_offset)));
7574 }
7575
7576 if (vec_flags == VEC_SVE_PRED)
7577 return offset_9bit_signed_scaled_p (mode, offset);
7578
7579 if (load_store_pair_p)
7580 return ((known_eq (GET_MODE_SIZE (mode), 4)
7581 || known_eq (GET_MODE_SIZE (mode), 8)
7582 || known_eq (GET_MODE_SIZE (mode), 16))
7583 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7584 else
7585 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7586 || offset_12bit_unsigned_scaled_p (mode, offset));
7587 }
7588
7589 if (allow_reg_index_p)
7590 {
7591 /* Look for base + (scaled/extended) index register. */
7592 if (aarch64_base_register_rtx_p (op0, strict_p)
7593 && aarch64_classify_index (info, op1, mode, strict_p))
7594 {
7595 info->base = op0;
7596 return true;
7597 }
7598 if (aarch64_base_register_rtx_p (op1, strict_p)
7599 && aarch64_classify_index (info, op0, mode, strict_p))
7600 {
7601 info->base = op1;
7602 return true;
7603 }
7604 }
7605
7606 return false;
7607
7608 case POST_INC:
7609 case POST_DEC:
7610 case PRE_INC:
7611 case PRE_DEC:
7612 info->type = ADDRESS_REG_WB;
7613 info->base = XEXP (x, 0);
7614 info->offset = NULL_RTX;
7615 return aarch64_base_register_rtx_p (info->base, strict_p);
7616
7617 case POST_MODIFY:
7618 case PRE_MODIFY:
7619 info->type = ADDRESS_REG_WB;
7620 info->base = XEXP (x, 0);
7621 if (GET_CODE (XEXP (x, 1)) == PLUS
7622 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7623 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7624 && aarch64_base_register_rtx_p (info->base, strict_p))
7625 {
7626 info->offset = XEXP (XEXP (x, 1), 1);
7627 info->const_offset = offset;
7628
7629 /* TImode and TFmode values are allowed in both pairs of X
7630 registers and individual Q registers. The available
7631 address modes are:
7632 X,X: 7-bit signed scaled offset
7633 Q: 9-bit signed offset
7634 We conservatively require an offset representable in either mode.
7635 */
7636 if (mode == TImode || mode == TFmode)
7637 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7638 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7639
7640 if (load_store_pair_p)
7641 return ((known_eq (GET_MODE_SIZE (mode), 4)
7642 || known_eq (GET_MODE_SIZE (mode), 8)
7643 || known_eq (GET_MODE_SIZE (mode), 16))
7644 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7645 else
7646 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7647 }
7648 return false;
7649
7650 case CONST:
7651 case SYMBOL_REF:
7652 case LABEL_REF:
7653 /* Load literal: PC-relative constant pool entry. Only supported
7654 for SImode or larger. */
7655 info->type = ADDRESS_SYMBOLIC;
7656
7657 if (!load_store_pair_p
7658 && GET_MODE_SIZE (mode).is_constant (&const_size)
7659 && const_size >= 4)
7660 {
7661 rtx sym, addend;
7662
7663 split_const (x, &sym, &addend);
7664 return ((GET_CODE (sym) == LABEL_REF
7665 || (GET_CODE (sym) == SYMBOL_REF
7666 && CONSTANT_POOL_ADDRESS_P (sym)
7667 && aarch64_pcrelative_literal_loads)));
7668 }
7669 return false;
7670
7671 case LO_SUM:
7672 info->type = ADDRESS_LO_SUM;
7673 info->base = XEXP (x, 0);
7674 info->offset = XEXP (x, 1);
7675 if (allow_reg_index_p
7676 && aarch64_base_register_rtx_p (info->base, strict_p))
7677 {
7678 rtx sym, offs;
7679 split_const (info->offset, &sym, &offs);
7680 if (GET_CODE (sym) == SYMBOL_REF
7681 && (aarch64_classify_symbol (sym, INTVAL (offs))
7682 == SYMBOL_SMALL_ABSOLUTE))
7683 {
7684 /* The symbol and offset must be aligned to the access size. */
7685 unsigned int align;
7686
7687 if (CONSTANT_POOL_ADDRESS_P (sym))
7688 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7689 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7690 {
7691 tree exp = SYMBOL_REF_DECL (sym);
7692 align = TYPE_ALIGN (TREE_TYPE (exp));
7693 align = aarch64_constant_alignment (exp, align);
7694 }
7695 else if (SYMBOL_REF_DECL (sym))
7696 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7697 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7698 && SYMBOL_REF_BLOCK (sym) != NULL)
7699 align = SYMBOL_REF_BLOCK (sym)->alignment;
7700 else
7701 align = BITS_PER_UNIT;
7702
7703 poly_int64 ref_size = GET_MODE_SIZE (mode);
7704 if (known_eq (ref_size, 0))
7705 ref_size = GET_MODE_SIZE (DImode);
7706
7707 return (multiple_p (INTVAL (offs), ref_size)
7708 && multiple_p (align / BITS_PER_UNIT, ref_size));
7709 }
7710 }
7711 return false;
7712
7713 default:
7714 return false;
7715 }
7716 }
7717
7718 /* Return true if the address X is valid for a PRFM instruction.
7719 STRICT_P is true if we should do strict checking with
7720 aarch64_classify_address. */
7721
7722 bool
7723 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7724 {
7725 struct aarch64_address_info addr;
7726
7727 /* PRFM accepts the same addresses as DImode... */
7728 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7729 if (!res)
7730 return false;
7731
7732 /* ... except writeback forms. */
7733 return addr.type != ADDRESS_REG_WB;
7734 }
7735
7736 bool
7737 aarch64_symbolic_address_p (rtx x)
7738 {
7739 rtx offset;
7740
7741 split_const (x, &x, &offset);
7742 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7743 }
7744
7745 /* Classify the base of symbolic expression X. */
7746
7747 enum aarch64_symbol_type
7748 aarch64_classify_symbolic_expression (rtx x)
7749 {
7750 rtx offset;
7751
7752 split_const (x, &x, &offset);
7753 return aarch64_classify_symbol (x, INTVAL (offset));
7754 }
7755
7756
7757 /* Return TRUE if X is a legitimate address for accessing memory in
7758 mode MODE. */
7759 static bool
7760 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7761 {
7762 struct aarch64_address_info addr;
7763
7764 return aarch64_classify_address (&addr, x, mode, strict_p);
7765 }
7766
7767 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7768 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7769 bool
7770 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7771 aarch64_addr_query_type type)
7772 {
7773 struct aarch64_address_info addr;
7774
7775 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7776 }
7777
7778 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7779
7780 static bool
7781 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7782 poly_int64 orig_offset,
7783 machine_mode mode)
7784 {
7785 HOST_WIDE_INT size;
7786 if (GET_MODE_SIZE (mode).is_constant (&size))
7787 {
7788 HOST_WIDE_INT const_offset, second_offset;
7789
7790 /* A general SVE offset is A * VQ + B. Remove the A component from
7791 coefficient 0 in order to get the constant B. */
7792 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7793
7794 /* Split an out-of-range address displacement into a base and
7795 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7796 range otherwise to increase opportunities for sharing the base
7797 address between accesses of different sizes. Unaligned accesses use
7798 the signed 9-bit range; TImode/TFmode use the intersection of the
7799 signed scaled 7-bit and signed 9-bit offset ranges. */
7800 if (mode == TImode || mode == TFmode)
7801 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7802 else if ((const_offset & (size - 1)) != 0)
7803 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7804 else
7805 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7806
7807 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7808 return false;
7809
7810 /* Split the offset into second_offset and the rest. */
7811 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7812 *offset2 = gen_int_mode (second_offset, Pmode);
7813 return true;
7814 }
7815 else
7816 {
7817 /* Get the mode we should use as the basis of the range. For structure
7818 modes this is the mode of one vector. */
7819 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7820 machine_mode step_mode
7821 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7822
7823 /* Get the "mul vl" multiplier we'd like to use. */
7824 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7825 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7826 if (vec_flags & VEC_SVE_DATA)
7827 /* LDR supports a 9-bit range, but the move patterns for
7828 structure modes require all vectors to be in range of the
7829 same base. The simplest way of accomodating that while still
7830 promoting reuse of anchor points between different modes is
7831 to use an 8-bit range unconditionally. */
7832 vnum = ((vnum + 128) & 255) - 128;
7833 else
7834 /* Predicates are only handled singly, so we might as well use
7835 the full range. */
7836 vnum = ((vnum + 256) & 511) - 256;
7837 if (vnum == 0)
7838 return false;
7839
7840 /* Convert the "mul vl" multiplier into a byte offset. */
7841 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7842 if (known_eq (second_offset, orig_offset))
7843 return false;
7844
7845 /* Split the offset into second_offset and the rest. */
7846 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7847 *offset2 = gen_int_mode (second_offset, Pmode);
7848 return true;
7849 }
7850 }
7851
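/* Illustrative sketch, not part of GCC: the constant-size path above,
   specialized to an aligned 4-byte access with a non-negative offset.
   For ORIG_OFFSET = 0x12344 it produces SECOND_OFFSET = 0x2344 (within
   the scaled 16KB range) and an anchor of 0x10000 that is added to the
   base address separately. The name example_split_word_offset is
   hypothetical. */
static void
example_split_word_offset (long long orig_offset,
                           long long *anchor, long long *second)
{
  *second = orig_offset & 0x3ffc;   /* In-range, word-aligned part. */
  *anchor = orig_offset - *second;  /* Folded into the base address. */
}
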
7852 /* Return the binary representation of floating point constant VALUE in INTVAL.
7853 If the value cannot be converted, return false without setting INTVAL.
7854 The conversion is done in the mode of VALUE. */
7855 bool
7856 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7857 {
7858
7859 /* We make a general exception for 0. */
7860 if (aarch64_float_const_zero_rtx_p (value))
7861 {
7862 *intval = 0;
7863 return true;
7864 }
7865
7866 scalar_float_mode mode;
7867 if (GET_CODE (value) != CONST_DOUBLE
7868 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7869 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7870 /* Only support up to DF mode. */
7871 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7872 return false;
7873
7874 unsigned HOST_WIDE_INT ival = 0;
7875
7876 long res[2];
7877 real_to_target (res,
7878 CONST_DOUBLE_REAL_VALUE (value),
7879 REAL_MODE_FORMAT (mode));
7880
7881 if (mode == DFmode)
7882 {
7883 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7884 ival = zext_hwi (res[order], 32);
7885 ival |= (zext_hwi (res[1 - order], 32) << 32);
7886 }
7887 else
7888 ival = zext_hwi (res[0], 32);
7889
7890 *intval = ival;
7891 return true;
7892 }
7893
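/* Illustrative sketch, not part of GCC: the target-independent dance
   above amounts to viewing the constant's bits as an integer. On a
   host with 64-bit IEEE doubles the same value can be obtained with a
   simple type pun, e.g. 1.0 maps to 0x3ff0000000000000. The name
   example_double_bits is hypothetical. */
static unsigned long long
example_double_bits (double d)
{
  union { double d; unsigned long long u; } pun;
  pun.d = d;
  return pun.u;
}
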
7894 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7895 single MOV(+MOVK) followed by an FMOV. */
7896 bool
7897 aarch64_float_const_rtx_p (rtx x)
7898 {
7899 machine_mode mode = GET_MODE (x);
7900 if (mode == VOIDmode)
7901 return false;
7902
7903 /* Determine whether it's cheaper to write float constants as
7904 mov/movk pairs rather than ldr/adrp pairs. */
7905 unsigned HOST_WIDE_INT ival;
7906
7907 if (GET_CODE (x) == CONST_DOUBLE
7908 && SCALAR_FLOAT_MODE_P (mode)
7909 && aarch64_reinterpret_float_as_int (x, &ival))
7910 {
7911 scalar_int_mode imode = (mode == HFmode
7912 ? SImode
7913 : int_mode_for_mode (mode).require ());
7914 int num_instr = aarch64_internal_mov_immediate
7915 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7916 return num_instr < 3;
7917 }
7918
7919 return false;
7920 }
7921
7922 /* Return TRUE if rtx X is the immediate constant 0.0. */
7923 bool
7924 aarch64_float_const_zero_rtx_p (rtx x)
7925 {
7926 if (GET_MODE (x) == VOIDmode)
7927 return false;
7928
7929 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7930 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7931 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7932 }
7933
7934 /* Return TRUE if rtx X is an immediate constant that fits in a single
7935 MOVI immediate operation. */
7936 bool
7937 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7938 {
7939 if (!TARGET_SIMD)
7940 return false;
7941
7942 machine_mode vmode;
7943 scalar_int_mode imode;
7944 unsigned HOST_WIDE_INT ival;
7945
7946 if (GET_CODE (x) == CONST_DOUBLE
7947 && SCALAR_FLOAT_MODE_P (mode))
7948 {
7949 if (!aarch64_reinterpret_float_as_int (x, &ival))
7950 return false;
7951
7952 /* We make a general exception for 0. */
7953 if (aarch64_float_const_zero_rtx_p (x))
7954 return true;
7955
7956 imode = int_mode_for_mode (mode).require ();
7957 }
7958 else if (GET_CODE (x) == CONST_INT
7959 && is_a <scalar_int_mode> (mode, &imode))
7960 ival = INTVAL (x);
7961 else
7962 return false;
7963
7964 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7965 a 128-bit vector mode. */
7966 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7967
7968 vmode = aarch64_simd_container_mode (imode, width);
7969 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7970
7971 return aarch64_simd_valid_immediate (v_op, NULL);
7972 }
7973
7974
7975 /* Return the fixed registers used for condition codes. */
7976
7977 static bool
7978 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7979 {
7980 *p1 = CC_REGNUM;
7981 *p2 = INVALID_REGNUM;
7982 return true;
7983 }
7984
7985 /* This function is used by the call expanders of the machine description.
7986 RESULT is the register in which the result is returned. It's NULL for
7987 "call" and "sibcall".
7988 MEM is the location of the function call.
7989 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
7990 SIBCALL indicates whether this function call is a normal call or a sibling
7991 call, and a different pattern is generated accordingly. */
7992
7993 void
7994 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
7995 {
7996 rtx call, callee, tmp;
7997 rtvec vec;
7998 machine_mode mode;
7999
8000 gcc_assert (MEM_P (mem));
8001 callee = XEXP (mem, 0);
8002 mode = GET_MODE (callee);
8003 gcc_assert (mode == Pmode);
8004
8005 /* Decide if we should generate indirect calls by loading the
8006 address of the callee into a register before performing
8007 the branch-and-link. */
8008 if (SYMBOL_REF_P (callee)
8009 ? (aarch64_is_long_call_p (callee)
8010 || aarch64_is_noplt_call_p (callee))
8011 : !REG_P (callee))
8012 XEXP (mem, 0) = force_reg (mode, callee);
8013
8014 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8015
8016 if (result != NULL_RTX)
8017 call = gen_rtx_SET (result, call);
8018
8019 if (sibcall)
8020 tmp = ret_rtx;
8021 else
8022 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8023
8024 gcc_assert (CONST_INT_P (callee_abi));
8025 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8026 UNSPEC_CALLEE_ABI);
8027
8028 vec = gen_rtvec (3, call, callee_abi, tmp);
8029 call = gen_rtx_PARALLEL (VOIDmode, vec);
8030
8031 aarch64_emit_call_insn (call);
8032 }
8033
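/* Illustrative note (added): for a plain call the PARALLEL built above
   has the shape

     (parallel [(call (mem:DI ...) (const_int 0))
                (unspec:DI [(const_int <abi>)] UNSPEC_CALLEE_ABI)
                (clobber (reg:DI LR_REGNUM))])

   with the CALL wrapped in a SET of RESULT when a value is returned,
   and the clobber of LR replaced by (return) for sibling calls. */
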
8034 /* Emit call insn with PAT and do aarch64-specific handling. */
8035
8036 void
8037 aarch64_emit_call_insn (rtx pat)
8038 {
8039 rtx insn = emit_call_insn (pat);
8040
8041 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8042 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8043 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8044 }
8045
8046 machine_mode
8047 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8048 {
8049 machine_mode mode_x = GET_MODE (x);
8050 rtx_code code_x = GET_CODE (x);
8051
8052 /* Floating point compares return CCFPEmode for comparisons that must raise
8053 an exception for a quiet NaN (LT, LE, GT, GE, LTGT), and CCFPmode otherwise. */
8054 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8055 {
8056 switch (code)
8057 {
8058 case EQ:
8059 case NE:
8060 case UNORDERED:
8061 case ORDERED:
8062 case UNLT:
8063 case UNLE:
8064 case UNGT:
8065 case UNGE:
8066 case UNEQ:
8067 return CCFPmode;
8068
8069 case LT:
8070 case LE:
8071 case GT:
8072 case GE:
8073 case LTGT:
8074 return CCFPEmode;
8075
8076 default:
8077 gcc_unreachable ();
8078 }
8079 }
8080
8081 /* Equality comparisons of short modes against zero can be performed
8082 using the TST instruction with the appropriate bitmask. */
8083 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8084 && (code == EQ || code == NE)
8085 && (mode_x == HImode || mode_x == QImode))
8086 return CC_NZmode;
8087
8088 /* Similarly, comparisons of zero_extends from shorter modes can
8089 be performed using an ANDS with an immediate mask. */
8090 if (y == const0_rtx && code_x == ZERO_EXTEND
8091 && (mode_x == SImode || mode_x == DImode)
8092 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8093 && (code == EQ || code == NE))
8094 return CC_NZmode;
8095
8096 if ((mode_x == SImode || mode_x == DImode)
8097 && y == const0_rtx
8098 && (code == EQ || code == NE || code == LT || code == GE)
8099 && (code_x == PLUS || code_x == MINUS || code_x == AND
8100 || code_x == NEG
8101 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8102 && CONST_INT_P (XEXP (x, 2)))))
8103 return CC_NZmode;
8104
8105 /* A compare with a shifted operand. Because of canonicalization,
8106 the comparison will have to be swapped when we emit the assembly
8107 code. */
8108 if ((mode_x == SImode || mode_x == DImode)
8109 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8110 && (code_x == ASHIFT || code_x == ASHIFTRT
8111 || code_x == LSHIFTRT
8112 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8113 return CC_SWPmode;
8114
8115 /* Similarly for a negated operand, but we can only do this for
8116 equalities. */
8117 if ((mode_x == SImode || mode_x == DImode)
8118 && (REG_P (y) || GET_CODE (y) == SUBREG)
8119 && (code == EQ || code == NE)
8120 && code_x == NEG)
8121 return CC_Zmode;
8122
8123 /* A test for unsigned overflow from an addition. */
8124 if ((mode_x == DImode || mode_x == TImode)
8125 && (code == LTU || code == GEU)
8126 && code_x == PLUS
8127 && rtx_equal_p (XEXP (x, 0), y))
8128 return CC_Cmode;
8129
8130 /* A test for unsigned overflow from an add with carry. */
8131 if ((mode_x == DImode || mode_x == TImode)
8132 && (code == LTU || code == GEU)
8133 && code_x == PLUS
8134 && CONST_SCALAR_INT_P (y)
8135 && (rtx_mode_t (y, mode_x)
8136 == (wi::shwi (1, mode_x)
8137 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8138 return CC_ADCmode;
8139
8140 /* A test for signed overflow. */
8141 if ((mode_x == DImode || mode_x == TImode)
8142 && code == NE
8143 && code_x == PLUS
8144 && GET_CODE (y) == SIGN_EXTEND)
8145 return CC_Vmode;
8146
8147 /* For everything else, return CCmode. */
8148 return CCmode;
8149 }
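
/* Illustrative note (added): for example, comparing (plus:DI x y)
   against zero with EQ/NE/LT/GE selects CC_NZmode so that an ADDS can
   set the flags directly, while comparing a shifted operand such as
   (ashift:DI x (const_int 2)) against a register selects CC_SWPmode
   because the operands have to be swapped when the assembly is
   emitted. */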
8150
8151 static int
8152 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8153
8154 int
8155 aarch64_get_condition_code (rtx x)
8156 {
8157 machine_mode mode = GET_MODE (XEXP (x, 0));
8158 enum rtx_code comp_code = GET_CODE (x);
8159
8160 if (GET_MODE_CLASS (mode) != MODE_CC)
8161 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8162 return aarch64_get_condition_code_1 (mode, comp_code);
8163 }
8164
8165 static int
8166 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8167 {
8168 switch (mode)
8169 {
8170 case E_CCFPmode:
8171 case E_CCFPEmode:
8172 switch (comp_code)
8173 {
8174 case GE: return AARCH64_GE;
8175 case GT: return AARCH64_GT;
8176 case LE: return AARCH64_LS;
8177 case LT: return AARCH64_MI;
8178 case NE: return AARCH64_NE;
8179 case EQ: return AARCH64_EQ;
8180 case ORDERED: return AARCH64_VC;
8181 case UNORDERED: return AARCH64_VS;
8182 case UNLT: return AARCH64_LT;
8183 case UNLE: return AARCH64_LE;
8184 case UNGT: return AARCH64_HI;
8185 case UNGE: return AARCH64_PL;
8186 default: return -1;
8187 }
8188 break;
8189
8190 case E_CCmode:
8191 switch (comp_code)
8192 {
8193 case NE: return AARCH64_NE;
8194 case EQ: return AARCH64_EQ;
8195 case GE: return AARCH64_GE;
8196 case GT: return AARCH64_GT;
8197 case LE: return AARCH64_LE;
8198 case LT: return AARCH64_LT;
8199 case GEU: return AARCH64_CS;
8200 case GTU: return AARCH64_HI;
8201 case LEU: return AARCH64_LS;
8202 case LTU: return AARCH64_CC;
8203 default: return -1;
8204 }
8205 break;
8206
8207 case E_CC_SWPmode:
8208 switch (comp_code)
8209 {
8210 case NE: return AARCH64_NE;
8211 case EQ: return AARCH64_EQ;
8212 case GE: return AARCH64_LE;
8213 case GT: return AARCH64_LT;
8214 case LE: return AARCH64_GE;
8215 case LT: return AARCH64_GT;
8216 case GEU: return AARCH64_LS;
8217 case GTU: return AARCH64_CC;
8218 case LEU: return AARCH64_CS;
8219 case LTU: return AARCH64_HI;
8220 default: return -1;
8221 }
8222 break;
8223
8224 case E_CC_NZCmode:
8225 switch (comp_code)
8226 {
8227 case NE: return AARCH64_NE; /* = any */
8228 case EQ: return AARCH64_EQ; /* = none */
8229 case GE: return AARCH64_PL; /* = nfrst */
8230 case LT: return AARCH64_MI; /* = first */
8231 case GEU: return AARCH64_CS; /* = nlast */
8232 case GTU: return AARCH64_HI; /* = pmore */
8233 case LEU: return AARCH64_LS; /* = plast */
8234 case LTU: return AARCH64_CC; /* = last */
8235 default: return -1;
8236 }
8237 break;
8238
8239 case E_CC_NZmode:
8240 switch (comp_code)
8241 {
8242 case NE: return AARCH64_NE;
8243 case EQ: return AARCH64_EQ;
8244 case GE: return AARCH64_PL;
8245 case LT: return AARCH64_MI;
8246 default: return -1;
8247 }
8248 break;
8249
8250 case E_CC_Zmode:
8251 switch (comp_code)
8252 {
8253 case NE: return AARCH64_NE;
8254 case EQ: return AARCH64_EQ;
8255 default: return -1;
8256 }
8257 break;
8258
8259 case E_CC_Cmode:
8260 switch (comp_code)
8261 {
8262 case LTU: return AARCH64_CS;
8263 case GEU: return AARCH64_CC;
8264 default: return -1;
8265 }
8266 break;
8267
8268 case E_CC_ADCmode:
8269 switch (comp_code)
8270 {
8271 case GEU: return AARCH64_CS;
8272 case LTU: return AARCH64_CC;
8273 default: return -1;
8274 }
8275 break;
8276
8277 case E_CC_Vmode:
8278 switch (comp_code)
8279 {
8280 case NE: return AARCH64_VS;
8281 case EQ: return AARCH64_VC;
8282 default: return -1;
8283 }
8284 break;
8285
8286 default:
8287 return -1;
8288 }
8289
8290 return -1;
8291 }
8292
8293 bool
8294 aarch64_const_vec_all_same_in_range_p (rtx x,
8295 HOST_WIDE_INT minval,
8296 HOST_WIDE_INT maxval)
8297 {
8298 rtx elt;
8299 return (const_vec_duplicate_p (x, &elt)
8300 && CONST_INT_P (elt)
8301 && IN_RANGE (INTVAL (elt), minval, maxval));
8302 }
8303
8304 bool
8305 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8306 {
8307 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8308 }
8309
8310 /* Return true if VEC is a constant in which every element is in the range
8311 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8312
8313 static bool
8314 aarch64_const_vec_all_in_range_p (rtx vec,
8315 HOST_WIDE_INT minval,
8316 HOST_WIDE_INT maxval)
8317 {
8318 if (GET_CODE (vec) != CONST_VECTOR
8319 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8320 return false;
8321
8322 int nunits;
8323 if (!CONST_VECTOR_STEPPED_P (vec))
8324 nunits = const_vector_encoded_nelts (vec);
8325 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8326 return false;
8327
8328 for (int i = 0; i < nunits; i++)
8329 {
8330 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8331 if (!CONST_INT_P (vec_elem)
8332 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8333 return false;
8334 }
8335 return true;
8336 }
8337
8338 /* N Z C V. */
8339 #define AARCH64_CC_V 1
8340 #define AARCH64_CC_C (1 << 1)
8341 #define AARCH64_CC_Z (1 << 2)
8342 #define AARCH64_CC_N (1 << 3)
8343
8344 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8345 static const int aarch64_nzcv_codes[] =
8346 {
8347 0, /* EQ, Z == 1. */
8348 AARCH64_CC_Z, /* NE, Z == 0. */
8349 0, /* CS, C == 1. */
8350 AARCH64_CC_C, /* CC, C == 0. */
8351 0, /* MI, N == 1. */
8352 AARCH64_CC_N, /* PL, N == 0. */
8353 0, /* VS, V == 1. */
8354 AARCH64_CC_V, /* VC, V == 0. */
8355 0, /* HI, C == 1 && Z == 0. */
8356 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8357 AARCH64_CC_V, /* GE, N == V. */
8358 0, /* LT, N != V. */
8359 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8360 0, /* LE, !(Z == 0 && N == V). */
8361 0, /* AL, Any. */
8362 0 /* NV, Any. */
8363 };
8364
8365 /* Print floating-point vector immediate operand X to F, negating it
8366 first if NEGATE is true. Return true on success, false if it isn't
8367 a constant we can handle. */
8368
8369 static bool
8370 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8371 {
8372 rtx elt;
8373
8374 if (!const_vec_duplicate_p (x, &elt))
8375 return false;
8376
8377 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8378 if (negate)
8379 r = real_value_negate (&r);
8380
8381 /* Handle the SVE single-bit immediates specially, since they have a
8382 fixed form in the assembly syntax. */
8383 if (real_equal (&r, &dconst0))
8384 asm_fprintf (f, "0.0");
8385 else if (real_equal (&r, &dconst2))
8386 asm_fprintf (f, "2.0");
8387 else if (real_equal (&r, &dconst1))
8388 asm_fprintf (f, "1.0");
8389 else if (real_equal (&r, &dconsthalf))
8390 asm_fprintf (f, "0.5");
8391 else
8392 {
8393 const int buf_size = 20;
8394 char float_buf[buf_size] = {'\0'};
8395 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8396 1, GET_MODE (elt));
8397 asm_fprintf (f, "%s", float_buf);
8398 }
8399
8400 return true;
8401 }
8402
8403 /* Return the equivalent letter for size. */
8404 static char
8405 sizetochar (int size)
8406 {
8407 switch (size)
8408 {
8409 case 64: return 'd';
8410 case 32: return 's';
8411 case 16: return 'h';
8412 case 8 : return 'b';
8413 default: gcc_unreachable ();
8414 }
8415 }
8416
8417 /* Print operand X to file F in a target specific manner according to CODE.
8418 The acceptable formatting commands given by CODE are:
8419 'c': An integer or symbol address without a preceding #
8420 sign.
8421 'C': Take the duplicated element in a vector constant
8422 and print it in hex.
8423 'D': Take the duplicated element in a vector constant
8424 and print it as an unsigned integer, in decimal.
8425 'e': Print the sign/zero-extend size as a character 8->b,
8426 16->h, 32->w. Can also be used for masks:
8427 0xff->b, 0xffff->h, 0xffffffff->w.
8428 'I': If the operand is a duplicated vector constant,
8429 replace it with the duplicated scalar. If the
8430 operand is then a floating-point constant, replace
8431 it with the integer bit representation. Print the
8432 transformed constant as a signed decimal number.
8433 'p': Prints N such that 2^N == X (X must be a power of 2 and
8434 a const_int).
8435 'P': Print the number of non-zero bits in X (a const_int).
8436 'H': Print the higher numbered register of a pair (TImode)
8437 of regs.
8438 'm': Print a condition (eq, ne, etc).
8439 'M': Same as 'm', but invert condition.
8440 'N': Take the duplicated element in a vector constant
8441 and print the negative of it in decimal.
8442 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8443 'S/T/U/V': Print a FP/SIMD register name for a register list.
8444 The register printed is the FP/SIMD register name
8445 of X + 0/1/2/3 for S/T/U/V.
8446 'R': Print a scalar Integer/FP/SIMD register name + 1.
8447 'X': Print bottom 16 bits of integer constant in hex.
8448 'w/x': Print a general register name or the zero register
8449 (32-bit or 64-bit).
8450 '0': Print a normal operand; if it's a general register,
8451 then we assume DImode.
8452 'k': Print NZCV for conditional compare instructions.
8453 'A': Output address constant representing the first
8454 argument of X, specifying a relocation offset
8455 if appropriate.
8456 'L': Output constant address specified by X
8457 with a relocation offset if appropriate.
8458 'G': Prints address of X, specifying a PC relative
8459 relocation mode if appropriate.
8460 'y': Output address of LDP or STP - this is used for
8461 some LDP/STPs which don't use a PARALLEL in their
8462 pattern (so the mode needs to be adjusted).
8463 'z': Output address of a typical LDP or STP. */
8464
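/* Illustrative note (added): as examples of the codes above, "%w0"
   prints "w5" for (reg:SI x5) and "wzr" for (const_int 0), "%x0"
   prints "x5" for the same register, and "%s0" prints "s13" for
   (reg:SF v13). */
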
8465 static void
8466 aarch64_print_operand (FILE *f, rtx x, int code)
8467 {
8468 rtx elt;
8469 switch (code)
8470 {
8471 case 'c':
8472 switch (GET_CODE (x))
8473 {
8474 case CONST_INT:
8475 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8476 break;
8477
8478 case SYMBOL_REF:
8479 output_addr_const (f, x);
8480 break;
8481
8482 case CONST:
8483 if (GET_CODE (XEXP (x, 0)) == PLUS
8484 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8485 {
8486 output_addr_const (f, x);
8487 break;
8488 }
8489 /* Fall through. */
8490
8491 default:
8492 output_operand_lossage ("unsupported operand for code '%c'", code);
8493 }
8494 break;
8495
8496 case 'e':
8497 {
8498 x = unwrap_const_vec_duplicate (x);
8499 if (!CONST_INT_P (x))
8500 {
8501 output_operand_lossage ("invalid operand for '%%%c'", code);
8502 return;
8503 }
8504
8505 HOST_WIDE_INT val = INTVAL (x);
8506 if ((val & ~7) == 8 || val == 0xff)
8507 fputc ('b', f);
8508 else if ((val & ~7) == 16 || val == 0xffff)
8509 fputc ('h', f);
8510 else if ((val & ~7) == 32 || val == 0xffffffff)
8511 fputc ('w', f);
8512 else
8513 {
8514 output_operand_lossage ("invalid operand for '%%%c'", code);
8515 return;
8516 }
8517 }
8518 break;
8519
8520 case 'p':
8521 {
8522 int n;
8523
8524 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8525 {
8526 output_operand_lossage ("invalid operand for '%%%c'", code);
8527 return;
8528 }
8529
8530 asm_fprintf (f, "%d", n);
8531 }
8532 break;
8533
8534 case 'P':
8535 if (!CONST_INT_P (x))
8536 {
8537 output_operand_lossage ("invalid operand for '%%%c'", code);
8538 return;
8539 }
8540
8541 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8542 break;
8543
8544 case 'H':
8545 if (x == const0_rtx)
8546 {
8547 asm_fprintf (f, "xzr");
8548 break;
8549 }
8550
8551 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8552 {
8553 output_operand_lossage ("invalid operand for '%%%c'", code);
8554 return;
8555 }
8556
8557 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8558 break;
8559
8560 case 'I':
8561 {
8562 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8563 if (CONST_INT_P (x))
8564 asm_fprintf (f, "%wd", INTVAL (x));
8565 else
8566 {
8567 output_operand_lossage ("invalid operand for '%%%c'", code);
8568 return;
8569 }
8570 break;
8571 }
8572
8573 case 'M':
8574 case 'm':
8575 {
8576 int cond_code;
8577 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8578 if (x == const_true_rtx)
8579 {
8580 if (code == 'M')
8581 fputs ("nv", f);
8582 return;
8583 }
8584
8585 if (!COMPARISON_P (x))
8586 {
8587 output_operand_lossage ("invalid operand for '%%%c'", code);
8588 return;
8589 }
8590
8591 cond_code = aarch64_get_condition_code (x);
8592 gcc_assert (cond_code >= 0);
8593 if (code == 'M')
8594 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8595 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8596 fputs (aarch64_sve_condition_codes[cond_code], f);
8597 else
8598 fputs (aarch64_condition_codes[cond_code], f);
8599 }
8600 break;
8601
8602 case 'N':
8603 if (!const_vec_duplicate_p (x, &elt))
8604 {
8605 output_operand_lossage ("invalid vector constant");
8606 return;
8607 }
8608
8609 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8610 asm_fprintf (f, "%wd", -INTVAL (elt));
8611 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8612 && aarch64_print_vector_float_operand (f, x, true))
8613 ;
8614 else
8615 {
8616 output_operand_lossage ("invalid vector constant");
8617 return;
8618 }
8619 break;
8620
8621 case 'b':
8622 case 'h':
8623 case 's':
8624 case 'd':
8625 case 'q':
8626 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8627 {
8628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8629 return;
8630 }
8631 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8632 break;
8633
8634 case 'S':
8635 case 'T':
8636 case 'U':
8637 case 'V':
8638 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8639 {
8640 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8641 return;
8642 }
8643 asm_fprintf (f, "%c%d",
8644 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8645 REGNO (x) - V0_REGNUM + (code - 'S'));
8646 break;
8647
8648 case 'R':
8649 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
8650 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8651 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8652 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
8653 else
8654 output_operand_lossage ("incompatible register operand for '%%%c'",
8655 code);
8656 break;
8657
8658 case 'X':
8659 if (!CONST_INT_P (x))
8660 {
8661 output_operand_lossage ("invalid operand for '%%%c'", code);
8662 return;
8663 }
8664 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8665 break;
8666
8667 case 'C':
8668 {
8669 /* Print a replicated constant in hex. */
8670 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8671 {
8672 output_operand_lossage ("invalid operand for '%%%c'", code);
8673 return;
8674 }
8675 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8676 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8677 }
8678 break;
8679
8680 case 'D':
8681 {
8682 /* Print a replicated constant in decimal, treating it as
8683 unsigned. */
8684 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8685 {
8686 output_operand_lossage ("invalid operand for '%%%c'", code);
8687 return;
8688 }
8689 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8690 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8691 }
8692 break;
8693
8694 case 'w':
8695 case 'x':
8696 if (x == const0_rtx
8697 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8698 {
8699 asm_fprintf (f, "%czr", code);
8700 break;
8701 }
8702
8703 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8704 {
8705 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8706 break;
8707 }
8708
8709 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8710 {
8711 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8712 break;
8713 }
8714
8715 /* Fall through */
8716
8717 case 0:
8718 if (x == NULL)
8719 {
8720 output_operand_lossage ("missing operand");
8721 return;
8722 }
8723
8724 switch (GET_CODE (x))
8725 {
8726 case REG:
8727 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8728 {
8729 if (REG_NREGS (x) == 1)
8730 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8731 else
8732 {
8733 char suffix
8734 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8735 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8736 REGNO (x) - V0_REGNUM, suffix,
8737 END_REGNO (x) - V0_REGNUM - 1, suffix);
8738 }
8739 }
8740 else
8741 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8742 break;
8743
8744 case MEM:
8745 output_address (GET_MODE (x), XEXP (x, 0));
8746 break;
8747
8748 case LABEL_REF:
8749 case SYMBOL_REF:
8750 output_addr_const (asm_out_file, x);
8751 break;
8752
8753 case CONST_INT:
8754 asm_fprintf (f, "%wd", INTVAL (x));
8755 break;
8756
8757 case CONST:
8758 if (!VECTOR_MODE_P (GET_MODE (x)))
8759 {
8760 output_addr_const (asm_out_file, x);
8761 break;
8762 }
8763 /* fall through */
8764
8765 case CONST_VECTOR:
8766 if (!const_vec_duplicate_p (x, &elt))
8767 {
8768 output_operand_lossage ("invalid vector constant");
8769 return;
8770 }
8771
8772 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8773 asm_fprintf (f, "%wd", INTVAL (elt));
8774 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8775 && aarch64_print_vector_float_operand (f, x, false))
8776 ;
8777 else
8778 {
8779 output_operand_lossage ("invalid vector constant");
8780 return;
8781 }
8782 break;
8783
8784 case CONST_DOUBLE:
8785 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8786 be getting CONST_DOUBLEs holding integers. */
8787 gcc_assert (GET_MODE (x) != VOIDmode);
8788 if (aarch64_float_const_zero_rtx_p (x))
8789 {
8790 fputc ('0', f);
8791 break;
8792 }
8793 else if (aarch64_float_const_representable_p (x))
8794 {
8795 #define buf_size 20
8796 char float_buf[buf_size] = {'\0'};
8797 real_to_decimal_for_mode (float_buf,
8798 CONST_DOUBLE_REAL_VALUE (x),
8799 buf_size, buf_size,
8800 1, GET_MODE (x));
8801 asm_fprintf (asm_out_file, "%s", float_buf);
8802 break;
8803 #undef buf_size
8804 }
8805 output_operand_lossage ("invalid constant");
8806 return;
8807 default:
8808 output_operand_lossage ("invalid operand");
8809 return;
8810 }
8811 break;
8812
8813 case 'A':
8814 if (GET_CODE (x) == HIGH)
8815 x = XEXP (x, 0);
8816
8817 switch (aarch64_classify_symbolic_expression (x))
8818 {
8819 case SYMBOL_SMALL_GOT_4G:
8820 asm_fprintf (asm_out_file, ":got:");
8821 break;
8822
8823 case SYMBOL_SMALL_TLSGD:
8824 asm_fprintf (asm_out_file, ":tlsgd:");
8825 break;
8826
8827 case SYMBOL_SMALL_TLSDESC:
8828 asm_fprintf (asm_out_file, ":tlsdesc:");
8829 break;
8830
8831 case SYMBOL_SMALL_TLSIE:
8832 asm_fprintf (asm_out_file, ":gottprel:");
8833 break;
8834
8835 case SYMBOL_TLSLE24:
8836 asm_fprintf (asm_out_file, ":tprel:");
8837 break;
8838
8839 case SYMBOL_TINY_GOT:
8840 gcc_unreachable ();
8841 break;
8842
8843 default:
8844 break;
8845 }
8846 output_addr_const (asm_out_file, x);
8847 break;
8848
8849 case 'L':
8850 switch (aarch64_classify_symbolic_expression (x))
8851 {
8852 case SYMBOL_SMALL_GOT_4G:
8853 asm_fprintf (asm_out_file, ":lo12:");
8854 break;
8855
8856 case SYMBOL_SMALL_TLSGD:
8857 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8858 break;
8859
8860 case SYMBOL_SMALL_TLSDESC:
8861 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8862 break;
8863
8864 case SYMBOL_SMALL_TLSIE:
8865 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8866 break;
8867
8868 case SYMBOL_TLSLE12:
8869 asm_fprintf (asm_out_file, ":tprel_lo12:");
8870 break;
8871
8872 case SYMBOL_TLSLE24:
8873 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8874 break;
8875
8876 case SYMBOL_TINY_GOT:
8877 asm_fprintf (asm_out_file, ":got:");
8878 break;
8879
8880 case SYMBOL_TINY_TLSIE:
8881 asm_fprintf (asm_out_file, ":gottprel:");
8882 break;
8883
8884 default:
8885 break;
8886 }
8887 output_addr_const (asm_out_file, x);
8888 break;
8889
8890 case 'G':
8891 switch (aarch64_classify_symbolic_expression (x))
8892 {
8893 case SYMBOL_TLSLE24:
8894 asm_fprintf (asm_out_file, ":tprel_hi12:");
8895 break;
8896 default:
8897 break;
8898 }
8899 output_addr_const (asm_out_file, x);
8900 break;
8901
8902 case 'k':
8903 {
8904 HOST_WIDE_INT cond_code;
8905
8906 if (!CONST_INT_P (x))
8907 {
8908 output_operand_lossage ("invalid operand for '%%%c'", code);
8909 return;
8910 }
8911
8912 cond_code = INTVAL (x);
8913 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8914 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8915 }
8916 break;
8917
8918 case 'y':
8919 case 'z':
8920 {
8921 machine_mode mode = GET_MODE (x);
8922
8923 if (GET_CODE (x) != MEM
8924 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8925 {
8926 output_operand_lossage ("invalid operand for '%%%c'", code);
8927 return;
8928 }
8929
8930 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8931 code == 'y'
8932 ? ADDR_QUERY_LDP_STP_N
8933 : ADDR_QUERY_LDP_STP))
8934 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8935 }
8936 break;
8937
8938 default:
8939 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8940 return;
8941 }
8942 }
8943
8944 /* Print address 'x' of a memory access with mode 'mode'.
8945 'type' is the aarch64_addr_query_type context passed to
8946 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP. */
8947 static bool
8948 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8949 aarch64_addr_query_type type)
8950 {
8951 struct aarch64_address_info addr;
8952 unsigned int size;
8953
8954 /* Check all addresses are Pmode - including ILP32. */
8955 if (GET_MODE (x) != Pmode
8956 && (!CONST_INT_P (x)
8957 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8958 {
8959 output_operand_lossage ("invalid address mode");
8960 return false;
8961 }
8962
8963 if (aarch64_classify_address (&addr, x, mode, true, type))
8964 switch (addr.type)
8965 {
8966 case ADDRESS_REG_IMM:
8967 if (known_eq (addr.const_offset, 0))
8968 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8969 else if (aarch64_sve_data_mode_p (mode))
8970 {
8971 HOST_WIDE_INT vnum
8972 = exact_div (addr.const_offset,
8973 BYTES_PER_SVE_VECTOR).to_constant ();
8974 asm_fprintf (f, "[%s, #%wd, mul vl]",
8975 reg_names[REGNO (addr.base)], vnum);
8976 }
8977 else if (aarch64_sve_pred_mode_p (mode))
8978 {
8979 HOST_WIDE_INT vnum
8980 = exact_div (addr.const_offset,
8981 BYTES_PER_SVE_PRED).to_constant ();
8982 asm_fprintf (f, "[%s, #%wd, mul vl]",
8983 reg_names[REGNO (addr.base)], vnum);
8984 }
8985 else
8986 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8987 INTVAL (addr.offset));
8988 return true;
8989
8990 case ADDRESS_REG_REG:
8991 if (addr.shift == 0)
8992 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8993 reg_names [REGNO (addr.offset)]);
8994 else
8995 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8996 reg_names [REGNO (addr.offset)], addr.shift);
8997 return true;
8998
8999 case ADDRESS_REG_UXTW:
9000 if (addr.shift == 0)
9001 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9002 REGNO (addr.offset) - R0_REGNUM);
9003 else
9004 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9005 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9006 return true;
9007
9008 case ADDRESS_REG_SXTW:
9009 if (addr.shift == 0)
9010 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9011 REGNO (addr.offset) - R0_REGNUM);
9012 else
9013 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9014 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9015 return true;
9016
9017 case ADDRESS_REG_WB:
9018 /* Writeback is only supported for fixed-width modes. */
9019 size = GET_MODE_SIZE (mode).to_constant ();
9020 switch (GET_CODE (x))
9021 {
9022 case PRE_INC:
9023 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9024 return true;
9025 case POST_INC:
9026 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9027 return true;
9028 case PRE_DEC:
9029 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9030 return true;
9031 case POST_DEC:
9032 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9033 return true;
9034 case PRE_MODIFY:
9035 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9036 INTVAL (addr.offset));
9037 return true;
9038 case POST_MODIFY:
9039 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9040 INTVAL (addr.offset));
9041 return true;
9042 default:
9043 break;
9044 }
9045 break;
9046
9047 case ADDRESS_LO_SUM:
9048 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9049 output_addr_const (f, addr.offset);
9050 asm_fprintf (f, "]");
9051 return true;
9052
9053 case ADDRESS_SYMBOLIC:
9054 output_addr_const (f, x);
9055 return true;
9056 }
9057
9058 return false;
9059 }
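/* Some examples of the address syntax produced above (a sketch only;
   the register numbers are arbitrary):

     ADDRESS_REG_IMM, offset 0        ->  [x0]
     ADDRESS_REG_IMM, SVE data mode   ->  [x0, #2, mul vl]
     ADDRESS_REG_REG, shift 3         ->  [x0, x1, lsl 3]
     ADDRESS_REG_SXTW, shift 2        ->  [x0, w1, sxtw 2]
     ADDRESS_REG_WB, POST_INC         ->  [x0], 16
     ADDRESS_LO_SUM                   ->  [x0, #:lo12:sym]  */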
9060
9061 /* Print address 'x' of a memory access with mode 'mode'. */
9062 static void
9063 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9064 {
9065 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9066 output_addr_const (f, x);
9067 }
9068
9069 bool
9070 aarch64_label_mentioned_p (rtx x)
9071 {
9072 const char *fmt;
9073 int i;
9074
9075 if (GET_CODE (x) == LABEL_REF)
9076 return true;
9077
9078 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9079 referencing instruction, but they are constant offsets, not
9080 symbols. */
9081 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9082 return false;
9083
9084 fmt = GET_RTX_FORMAT (GET_CODE (x));
9085 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9086 {
9087 if (fmt[i] == 'E')
9088 {
9089 int j;
9090
9091 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9092 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9093 return 1;
9094 }
9095 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9096 return 1;
9097 }
9098
9099 return 0;
9100 }
9101
9102 /* Implement REGNO_REG_CLASS. */
9103
9104 enum reg_class
9105 aarch64_regno_regclass (unsigned regno)
9106 {
9107 if (GP_REGNUM_P (regno))
9108 return GENERAL_REGS;
9109
9110 if (regno == SP_REGNUM)
9111 return STACK_REG;
9112
9113 if (regno == FRAME_POINTER_REGNUM
9114 || regno == ARG_POINTER_REGNUM)
9115 return POINTER_REGS;
9116
9117 if (FP_REGNUM_P (regno))
9118 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9119 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9120
9121 if (PR_REGNUM_P (regno))
9122 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9123
9124 return NO_REGS;
9125 }
9126
9127 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9128 If OFFSET is out of range, return an offset of an anchor point
9129 that is in range. Return 0 otherwise. */
9130
9131 static HOST_WIDE_INT
9132 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9133 machine_mode mode)
9134 {
9135 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9136 if (size > 16)
9137 return (offset + 0x400) & ~0x7f0;
9138
9139 /* For offsets that aren't a multiple of the access size, the limit is
9140 -256...255. */
9141 if (offset & (size - 1))
9142 {
9143 /* BLKmode typically uses LDP of X-registers. */
9144 if (mode == BLKmode)
9145 return (offset + 512) & ~0x3ff;
9146 return (offset + 0x100) & ~0x1ff;
9147 }
9148
9149 /* Small negative offsets are supported. */
9150 if (IN_RANGE (offset, -256, 0))
9151 return 0;
9152
9153 if (mode == TImode || mode == TFmode)
9154 return (offset + 0x100) & ~0x1ff;
9155
9156 /* Use a 12-bit offset, scaled by the access size. */
9157 return offset & (~0xfff * size);
9158 }
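/* For example (purely illustrative), a 4-byte access at offset 0x4004
   is size-aligned and outside [-256, 0], so the final case above gives
   0x4004 & (~0xfff * 4) == 0x4000.  A section anchor at base + 0x4000
   then leaves a residual offset of 4, which fits the scaled unsigned
   12-bit range [0, 0x3ffc] of a 32-bit LDR/STR.  */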
9159
9160 static rtx
9161 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9162 {
9163 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9164 where mask is selected by alignment and size of the offset.
9165 We try to pick as large a range for the offset as possible to
9166 maximize the chance of a CSE. However, for aligned addresses
9167 we limit the range to 4k so that structures with different sized
9168 elements are likely to use the same base. We need to be careful
9169 not to split a CONST for some forms of address expression, otherwise
9170 it will generate sub-optimal code. */
9171
9172 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9173 {
9174 rtx base = XEXP (x, 0);
9175 rtx offset_rtx = XEXP (x, 1);
9176 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9177
9178 if (GET_CODE (base) == PLUS)
9179 {
9180 rtx op0 = XEXP (base, 0);
9181 rtx op1 = XEXP (base, 1);
9182
9183 /* Force any scaling into a temp for CSE. */
9184 op0 = force_reg (Pmode, op0);
9185 op1 = force_reg (Pmode, op1);
9186
9187 /* Let the pointer register be in op0. */
9188 if (REG_POINTER (op1))
9189 std::swap (op0, op1);
9190
9191 /* If the pointer is virtual or frame related, then we know that
9192 virtual register instantiation or register elimination is going
9193 to apply a second constant. We want the two constants folded
9194 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9195 if (virt_or_elim_regno_p (REGNO (op0)))
9196 {
9197 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9198 NULL_RTX, true, OPTAB_DIRECT);
9199 return gen_rtx_PLUS (Pmode, base, op1);
9200 }
9201
9202 /* Otherwise, in order to encourage CSE (and thence loop strength
9203 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9204 base = expand_binop (Pmode, add_optab, op0, op1,
9205 NULL_RTX, true, OPTAB_DIRECT);
9206 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9207 }
9208
9209 HOST_WIDE_INT size;
9210 if (GET_MODE_SIZE (mode).is_constant (&size))
9211 {
9212 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9213 mode);
9214 if (base_offset != 0)
9215 {
9216 base = plus_constant (Pmode, base, base_offset);
9217 base = force_operand (base, NULL_RTX);
9218 return plus_constant (Pmode, base, offset - base_offset);
9219 }
9220 }
9221 }
9222
9223 return x;
9224 }
9225
9226 static reg_class_t
9227 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9228 reg_class_t rclass,
9229 machine_mode mode,
9230 secondary_reload_info *sri)
9231 {
9232 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9233 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9234 comment at the head of aarch64-sve.md for more details about the
9235 big-endian handling. */
9236 if (BYTES_BIG_ENDIAN
9237 && reg_class_subset_p (rclass, FP_REGS)
9238 && !((REG_P (x) && HARD_REGISTER_P (x))
9239 || aarch64_simd_valid_immediate (x, NULL))
9240 && aarch64_sve_data_mode_p (mode))
9241 {
9242 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9243 return NO_REGS;
9244 }
9245
9246 /* If we have to disable direct literal pool loads and stores because the
9247 function is too big, then we need a scratch register. */
9248 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9249 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9250 || targetm.vector_mode_supported_p (GET_MODE (x)))
9251 && !aarch64_pcrelative_literal_loads)
9252 {
9253 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9254 return NO_REGS;
9255 }
9256
9257 /* Without the TARGET_SIMD instructions we cannot move a Q register
9258 to a Q register directly. We need a scratch. */
9259 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9260 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9261 && reg_class_subset_p (rclass, FP_REGS))
9262 {
9263 sri->icode = code_for_aarch64_reload_mov (mode);
9264 return NO_REGS;
9265 }
9266
9267 /* A TFmode or TImode memory access should be handled via FP_REGS,
9268 because AArch64 has richer addressing modes for LDR/STR instructions
9269 than for LDP/STP instructions. */
9270 if (TARGET_FLOAT && rclass == GENERAL_REGS
9271 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9272 return FP_REGS;
9273
9274 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9275 return GENERAL_REGS;
9276
9277 return NO_REGS;
9278 }
9279
9280 static bool
9281 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9282 {
9283 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9284
9285 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9286 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9287 if (frame_pointer_needed)
9288 return to == HARD_FRAME_POINTER_REGNUM;
9289 return true;
9290 }
9291
9292 poly_int64
9293 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9294 {
9295 if (to == HARD_FRAME_POINTER_REGNUM)
9296 {
9297 if (from == ARG_POINTER_REGNUM)
9298 return cfun->machine->frame.hard_fp_offset;
9299
9300 if (from == FRAME_POINTER_REGNUM)
9301 return cfun->machine->frame.hard_fp_offset
9302 - cfun->machine->frame.locals_offset;
9303 }
9304
9305 if (to == STACK_POINTER_REGNUM)
9306 {
9307 if (from == FRAME_POINTER_REGNUM)
9308 return cfun->machine->frame.frame_size
9309 - cfun->machine->frame.locals_offset;
9310 }
9311
9312 return cfun->machine->frame.frame_size;
9313 }
9314
9315 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9316 previous frame. */
9317
9318 rtx
9319 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9320 {
9321 if (count != 0)
9322 return const0_rtx;
9323 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9324 }
9325
9326
9327 static void
9328 aarch64_asm_trampoline_template (FILE *f)
9329 {
9330 int offset1 = 16;
9331 int offset2 = 20;
9332
9333 if (aarch64_bti_enabled ())
9334 {
9335 asm_fprintf (f, "\thint\t34 // bti c\n");
9336 offset1 -= 4;
9337 offset2 -= 4;
9338 }
9339
9340 if (TARGET_ILP32)
9341 {
9342 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9343 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9344 offset1);
9345 }
9346 else
9347 {
9348 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9349 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9350 offset2);
9351 }
9352 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9353
9354 /* The trampoline needs an extra padding instruction. If BTI is
9355 enabled, the padding instruction is replaced by the BTI instruction
9356 at the beginning. */
9357 if (!aarch64_bti_enabled ())
9358 assemble_aligned_integer (4, const0_rtx);
9359
9360 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9361 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9362 }
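/* For LP64 without BTI, the template above lays out roughly as follows
   (a sketch; <ip1> and <chain> stand for IP1_REGNUM and
   STATIC_CHAIN_REGNUM, and the trailing words stay zero until
   aarch64_trampoline_init fills them in):

	ldr	<ip1>, .+16		// -> function address at offset 16
	ldr	<chain>, .+20		// -> static chain at offset 24
	br	<ip1>
	<4 bytes of padding>
	<pointer-sized word: function address>
	<pointer-sized word: static chain value>

   The second ".+" offset differs from the first because that load sits
   4 bytes later; with BTI enabled the padding disappears and a
   "hint 34" (bti c) is emitted at the start instead.  */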
9363
9364 static void
9365 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9366 {
9367 rtx fnaddr, mem, a_tramp;
9368 const int tramp_code_sz = 16;
9369
9370 /* Don't need to copy the trailing D-words; we fill those in below. */
9371 emit_block_move (m_tramp, assemble_trampoline_template (),
9372 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9373 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9374 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9375 if (GET_MODE (fnaddr) != ptr_mode)
9376 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9377 emit_move_insn (mem, fnaddr);
9378
9379 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9380 emit_move_insn (mem, chain_value);
9381
9382 /* XXX We should really define a "clear_cache" pattern and use
9383 gen_clear_cache(). */
9384 a_tramp = XEXP (m_tramp, 0);
9385 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9386 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9387 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9388 ptr_mode);
9389 }
9390
9391 static unsigned char
9392 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9393 {
9394 /* ??? Logically we should only need to provide a value when
9395 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9396 can hold MODE, but at the moment we need to handle all modes.
9397 Just ignore any runtime parts for registers that can't store them. */
9398 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9399 unsigned int nregs;
9400 switch (regclass)
9401 {
9402 case TAILCALL_ADDR_REGS:
9403 case POINTER_REGS:
9404 case GENERAL_REGS:
9405 case ALL_REGS:
9406 case POINTER_AND_FP_REGS:
9407 case FP_REGS:
9408 case FP_LO_REGS:
9409 case FP_LO8_REGS:
9410 if (aarch64_sve_data_mode_p (mode)
9411 && constant_multiple_p (GET_MODE_SIZE (mode),
9412 BYTES_PER_SVE_VECTOR, &nregs))
9413 return nregs;
9414 return (aarch64_vector_data_mode_p (mode)
9415 ? CEIL (lowest_size, UNITS_PER_VREG)
9416 : CEIL (lowest_size, UNITS_PER_WORD));
9417 case STACK_REG:
9418 case PR_REGS:
9419 case PR_LO_REGS:
9420 case PR_HI_REGS:
9421 return 1;
9422
9423 case NO_REGS:
9424 return 0;
9425
9426 default:
9427 break;
9428 }
9429 gcc_unreachable ();
9430 }
9431
9432 static reg_class_t
9433 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9434 {
9435 if (regclass == POINTER_REGS)
9436 return GENERAL_REGS;
9437
9438 if (regclass == STACK_REG)
9439 {
9440 if (REG_P(x)
9441 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9442 return regclass;
9443
9444 return NO_REGS;
9445 }
9446
9447 /* Register elimination can result in a request for
9448 SP+constant->FP_REGS. We cannot support such operations, which
9449 use SP as the source and an FP_REG as the destination, so reject
9450 them outright. */
9451 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9452 {
9453 rtx lhs = XEXP (x, 0);
9454
9455 /* Look through a possible SUBREG introduced by ILP32. */
9456 if (GET_CODE (lhs) == SUBREG)
9457 lhs = SUBREG_REG (lhs);
9458
9459 gcc_assert (REG_P (lhs));
9460 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9461 POINTER_REGS));
9462 return NO_REGS;
9463 }
9464
9465 return regclass;
9466 }
9467
9468 void
9469 aarch64_asm_output_labelref (FILE* f, const char *name)
9470 {
9471 asm_fprintf (f, "%U%s", name);
9472 }
9473
9474 static void
9475 aarch64_elf_asm_constructor (rtx symbol, int priority)
9476 {
9477 if (priority == DEFAULT_INIT_PRIORITY)
9478 default_ctor_section_asm_out_constructor (symbol, priority);
9479 else
9480 {
9481 section *s;
9482 /* The priority is known to be in the range [0, 65535], so 18 bytes
9483 would be enough, but the compiler might not know that. To avoid a
9484 -Wformat-truncation false positive, use a larger size. */
9485 char buf[23];
9486 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9487 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9488 switch_to_section (s);
9489 assemble_align (POINTER_SIZE);
9490 assemble_aligned_integer (POINTER_BYTES, symbol);
9491 }
9492 }
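/* For example, a constructor with priority 101 goes into the section
   ".init_array.00101" ("%.5u" zero-pads the priority to five digits),
   keeping prioritized constructors distinct from the default
   .init_array entries so the linker can order them.  */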
9493
9494 static void
9495 aarch64_elf_asm_destructor (rtx symbol, int priority)
9496 {
9497 if (priority == DEFAULT_INIT_PRIORITY)
9498 default_dtor_section_asm_out_destructor (symbol, priority);
9499 else
9500 {
9501 section *s;
9502 /* The priority is known to be in the range [0, 65535], so 18 bytes
9503 would be enough, but the compiler might not know that. To avoid a
9504 -Wformat-truncation false positive, use a larger size. */
9505 char buf[23];
9506 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9507 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9508 switch_to_section (s);
9509 assemble_align (POINTER_SIZE);
9510 assemble_aligned_integer (POINTER_BYTES, symbol);
9511 }
9512 }
9513
9514 const char*
9515 aarch64_output_casesi (rtx *operands)
9516 {
9517 char buf[100];
9518 char label[100];
9519 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9520 int index;
9521 static const char *const patterns[4][2] =
9522 {
9523 {
9524 "ldrb\t%w3, [%0,%w1,uxtw]",
9525 "add\t%3, %4, %w3, sxtb #2"
9526 },
9527 {
9528 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9529 "add\t%3, %4, %w3, sxth #2"
9530 },
9531 {
9532 "ldr\t%w3, [%0,%w1,uxtw #2]",
9533 "add\t%3, %4, %w3, sxtw #2"
9534 },
9535 /* We assume that DImode is only generated when not optimizing and
9536 that we don't really need 64-bit address offsets. That would
9537 imply an object file with 8GB of code in a single function! */
9538 {
9539 "ldr\t%w3, [%0,%w1,uxtw #2]",
9540 "add\t%3, %4, %w3, sxtw #2"
9541 }
9542 };
9543
9544 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9545
9546 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9547 index = exact_log2 (GET_MODE_SIZE (mode));
9548
9549 gcc_assert (index >= 0 && index <= 3);
9550
9551 /* Need to implement table size reduction, by changing the code below. */
9552 output_asm_insn (patterns[index][0], operands);
9553 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9554 snprintf (buf, sizeof (buf),
9555 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9556 output_asm_insn (buf, operands);
9557 output_asm_insn (patterns[index][1], operands);
9558 output_asm_insn ("br\t%3", operands);
9559 assemble_label (asm_out_file, label);
9560 return "";
9561 }
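/* For 32-bit table entries (index 2) the code emitted above is roughly
   (a sketch; x0/w1/x3/x4 stand for operands %0, %1, %3 and %4):

	ldr	w3, [x0, w1, uxtw #2]	// fetch the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted below
	add	x3, x4, w3, sxtw #2	// entry * 4 + anchor = target
	br	x3
   .Lrtx<N>:  */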
9562
9563
9564 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9565 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9566 operator. */
9567
9568 int
9569 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9570 {
9571 if (shift >= 0 && shift <= 3)
9572 {
9573 int size;
9574 for (size = 8; size <= 32; size *= 2)
9575 {
9576 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9577 if (mask == bits << shift)
9578 return size;
9579 }
9580 }
9581 return 0;
9582 }
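/* For example, aarch64_uxt_size (3, 0x7f8) returns 8, because 0x7f8 is
   0xff << 3 and therefore matches an extended-register operand such as
   "add x0, x1, w2, uxtb #3".  Any mask that is not a shifted 0xff,
   0xffff or 0xffffffff yields 0.  */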
9583
9584 /* Constant pools are per-function only when PC-relative literal
9585 loads are enabled or we are using the large memory
9586 model. */
9587
9588 static inline bool
9589 aarch64_can_use_per_function_literal_pools_p (void)
9590 {
9591 return (aarch64_pcrelative_literal_loads
9592 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9593 }
9594
9595 static bool
9596 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9597 {
9598 /* We can't use blocks for constants when we're using a per-function
9599 constant pool. */
9600 return !aarch64_can_use_per_function_literal_pools_p ();
9601 }
9602
9603 /* Select appropriate section for constants depending
9604 on where we place literal pools. */
9605
9606 static section *
9607 aarch64_select_rtx_section (machine_mode mode,
9608 rtx x,
9609 unsigned HOST_WIDE_INT align)
9610 {
9611 if (aarch64_can_use_per_function_literal_pools_p ())
9612 return function_section (current_function_decl);
9613
9614 return default_elf_select_rtx_section (mode, x, align);
9615 }
9616
9617 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9618 void
9619 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9620 HOST_WIDE_INT offset)
9621 {
9622 /* When using per-function literal pools, we must ensure that any code
9623 section is aligned to the minimal instruction length, lest we get
9624 errors from the assembler re "unaligned instructions". */
9625 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9626 ASM_OUTPUT_ALIGN (f, 2);
9627 }
9628
9629 /* Costs. */
9630
9631 /* Helper function for rtx cost calculation. Strip a shift expression
9632 from X. Returns the inner operand if successful, or the original
9633 expression on failure. */
9634 static rtx
9635 aarch64_strip_shift (rtx x)
9636 {
9637 rtx op = x;
9638
9639 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9640 we can convert both to ROR during final output. */
9641 if ((GET_CODE (op) == ASHIFT
9642 || GET_CODE (op) == ASHIFTRT
9643 || GET_CODE (op) == LSHIFTRT
9644 || GET_CODE (op) == ROTATERT
9645 || GET_CODE (op) == ROTATE)
9646 && CONST_INT_P (XEXP (op, 1)))
9647 return XEXP (op, 0);
9648
9649 if (GET_CODE (op) == MULT
9650 && CONST_INT_P (XEXP (op, 1))
9651 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9652 return XEXP (op, 0);
9653
9654 return x;
9655 }
9656
9657 /* Helper function for rtx cost calculation. Strip an extend
9658 expression from X. Returns the inner operand if successful, or the
9659 original expression on failure. We deal with a number of possible
9660 canonicalization variations here. If STRIP_SHIFT is true, then
9661 we can strip off a shift also. */
9662 static rtx
9663 aarch64_strip_extend (rtx x, bool strip_shift)
9664 {
9665 scalar_int_mode mode;
9666 rtx op = x;
9667
9668 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9669 return op;
9670
9671 /* Zero and sign extraction of a widened value. */
9672 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9673 && XEXP (op, 2) == const0_rtx
9674 && GET_CODE (XEXP (op, 0)) == MULT
9675 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9676 XEXP (op, 1)))
9677 return XEXP (XEXP (op, 0), 0);
9678
9679 /* It can also be represented (for zero-extend) as an AND with an
9680 immediate. */
9681 if (GET_CODE (op) == AND
9682 && GET_CODE (XEXP (op, 0)) == MULT
9683 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9684 && CONST_INT_P (XEXP (op, 1))
9685 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9686 INTVAL (XEXP (op, 1))) != 0)
9687 return XEXP (XEXP (op, 0), 0);
9688
9689 /* Now handle extended register, as this may also have an optional
9690 left shift by 1..4. */
9691 if (strip_shift
9692 && GET_CODE (op) == ASHIFT
9693 && CONST_INT_P (XEXP (op, 1))
9694 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9695 op = XEXP (op, 0);
9696
9697 if (GET_CODE (op) == ZERO_EXTEND
9698 || GET_CODE (op) == SIGN_EXTEND)
9699 op = XEXP (op, 0);
9700
9701 if (op != x)
9702 return op;
9703
9704 return x;
9705 }
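/* For example, (and (mult x (const_int 4)) (const_int 0x3fc)) is a
   canonical form of a zero-extended byte scaled by 4; aarch64_uxt_size
   (2, 0x3fc) is nonzero, so x itself is returned and the caller can
   cost the extend-and-shift as part of the enclosing operation.  */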
9706
9707 /* Return true iff CODE is a shift supported in combination
9708 with arithmetic instructions. */
9709
9710 static bool
9711 aarch64_shift_p (enum rtx_code code)
9712 {
9713 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9714 }
9715
9716
9717 /* Return true iff X is a cheap shift without a sign extend. */
9718
9719 static bool
9720 aarch64_cheap_mult_shift_p (rtx x)
9721 {
9722 rtx op0, op1;
9723
9724 op0 = XEXP (x, 0);
9725 op1 = XEXP (x, 1);
9726
9727 if (!(aarch64_tune_params.extra_tuning_flags
9728 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9729 return false;
9730
9731 if (GET_CODE (op0) == SIGN_EXTEND)
9732 return false;
9733
9734 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9735 && UINTVAL (op1) <= 4)
9736 return true;
9737
9738 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9739 return false;
9740
9741 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9742
9743 if (l2 > 0 && l2 <= 4)
9744 return true;
9745
9746 return false;
9747 }
9748
9749 /* Helper function for rtx cost calculation. Calculate the cost of
9750 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9751 Return the calculated cost of the expression, recursing manually in to
9752 operands where needed. */
9753
9754 static int
9755 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9756 {
9757 rtx op0, op1;
9758 const struct cpu_cost_table *extra_cost
9759 = aarch64_tune_params.insn_extra_cost;
9760 int cost = 0;
9761 bool compound_p = (outer == PLUS || outer == MINUS);
9762 machine_mode mode = GET_MODE (x);
9763
9764 gcc_checking_assert (code == MULT);
9765
9766 op0 = XEXP (x, 0);
9767 op1 = XEXP (x, 1);
9768
9769 if (VECTOR_MODE_P (mode))
9770 mode = GET_MODE_INNER (mode);
9771
9772 /* Integer multiply/fma. */
9773 if (GET_MODE_CLASS (mode) == MODE_INT)
9774 {
9775 /* The multiply will be canonicalized as a shift, cost it as such. */
9776 if (aarch64_shift_p (GET_CODE (x))
9777 || (CONST_INT_P (op1)
9778 && exact_log2 (INTVAL (op1)) > 0))
9779 {
9780 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9781 || GET_CODE (op0) == SIGN_EXTEND;
9782 if (speed)
9783 {
9784 if (compound_p)
9785 {
9786 /* If the shift is considered cheap,
9787 then don't add any cost. */
9788 if (aarch64_cheap_mult_shift_p (x))
9789 ;
9790 else if (REG_P (op1))
9791 /* ARITH + shift-by-register. */
9792 cost += extra_cost->alu.arith_shift_reg;
9793 else if (is_extend)
9794 /* ARITH + extended register. We don't have a cost field
9795 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9796 cost += extra_cost->alu.extend_arith;
9797 else
9798 /* ARITH + shift-by-immediate. */
9799 cost += extra_cost->alu.arith_shift;
9800 }
9801 else
9802 /* LSL (immediate). */
9803 cost += extra_cost->alu.shift;
9804
9805 }
9806 /* Strip extends as we will have costed them in the case above. */
9807 if (is_extend)
9808 op0 = aarch64_strip_extend (op0, true);
9809
9810 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9811
9812 return cost;
9813 }
9814
9815 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9816 compound and let the below cases handle it. After all, MNEG is a
9817 special-case alias of MSUB. */
9818 if (GET_CODE (op0) == NEG)
9819 {
9820 op0 = XEXP (op0, 0);
9821 compound_p = true;
9822 }
9823
9824 /* Integer multiplies or FMAs have zero/sign extending variants. */
9825 if ((GET_CODE (op0) == ZERO_EXTEND
9826 && GET_CODE (op1) == ZERO_EXTEND)
9827 || (GET_CODE (op0) == SIGN_EXTEND
9828 && GET_CODE (op1) == SIGN_EXTEND))
9829 {
9830 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9831 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9832
9833 if (speed)
9834 {
9835 if (compound_p)
9836 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9837 cost += extra_cost->mult[0].extend_add;
9838 else
9839 /* MUL/SMULL/UMULL. */
9840 cost += extra_cost->mult[0].extend;
9841 }
9842
9843 return cost;
9844 }
9845
9846 /* This is either an integer multiply or a MADD. In both cases
9847 we want to recurse and cost the operands. */
9848 cost += rtx_cost (op0, mode, MULT, 0, speed);
9849 cost += rtx_cost (op1, mode, MULT, 1, speed);
9850
9851 if (speed)
9852 {
9853 if (compound_p)
9854 /* MADD/MSUB. */
9855 cost += extra_cost->mult[mode == DImode].add;
9856 else
9857 /* MUL. */
9858 cost += extra_cost->mult[mode == DImode].simple;
9859 }
9860
9861 return cost;
9862 }
9863 else
9864 {
9865 if (speed)
9866 {
9867 /* Floating-point FMA/FMUL can also support negations of the
9868 operands, unless the rounding mode is upward or downward in
9869 which case FNMUL is different than FMUL with operand negation. */
9870 bool neg0 = GET_CODE (op0) == NEG;
9871 bool neg1 = GET_CODE (op1) == NEG;
9872 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9873 {
9874 if (neg0)
9875 op0 = XEXP (op0, 0);
9876 if (neg1)
9877 op1 = XEXP (op1, 0);
9878 }
9879
9880 if (compound_p)
9881 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9882 cost += extra_cost->fp[mode == DFmode].fma;
9883 else
9884 /* FMUL/FNMUL. */
9885 cost += extra_cost->fp[mode == DFmode].mult;
9886 }
9887
9888 cost += rtx_cost (op0, mode, MULT, 0, speed);
9889 cost += rtx_cost (op1, mode, MULT, 1, speed);
9890 return cost;
9891 }
9892 }
9893
9894 static int
9895 aarch64_address_cost (rtx x,
9896 machine_mode mode,
9897 addr_space_t as ATTRIBUTE_UNUSED,
9898 bool speed)
9899 {
9900 enum rtx_code c = GET_CODE (x);
9901 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9902 struct aarch64_address_info info;
9903 int cost = 0;
9904 info.shift = 0;
9905
9906 if (!aarch64_classify_address (&info, x, mode, false))
9907 {
9908 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9909 {
9910 /* This is a CONST or SYMBOL ref which will be split
9911 in a different way depending on the code model in use.
9912 Cost it through the generic infrastructure. */
9913 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9914 /* Divide through by the cost of one instruction to
9915 bring it to the same units as the address costs. */
9916 cost_symbol_ref /= COSTS_N_INSNS (1);
9917 /* The cost is then the cost of preparing the address,
9918 followed by an immediate (possibly 0) offset. */
9919 return cost_symbol_ref + addr_cost->imm_offset;
9920 }
9921 else
9922 {
9923 /* This is most likely a jump table from a case
9924 statement. */
9925 return addr_cost->register_offset;
9926 }
9927 }
9928
9929 switch (info.type)
9930 {
9931 case ADDRESS_LO_SUM:
9932 case ADDRESS_SYMBOLIC:
9933 case ADDRESS_REG_IMM:
9934 cost += addr_cost->imm_offset;
9935 break;
9936
9937 case ADDRESS_REG_WB:
9938 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9939 cost += addr_cost->pre_modify;
9940 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9941 cost += addr_cost->post_modify;
9942 else
9943 gcc_unreachable ();
9944
9945 break;
9946
9947 case ADDRESS_REG_REG:
9948 cost += addr_cost->register_offset;
9949 break;
9950
9951 case ADDRESS_REG_SXTW:
9952 cost += addr_cost->register_sextend;
9953 break;
9954
9955 case ADDRESS_REG_UXTW:
9956 cost += addr_cost->register_zextend;
9957 break;
9958
9959 default:
9960 gcc_unreachable ();
9961 }
9962
9963
9964 if (info.shift > 0)
9965 {
9966 /* For the sake of calculating the cost of the shifted register
9967 component, we can treat same sized modes in the same way. */
9968 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9969 cost += addr_cost->addr_scale_costs.hi;
9970 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9971 cost += addr_cost->addr_scale_costs.si;
9972 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9973 cost += addr_cost->addr_scale_costs.di;
9974 else
9975 /* We can't tell, or this is a 128-bit vector. */
9976 cost += addr_cost->addr_scale_costs.ti;
9977 }
9978
9979 return cost;
9980 }
9981
9982 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9983 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9984 to be taken. */
9985
9986 int
9987 aarch64_branch_cost (bool speed_p, bool predictable_p)
9988 {
9989 /* When optimizing for speed, use the cost of unpredictable branches. */
9990 const struct cpu_branch_cost *branch_costs =
9991 aarch64_tune_params.branch_costs;
9992
9993 if (!speed_p || predictable_p)
9994 return branch_costs->predictable;
9995 else
9996 return branch_costs->unpredictable;
9997 }
9998
9999 /* Return true if the RTX X in mode MODE is a zero or sign extract
10000 usable in an ADD or SUB (extended register) instruction. */
10001 static bool
10002 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10003 {
10004 /* Catch add with a sign extract.
10005 This is add_<optab><mode>_multp2. */
10006 if (GET_CODE (x) == SIGN_EXTRACT
10007 || GET_CODE (x) == ZERO_EXTRACT)
10008 {
10009 rtx op0 = XEXP (x, 0);
10010 rtx op1 = XEXP (x, 1);
10011 rtx op2 = XEXP (x, 2);
10012
10013 if (GET_CODE (op0) == MULT
10014 && CONST_INT_P (op1)
10015 && op2 == const0_rtx
10016 && CONST_INT_P (XEXP (op0, 1))
10017 && aarch64_is_extend_from_extract (mode,
10018 XEXP (op0, 1),
10019 op1))
10020 {
10021 return true;
10022 }
10023 }
10024 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10025 No shift. */
10026 else if (GET_CODE (x) == SIGN_EXTEND
10027 || GET_CODE (x) == ZERO_EXTEND)
10028 return REG_P (XEXP (x, 0));
10029
10030 return false;
10031 }
10032
10033 static bool
10034 aarch64_frint_unspec_p (unsigned int u)
10035 {
10036 switch (u)
10037 {
10038 case UNSPEC_FRINTZ:
10039 case UNSPEC_FRINTP:
10040 case UNSPEC_FRINTM:
10041 case UNSPEC_FRINTA:
10042 case UNSPEC_FRINTN:
10043 case UNSPEC_FRINTX:
10044 case UNSPEC_FRINTI:
10045 return true;
10046
10047 default:
10048 return false;
10049 }
10050 }
10051
10052 /* Return true iff X is an rtx that will match an extr instruction
10053 i.e. as described in the *extr<mode>5_insn family of patterns.
10054 OP0 and OP1 will be set to the operands of the shifts involved
10055 on success and will be NULL_RTX otherwise. */
10056
10057 static bool
10058 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10059 {
10060 rtx op0, op1;
10061 scalar_int_mode mode;
10062 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10063 return false;
10064
10065 *res_op0 = NULL_RTX;
10066 *res_op1 = NULL_RTX;
10067
10068 if (GET_CODE (x) != IOR)
10069 return false;
10070
10071 op0 = XEXP (x, 0);
10072 op1 = XEXP (x, 1);
10073
10074 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10075 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10076 {
10077 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10078 if (GET_CODE (op1) == ASHIFT)
10079 std::swap (op0, op1);
10080
10081 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10082 return false;
10083
10084 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10085 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10086
10087 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10088 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10089 {
10090 *res_op0 = XEXP (op0, 0);
10091 *res_op1 = XEXP (op1, 0);
10092 return true;
10093 }
10094 }
10095
10096 return false;
10097 }
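/* For example, in DImode
     (ior (ashift x (const_int 48)) (lshiftrt y (const_int 16)))
   passes the check above (48 + 16 == 64), with *RES_OP0 = x and
   *RES_OP1 = y; it corresponds to "extr xd, xn, xm, #16" where xn and
   xm hold x and y.  */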
10098
10099 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10100 storing it in *COST. Result is true if the total cost of the operation
10101 has now been calculated. */
10102 static bool
10103 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10104 {
10105 rtx inner;
10106 rtx comparator;
10107 enum rtx_code cmpcode;
10108
10109 if (COMPARISON_P (op0))
10110 {
10111 inner = XEXP (op0, 0);
10112 comparator = XEXP (op0, 1);
10113 cmpcode = GET_CODE (op0);
10114 }
10115 else
10116 {
10117 inner = op0;
10118 comparator = const0_rtx;
10119 cmpcode = NE;
10120 }
10121
10122 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10123 {
10124 /* Conditional branch. */
10125 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10126 return true;
10127 else
10128 {
10129 if (cmpcode == NE || cmpcode == EQ)
10130 {
10131 if (comparator == const0_rtx)
10132 {
10133 /* TBZ/TBNZ/CBZ/CBNZ. */
10134 if (GET_CODE (inner) == ZERO_EXTRACT)
10135 /* TBZ/TBNZ. */
10136 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10137 ZERO_EXTRACT, 0, speed);
10138 else
10139 /* CBZ/CBNZ. */
10140 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10141
10142 return true;
10143 }
10144 }
10145 else if (cmpcode == LT || cmpcode == GE)
10146 {
10147 /* TBZ/TBNZ. */
10148 if (comparator == const0_rtx)
10149 return true;
10150 }
10151 }
10152 }
10153 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10154 {
10155 /* CCMP. */
10156 if (GET_CODE (op1) == COMPARE)
10157 {
10158 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10159 if (XEXP (op1, 1) == const0_rtx)
10160 *cost += 1;
10161 if (speed)
10162 {
10163 machine_mode mode = GET_MODE (XEXP (op1, 0));
10164 const struct cpu_cost_table *extra_cost
10165 = aarch64_tune_params.insn_extra_cost;
10166
10167 if (GET_MODE_CLASS (mode) == MODE_INT)
10168 *cost += extra_cost->alu.arith;
10169 else
10170 *cost += extra_cost->fp[mode == DFmode].compare;
10171 }
10172 return true;
10173 }
10174
10175 /* It's a conditional operation based on the status flags,
10176 so it must be some flavor of CSEL. */
10177
10178 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10179 if (GET_CODE (op1) == NEG
10180 || GET_CODE (op1) == NOT
10181 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10182 op1 = XEXP (op1, 0);
10183 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10184 {
10185 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10186 op1 = XEXP (op1, 0);
10187 op2 = XEXP (op2, 0);
10188 }
10189
10190 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10191 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10192 return true;
10193 }
10194
10195 /* We don't know what this is, cost all operands. */
10196 return false;
10197 }
10198
10199 /* Check whether X is a bitfield operation of the form shift + extend that
10200 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10201 operand to which the bitfield operation is applied. Otherwise return
10202 NULL_RTX. */
10203
10204 static rtx
10205 aarch64_extend_bitfield_pattern_p (rtx x)
10206 {
10207 rtx_code outer_code = GET_CODE (x);
10208 machine_mode outer_mode = GET_MODE (x);
10209
10210 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10211 && outer_mode != SImode && outer_mode != DImode)
10212 return NULL_RTX;
10213
10214 rtx inner = XEXP (x, 0);
10215 rtx_code inner_code = GET_CODE (inner);
10216 machine_mode inner_mode = GET_MODE (inner);
10217 rtx op = NULL_RTX;
10218
10219 switch (inner_code)
10220 {
10221 case ASHIFT:
10222 if (CONST_INT_P (XEXP (inner, 1))
10223 && (inner_mode == QImode || inner_mode == HImode))
10224 op = XEXP (inner, 0);
10225 break;
10226 case LSHIFTRT:
10227 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10228 && (inner_mode == QImode || inner_mode == HImode))
10229 op = XEXP (inner, 0);
10230 break;
10231 case ASHIFTRT:
10232 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10233 && (inner_mode == QImode || inner_mode == HImode))
10234 op = XEXP (inner, 0);
10235 break;
10236 default:
10237 break;
10238 }
10239
10240 return op;
10241 }
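/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   returns the inner register: the shift plus extension extracts bits
   3..15 of x, which maps to a single "ubfx wd, wn, #3, #13" rather
   than a separate shift and extend.  */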
10242
10243 /* Return true if the mask and a shift amount from an RTX of the form
10244 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10245 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10246
10247 bool
10248 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10249 rtx shft_amnt)
10250 {
10251 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10252 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10253 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10254 && (INTVAL (mask)
10255 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10256 }
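/* For example, with MODE == SImode, MASK == 0x3f0 and SHFT_AMNT == 4:
   the shift is below the mode size, (0x3f0 >> 4) + 1 == 0x40 is a
   power of two, and no mask bits lie below the shift, so this returns
   true and (x << 4) & 0x3f0 can be emitted as "ubfiz wd, wn, #4, #6".  */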
10257
10258 /* Return true if the masks and a shift amount from an RTX of the form
10259 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10260 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10261
10262 bool
10263 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10264 unsigned HOST_WIDE_INT mask1,
10265 unsigned HOST_WIDE_INT shft_amnt,
10266 unsigned HOST_WIDE_INT mask2)
10267 {
10268 unsigned HOST_WIDE_INT t;
10269
10270 /* Verify that there is no overlap in what bits are set in the two masks. */
10271 if (mask1 != ~mask2)
10272 return false;
10273
10274 /* Verify that mask2 is not all zeros or ones. */
10275 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10276 return false;
10277
10278 /* The shift amount should always be less than the mode size. */
10279 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10280
10281 /* Verify that the mask being shifted is contiguous and would be in the
10282 least significant bits after shifting by shft_amnt. */
10283 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10284 return (t == (t & -t));
10285 }
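/* For example, MASK2 == 0xff0, SHFT_AMNT == 4 and MASK1 == ~0xff0 pass
   all of the checks above (0xff0 + 0x10 == 0x1000 is a power of two),
   so ((x & ~0xff0) | ((y << 4) & 0xff0)) can be emitted as
   "bfi xd, xn, #4, #8", inserting the low 8 bits of y at bit 4.  */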
10286
10287 /* Calculate the cost of calculating X, storing it in *COST. Result
10288 is true if the total cost of the operation has now been calculated. */
10289 static bool
10290 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10291 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10292 {
10293 rtx op0, op1, op2;
10294 const struct cpu_cost_table *extra_cost
10295 = aarch64_tune_params.insn_extra_cost;
10296 int code = GET_CODE (x);
10297 scalar_int_mode int_mode;
10298
10299 /* By default, assume that everything has equivalent cost to the
10300 cheapest instruction. Any additional costs are applied as a delta
10301 above this default. */
10302 *cost = COSTS_N_INSNS (1);
10303
10304 switch (code)
10305 {
10306 case SET:
10307 /* The cost depends entirely on the operands to SET. */
10308 *cost = 0;
10309 op0 = SET_DEST (x);
10310 op1 = SET_SRC (x);
10311
10312 switch (GET_CODE (op0))
10313 {
10314 case MEM:
10315 if (speed)
10316 {
10317 rtx address = XEXP (op0, 0);
10318 if (VECTOR_MODE_P (mode))
10319 *cost += extra_cost->ldst.storev;
10320 else if (GET_MODE_CLASS (mode) == MODE_INT)
10321 *cost += extra_cost->ldst.store;
10322 else if (mode == SFmode)
10323 *cost += extra_cost->ldst.storef;
10324 else if (mode == DFmode)
10325 *cost += extra_cost->ldst.stored;
10326
10327 *cost +=
10328 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10329 0, speed));
10330 }
10331
10332 *cost += rtx_cost (op1, mode, SET, 1, speed);
10333 return true;
10334
10335 case SUBREG:
10336 if (! REG_P (SUBREG_REG (op0)))
10337 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10338
10339 /* Fall through. */
10340 case REG:
10341 /* The cost is one per vector-register copied. */
10342 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10343 {
10344 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10345 *cost = COSTS_N_INSNS (nregs);
10346 }
10347 /* const0_rtx is in general free, but we will use an
10348 instruction to set a register to 0. */
10349 else if (REG_P (op1) || op1 == const0_rtx)
10350 {
10351 /* The cost is 1 per register copied. */
10352 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10353 *cost = COSTS_N_INSNS (nregs);
10354 }
10355 else
10356 /* Cost is just the cost of the RHS of the set. */
10357 *cost += rtx_cost (op1, mode, SET, 1, speed);
10358 return true;
10359
10360 case ZERO_EXTRACT:
10361 case SIGN_EXTRACT:
10362 /* Bit-field insertion. Strip any redundant widening of
10363 the RHS to meet the width of the target. */
10364 if (GET_CODE (op1) == SUBREG)
10365 op1 = SUBREG_REG (op1);
10366 if ((GET_CODE (op1) == ZERO_EXTEND
10367 || GET_CODE (op1) == SIGN_EXTEND)
10368 && CONST_INT_P (XEXP (op0, 1))
10369 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10370 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10371 op1 = XEXP (op1, 0);
10372
10373 if (CONST_INT_P (op1))
10374 {
10375 /* MOV immediate is assumed to always be cheap. */
10376 *cost = COSTS_N_INSNS (1);
10377 }
10378 else
10379 {
10380 /* BFM. */
10381 if (speed)
10382 *cost += extra_cost->alu.bfi;
10383 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10384 }
10385
10386 return true;
10387
10388 default:
10389 /* We can't make sense of this, assume default cost. */
10390 *cost = COSTS_N_INSNS (1);
10391 return false;
10392 }
10393 return false;
10394
10395 case CONST_INT:
10396 /* If an instruction can incorporate a constant within the
10397 instruction, the instruction's expression avoids calling
10398 rtx_cost() on the constant. If rtx_cost() is called on a
10399 constant, then it is usually because the constant must be
10400 moved into a register by one or more instructions.
10401
10402 The exception is constant 0, which can be expressed
10403 as XZR/WZR and is therefore free. The exception to this is
10404 if we have (set (reg) (const0_rtx)) in which case we must cost
10405 the move. However, we can catch that when we cost the SET, so
10406 we don't need to consider that here. */
10407 if (x == const0_rtx)
10408 *cost = 0;
10409 else
10410 {
10411 /* To an approximation, building any other constant is
10412 proportionally expensive to the number of instructions
10413 required to build that constant. This is true whether we
10414 are compiling for SPEED or otherwise. */
10415 if (!is_a <scalar_int_mode> (mode, &int_mode))
10416 int_mode = word_mode;
10417 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10418 (NULL_RTX, x, false, int_mode));
10419 }
10420 return true;
10421
10422 case CONST_DOUBLE:
10423
10424 /* First determine number of instructions to do the move
10425 as an integer constant. */
10426 if (!aarch64_float_const_representable_p (x)
10427 && !aarch64_can_const_movi_rtx_p (x, mode)
10428 && aarch64_float_const_rtx_p (x))
10429 {
10430 unsigned HOST_WIDE_INT ival;
10431 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10432 gcc_assert (succeed);
10433
10434 scalar_int_mode imode = (mode == HFmode
10435 ? SImode
10436 : int_mode_for_mode (mode).require ());
10437 int ncost = aarch64_internal_mov_immediate
10438 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10439 *cost += COSTS_N_INSNS (ncost);
10440 return true;
10441 }
10442
10443 if (speed)
10444 {
10445 /* mov[df,sf]_aarch64. */
10446 if (aarch64_float_const_representable_p (x))
10447 /* FMOV (scalar immediate). */
10448 *cost += extra_cost->fp[mode == DFmode].fpconst;
10449 else if (!aarch64_float_const_zero_rtx_p (x))
10450 {
10451 /* This will be a load from memory. */
10452 if (mode == DFmode)
10453 *cost += extra_cost->ldst.loadd;
10454 else
10455 *cost += extra_cost->ldst.loadf;
10456 }
10457 else
10458 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10459 or MOV v0.s[0], wzr - neither of which are modeled by the
10460 cost tables. Just use the default cost. */
10461 {
10462 }
10463 }
10464
10465 return true;
10466
10467 case MEM:
10468 if (speed)
10469 {
10470 /* For loads we want the base cost of a load, plus an
10471 approximation for the additional cost of the addressing
10472 mode. */
10473 rtx address = XEXP (x, 0);
10474 if (VECTOR_MODE_P (mode))
10475 *cost += extra_cost->ldst.loadv;
10476 else if (GET_MODE_CLASS (mode) == MODE_INT)
10477 *cost += extra_cost->ldst.load;
10478 else if (mode == SFmode)
10479 *cost += extra_cost->ldst.loadf;
10480 else if (mode == DFmode)
10481 *cost += extra_cost->ldst.loadd;
10482
10483 *cost +=
10484 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10485 0, speed));
10486 }
10487
10488 return true;
10489
10490 case NEG:
10491 op0 = XEXP (x, 0);
10492
10493 if (VECTOR_MODE_P (mode))
10494 {
10495 if (speed)
10496 {
10497 /* FNEG. */
10498 *cost += extra_cost->vect.alu;
10499 }
10500 return false;
10501 }
10502
10503 if (GET_MODE_CLASS (mode) == MODE_INT)
10504 {
10505 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10506 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10507 {
10508 /* CSETM. */
10509 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10510 return true;
10511 }
10512
10513 /* Cost this as SUB wzr, X. */
10514 op0 = CONST0_RTX (mode);
10515 op1 = XEXP (x, 0);
10516 goto cost_minus;
10517 }
10518
10519 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10520 {
10521 /* Support (neg(fma...)) as a single instruction only if
10522 sign of zeros is unimportant. This matches the decision
10523 making in aarch64.md. */
10524 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10525 {
10526 /* FNMADD. */
10527 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10528 return true;
10529 }
10530 if (GET_CODE (op0) == MULT)
10531 {
10532 /* FNMUL. */
10533 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10534 return true;
10535 }
10536 if (speed)
10537 /* FNEG. */
10538 *cost += extra_cost->fp[mode == DFmode].neg;
10539 return false;
10540 }
10541
10542 return false;
10543
10544 case CLRSB:
10545 case CLZ:
10546 if (speed)
10547 {
10548 if (VECTOR_MODE_P (mode))
10549 *cost += extra_cost->vect.alu;
10550 else
10551 *cost += extra_cost->alu.clz;
10552 }
10553
10554 return false;
10555
10556 case COMPARE:
10557 op0 = XEXP (x, 0);
10558 op1 = XEXP (x, 1);
10559
10560 if (op1 == const0_rtx
10561 && GET_CODE (op0) == AND)
10562 {
10563 x = op0;
10564 mode = GET_MODE (op0);
10565 goto cost_logic;
10566 }
10567
10568 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10569 {
10570 /* TODO: A write to the CC flags possibly costs extra; this
10571 needs encoding in the cost tables. */
10572
10573 mode = GET_MODE (op0);
10574 /* ANDS. */
10575 if (GET_CODE (op0) == AND)
10576 {
10577 x = op0;
10578 goto cost_logic;
10579 }
10580
10581 if (GET_CODE (op0) == PLUS)
10582 {
10583 /* ADDS (and CMN alias). */
10584 x = op0;
10585 goto cost_plus;
10586 }
10587
10588 if (GET_CODE (op0) == MINUS)
10589 {
10590 /* SUBS. */
10591 x = op0;
10592 goto cost_minus;
10593 }
10594
10595 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10596 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10597 && CONST_INT_P (XEXP (op0, 2)))
10598 {
10599 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10600 Handle it here directly rather than going to cost_logic
10601 since we know the immediate generated for the TST is valid
10602 so we can avoid creating an intermediate rtx for it only
10603 for costing purposes. */
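/* Illustrative shape of the RTL handled here (WIDTH and POS stand for
   the extracted field's size and position):

     (compare (zero_extract (reg) (const_int WIDTH) (const_int POS))
              (const_int 0))

   which corresponds to a single TST reg, #imm with a contiguous mask. */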
10604 if (speed)
10605 *cost += extra_cost->alu.logical;
10606
10607 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10608 ZERO_EXTRACT, 0, speed);
10609 return true;
10610 }
10611
10612 if (GET_CODE (op1) == NEG)
10613 {
10614 /* CMN. */
10615 if (speed)
10616 *cost += extra_cost->alu.arith;
10617
10618 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10619 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10620 return true;
10621 }
10622
10623 /* CMP.
10624
10625 Compare can freely swap the order of operands, and
10626 canonicalization puts the more complex operation first.
10627 But the integer MINUS logic expects the shift/extend
10628 operation in op1. */
10629 if (! (REG_P (op0)
10630 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10631 {
10632 op0 = XEXP (x, 1);
10633 op1 = XEXP (x, 0);
10634 }
10635 goto cost_minus;
10636 }
10637
10638 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10639 {
10640 /* FCMP. */
10641 if (speed)
10642 *cost += extra_cost->fp[mode == DFmode].compare;
10643
10644 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10645 {
10646 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10647 /* FCMP supports constant 0.0 for no extra cost. */
10648 return true;
10649 }
10650 return false;
10651 }
10652
10653 if (VECTOR_MODE_P (mode))
10654 {
10655 /* Vector compare. */
10656 if (speed)
10657 *cost += extra_cost->vect.alu;
10658
10659 if (aarch64_float_const_zero_rtx_p (op1))
10660 {
10661 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10662 cost. */
10663 return true;
10664 }
10665 return false;
10666 }
10667 return false;
10668
10669 case MINUS:
10670 {
10671 op0 = XEXP (x, 0);
10672 op1 = XEXP (x, 1);
10673
10674 cost_minus:
10675 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10676
10677 /* Detect valid immediates. */
10678 if ((GET_MODE_CLASS (mode) == MODE_INT
10679 || (GET_MODE_CLASS (mode) == MODE_CC
10680 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10681 && CONST_INT_P (op1)
10682 && aarch64_uimm12_shift (INTVAL (op1)))
10683 {
10684 if (speed)
10685 /* SUB(S) (immediate). */
10686 *cost += extra_cost->alu.arith;
10687 return true;
10688 }
10689
10690 /* Look for SUB (extended register). */
10691 if (is_a <scalar_int_mode> (mode, &int_mode)
10692 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10693 {
10694 if (speed)
10695 *cost += extra_cost->alu.extend_arith;
10696
10697 op1 = aarch64_strip_extend (op1, true);
10698 *cost += rtx_cost (op1, VOIDmode,
10699 (enum rtx_code) GET_CODE (op1), 0, speed);
10700 return true;
10701 }
10702
10703 rtx new_op1 = aarch64_strip_extend (op1, false);
10704
10705 /* Cost this as an FMA-alike operation. */
10706 if ((GET_CODE (new_op1) == MULT
10707 || aarch64_shift_p (GET_CODE (new_op1)))
10708 && code != COMPARE)
10709 {
10710 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10711 (enum rtx_code) code,
10712 speed);
10713 return true;
10714 }
10715
10716 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10717
10718 if (speed)
10719 {
10720 if (VECTOR_MODE_P (mode))
10721 {
10722 /* Vector SUB. */
10723 *cost += extra_cost->vect.alu;
10724 }
10725 else if (GET_MODE_CLASS (mode) == MODE_INT)
10726 {
10727 /* SUB(S). */
10728 *cost += extra_cost->alu.arith;
10729 }
10730 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10731 {
10732 /* FSUB. */
10733 *cost += extra_cost->fp[mode == DFmode].addsub;
10734 }
10735 }
10736 return true;
10737 }
10738
10739 case PLUS:
10740 {
10741 rtx new_op0;
10742
10743 op0 = XEXP (x, 0);
10744 op1 = XEXP (x, 1);
10745
10746 cost_plus:
10747 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10748 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10749 {
10750 /* CSINC. */
10751 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10752 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10753 return true;
10754 }
10755
10756 if (GET_MODE_CLASS (mode) == MODE_INT
10757 && (aarch64_plus_immediate (op1, mode)
10758 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10759 {
10760 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10761
10762 if (speed)
10763 /* ADD (immediate). */
10764 *cost += extra_cost->alu.arith;
10765 return true;
10766 }
10767
10768 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10769
10770 /* Look for ADD (extended register). */
10771 if (is_a <scalar_int_mode> (mode, &int_mode)
10772 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10773 {
10774 if (speed)
10775 *cost += extra_cost->alu.extend_arith;
10776
10777 op0 = aarch64_strip_extend (op0, true);
10778 *cost += rtx_cost (op0, VOIDmode,
10779 (enum rtx_code) GET_CODE (op0), 0, speed);
10780 return true;
10781 }
10782
10783 /* Strip any extend, leave shifts behind as we will
10784 cost them through mult_cost. */
10785 new_op0 = aarch64_strip_extend (op0, false);
10786
10787 if (GET_CODE (new_op0) == MULT
10788 || aarch64_shift_p (GET_CODE (new_op0)))
10789 {
10790 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10791 speed);
10792 return true;
10793 }
10794
10795 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10796
10797 if (speed)
10798 {
10799 if (VECTOR_MODE_P (mode))
10800 {
10801 /* Vector ADD. */
10802 *cost += extra_cost->vect.alu;
10803 }
10804 else if (GET_MODE_CLASS (mode) == MODE_INT)
10805 {
10806 /* ADD. */
10807 *cost += extra_cost->alu.arith;
10808 }
10809 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10810 {
10811 /* FADD. */
10812 *cost += extra_cost->fp[mode == DFmode].addsub;
10813 }
10814 }
10815 return true;
10816 }
10817
10818 case BSWAP:
10819 *cost = COSTS_N_INSNS (1);
10820
10821 if (speed)
10822 {
10823 if (VECTOR_MODE_P (mode))
10824 *cost += extra_cost->vect.alu;
10825 else
10826 *cost += extra_cost->alu.rev;
10827 }
10828 return false;
10829
10830 case IOR:
10831 if (aarch_rev16_p (x))
10832 {
10833 *cost = COSTS_N_INSNS (1);
10834
10835 if (speed)
10836 {
10837 if (VECTOR_MODE_P (mode))
10838 *cost += extra_cost->vect.alu;
10839 else
10840 *cost += extra_cost->alu.rev;
10841 }
10842 return true;
10843 }
10844
10845 if (aarch64_extr_rtx_p (x, &op0, &op1))
10846 {
10847 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10848 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10849 if (speed)
10850 *cost += extra_cost->alu.shift;
10851
10852 return true;
10853 }
10854 /* Fall through. */
10855 case XOR:
10856 case AND:
10857 cost_logic:
10858 op0 = XEXP (x, 0);
10859 op1 = XEXP (x, 1);
10860
10861 if (VECTOR_MODE_P (mode))
10862 {
10863 if (speed)
10864 *cost += extra_cost->vect.alu;
10865 return true;
10866 }
10867
10868 if (code == AND
10869 && GET_CODE (op0) == MULT
10870 && CONST_INT_P (XEXP (op0, 1))
10871 && CONST_INT_P (op1)
10872 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10873 INTVAL (op1)) != 0)
10874 {
10875 /* This is a UBFM/SBFM. */
10876 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10877 if (speed)
10878 *cost += extra_cost->alu.bfx;
10879 return true;
10880 }
10881
10882 if (is_int_mode (mode, &int_mode))
10883 {
10884 if (CONST_INT_P (op1))
10885 {
10886 /* We have a mask + shift version of a UBFIZ
10887 i.e. the *andim_ashift<mode>_bfiz pattern. */
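/* For example (illustrative only), something like
     (and:SI (ashift:SI (reg) (const_int 3)) (const_int 0x7f8))
   keeps 8 bits starting at bit 3 and can be a single
   UBFIZ w0, w1, #3, #8. */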
10888 if (GET_CODE (op0) == ASHIFT
10889 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10890 XEXP (op0, 1)))
10891 {
10892 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10893 (enum rtx_code) code, 0, speed);
10894 if (speed)
10895 *cost += extra_cost->alu.bfx;
10896
10897 return true;
10898 }
10899 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10900 {
10901 /* We possibly get the immediate for free; this is not
10902 modelled. */
10903 *cost += rtx_cost (op0, int_mode,
10904 (enum rtx_code) code, 0, speed);
10905 if (speed)
10906 *cost += extra_cost->alu.logical;
10907
10908 return true;
10909 }
10910 }
10911 else
10912 {
10913 rtx new_op0 = op0;
10914
10915 /* Handle ORN, EON, or BIC. */
10916 if (GET_CODE (op0) == NOT)
10917 op0 = XEXP (op0, 0);
10918
10919 new_op0 = aarch64_strip_shift (op0);
10920
10921 /* If we had a shift on op0 then this is a logical-shift-
10922 by-register/immediate operation. Otherwise, this is just
10923 a logical operation. */
10924 if (speed)
10925 {
10926 if (new_op0 != op0)
10927 {
10928 /* Shift by immediate. */
10929 if (CONST_INT_P (XEXP (op0, 1)))
10930 *cost += extra_cost->alu.log_shift;
10931 else
10932 *cost += extra_cost->alu.log_shift_reg;
10933 }
10934 else
10935 *cost += extra_cost->alu.logical;
10936 }
10937
10938 /* In both cases we want to cost both operands. */
10939 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10940 0, speed);
10941 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10942 1, speed);
10943
10944 return true;
10945 }
10946 }
10947 return false;
10948
10949 case NOT:
10950 x = XEXP (x, 0);
10951 op0 = aarch64_strip_shift (x);
10952
10953 if (VECTOR_MODE_P (mode))
10954 {
10955 /* Vector NOT. */
10956 *cost += extra_cost->vect.alu;
10957 return false;
10958 }
10959
10960 /* MVN-shifted-reg. */
10961 if (op0 != x)
10962 {
10963 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10964
10965 if (speed)
10966 *cost += extra_cost->alu.log_shift;
10967
10968 return true;
10969 }
10970 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10971 Handle the second form here taking care that 'a' in the above can
10972 be a shift. */
10973 else if (GET_CODE (op0) == XOR)
10974 {
10975 rtx newop0 = XEXP (op0, 0);
10976 rtx newop1 = XEXP (op0, 1);
10977 rtx op0_stripped = aarch64_strip_shift (newop0);
10978
10979 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10980 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10981
10982 if (speed)
10983 {
10984 if (op0_stripped != newop0)
10985 *cost += extra_cost->alu.log_shift;
10986 else
10987 *cost += extra_cost->alu.logical;
10988 }
10989
10990 return true;
10991 }
10992 /* MVN. */
10993 if (speed)
10994 *cost += extra_cost->alu.logical;
10995
10996 return false;
10997
10998 case ZERO_EXTEND:
10999
11000 op0 = XEXP (x, 0);
11001 /* If a value is written in SI mode, then zero extended to DI
11002 mode, the operation will in general be free as a write to
11003 a 'w' register implicitly zeroes the upper bits of an 'x'
11004 register. However, if this is
11005
11006 (set (reg) (zero_extend (reg)))
11007
11008 we must cost the explicit register move. */
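/* For example, the SImode result of "add w0, w1, w2" is already
   zero-extended into x0 by the hardware, whereas a bare
   (set (reg:DI) (zero_extend:DI (reg:SI))) still needs a register
   move such as "mov w0, w1" (or similar) and must be costed. */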
11009 if (mode == DImode
11010 && GET_MODE (op0) == SImode
11011 && outer == SET)
11012 {
11013 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11014
11015 /* If OP_COST is non-zero, then the cost of the zero extend
11016 is effectively the cost of the inner operation. Otherwise
11017 we have a MOV instruction and we take the cost from the MOV
11018 itself. This is true independently of whether we are
11019 optimizing for space or time. */
11020 if (op_cost)
11021 *cost = op_cost;
11022
11023 return true;
11024 }
11025 else if (MEM_P (op0))
11026 {
11027 /* All loads can zero extend to any size for free. */
11028 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11029 return true;
11030 }
11031
11032 op0 = aarch64_extend_bitfield_pattern_p (x);
11033 if (op0)
11034 {
11035 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11036 if (speed)
11037 *cost += extra_cost->alu.bfx;
11038 return true;
11039 }
11040
11041 if (speed)
11042 {
11043 if (VECTOR_MODE_P (mode))
11044 {
11045 /* UMOV. */
11046 *cost += extra_cost->vect.alu;
11047 }
11048 else
11049 {
11050 /* We generate an AND instead of UXTB/UXTH. */
11051 *cost += extra_cost->alu.logical;
11052 }
11053 }
11054 return false;
11055
11056 case SIGN_EXTEND:
11057 if (MEM_P (XEXP (x, 0)))
11058 {
11059 /* LDRSH. */
11060 if (speed)
11061 {
11062 rtx address = XEXP (XEXP (x, 0), 0);
11063 *cost += extra_cost->ldst.load_sign_extend;
11064
11065 *cost +=
11066 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11067 0, speed));
11068 }
11069 return true;
11070 }
11071
11072 op0 = aarch64_extend_bitfield_pattern_p (x);
11073 if (op0)
11074 {
11075 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11076 if (speed)
11077 *cost += extra_cost->alu.bfx;
11078 return true;
11079 }
11080
11081 if (speed)
11082 {
11083 if (VECTOR_MODE_P (mode))
11084 *cost += extra_cost->vect.alu;
11085 else
11086 *cost += extra_cost->alu.extend;
11087 }
11088 return false;
11089
11090 case ASHIFT:
11091 op0 = XEXP (x, 0);
11092 op1 = XEXP (x, 1);
11093
11094 if (CONST_INT_P (op1))
11095 {
11096 if (speed)
11097 {
11098 if (VECTOR_MODE_P (mode))
11099 {
11100 /* Vector shift (immediate). */
11101 *cost += extra_cost->vect.alu;
11102 }
11103 else
11104 {
11105 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11106 aliases. */
11107 *cost += extra_cost->alu.shift;
11108 }
11109 }
11110
11111 /* We can incorporate zero/sign extend for free. */
11112 if (GET_CODE (op0) == ZERO_EXTEND
11113 || GET_CODE (op0) == SIGN_EXTEND)
11114 op0 = XEXP (op0, 0);
11115
11116 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11117 return true;
11118 }
11119 else
11120 {
11121 if (VECTOR_MODE_P (mode))
11122 {
11123 if (speed)
11124 /* Vector shift (register). */
11125 *cost += extra_cost->vect.alu;
11126 }
11127 else
11128 {
11129 if (speed)
11130 /* LSLV. */
11131 *cost += extra_cost->alu.shift_reg;
11132
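/* Illustration: the variable-shift instructions (LSLV and friends)
   use the shift amount modulo the register width, so a shift amount
   of the form (and X (bitsize - 1)) needs no separate AND and is
   handled for free below. */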
11133 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11134 && CONST_INT_P (XEXP (op1, 1))
11135 && known_eq (INTVAL (XEXP (op1, 1)),
11136 GET_MODE_BITSIZE (mode) - 1))
11137 {
11138 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11139 /* We already demanded XEXP (op1, 0) to be REG_P, so
11140 don't recurse into it. */
11141 return true;
11142 }
11143 }
11144 return false; /* All arguments need to be in registers. */
11145 }
11146
11147 case ROTATE:
11148 case ROTATERT:
11149 case LSHIFTRT:
11150 case ASHIFTRT:
11151 op0 = XEXP (x, 0);
11152 op1 = XEXP (x, 1);
11153
11154 if (CONST_INT_P (op1))
11155 {
11156 /* ASR (immediate) and friends. */
11157 if (speed)
11158 {
11159 if (VECTOR_MODE_P (mode))
11160 *cost += extra_cost->vect.alu;
11161 else
11162 *cost += extra_cost->alu.shift;
11163 }
11164
11165 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11166 return true;
11167 }
11168 else
11169 {
11170 if (VECTOR_MODE_P (mode))
11171 {
11172 if (speed)
11173 /* Vector shift (register). */
11174 *cost += extra_cost->vect.alu;
11175 }
11176 else
11177 {
11178 if (speed)
11179 /* ASR (register) and friends. */
11180 *cost += extra_cost->alu.shift_reg;
11181
11182 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11183 && CONST_INT_P (XEXP (op1, 1))
11184 && known_eq (INTVAL (XEXP (op1, 1)),
11185 GET_MODE_BITSIZE (mode) - 1))
11186 {
11187 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11188 /* We already demanded XEXP (op1, 0) to be REG_P, so
11189 don't recurse into it. */
11190 return true;
11191 }
11192 }
11193 return false; /* All arguments need to be in registers. */
11194 }
11195
11196 case SYMBOL_REF:
11197
11198 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11199 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11200 {
11201 /* LDR. */
11202 if (speed)
11203 *cost += extra_cost->ldst.load;
11204 }
11205 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11206 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11207 {
11208 /* ADRP, followed by ADD. */
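/* e.g.  adrp x0, sym
         add  x0, x0, :lo12:sym  */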
11209 *cost += COSTS_N_INSNS (1);
11210 if (speed)
11211 *cost += 2 * extra_cost->alu.arith;
11212 }
11213 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11214 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11215 {
11216 /* ADR. */
11217 if (speed)
11218 *cost += extra_cost->alu.arith;
11219 }
11220
11221 if (flag_pic)
11222 {
11223 /* One extra load instruction, after accessing the GOT. */
11224 *cost += COSTS_N_INSNS (1);
11225 if (speed)
11226 *cost += extra_cost->ldst.load;
11227 }
11228 return true;
11229
11230 case HIGH:
11231 case LO_SUM:
11232 /* ADRP/ADD (immediate). */
11233 if (speed)
11234 *cost += extra_cost->alu.arith;
11235 return true;
11236
11237 case ZERO_EXTRACT:
11238 case SIGN_EXTRACT:
11239 /* UBFX/SBFX. */
11240 if (speed)
11241 {
11242 if (VECTOR_MODE_P (mode))
11243 *cost += extra_cost->vect.alu;
11244 else
11245 *cost += extra_cost->alu.bfx;
11246 }
11247
11248 /* We can trust that the immediates used will be correct (there
11249 are no by-register forms), so we need only cost op0. */
11250 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11251 return true;
11252
11253 case MULT:
11254 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11255 /* aarch64_rtx_mult_cost always handles recursion to its
11256 operands. */
11257 return true;
11258
11259 case MOD:
11260 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11261 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11262 an unconditional negate. This case should only ever be reached through
11263 the set_smod_pow2_cheap check in expmed.c. */
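/* For example, x % 4 in SImode is expected to expand to roughly:

     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi

   (the exact sequence is decided by the expander); hence the baseline
   of four instructions below. */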
11264 if (CONST_INT_P (XEXP (x, 1))
11265 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11266 && (mode == SImode || mode == DImode))
11267 {
11268 /* We expand to 4 instructions. Reset the baseline. */
11269 *cost = COSTS_N_INSNS (4);
11270
11271 if (speed)
11272 *cost += 2 * extra_cost->alu.logical
11273 + 2 * extra_cost->alu.arith;
11274
11275 return true;
11276 }
11277
11278 /* Fall-through. */
11279 case UMOD:
11280 if (speed)
11281 {
11282 /* Slightly prefer UMOD over SMOD. */
11283 if (VECTOR_MODE_P (mode))
11284 *cost += extra_cost->vect.alu;
11285 else if (GET_MODE_CLASS (mode) == MODE_INT)
11286 *cost += (extra_cost->mult[mode == DImode].add
11287 + extra_cost->mult[mode == DImode].idiv
11288 + (code == MOD ? 1 : 0));
11289 }
11290 return false; /* All arguments need to be in registers. */
11291
11292 case DIV:
11293 case UDIV:
11294 case SQRT:
11295 if (speed)
11296 {
11297 if (VECTOR_MODE_P (mode))
11298 *cost += extra_cost->vect.alu;
11299 else if (GET_MODE_CLASS (mode) == MODE_INT)
11300 /* There is no integer SQRT, so only DIV and UDIV can get
11301 here. */
11302 *cost += (extra_cost->mult[mode == DImode].idiv
11303 /* Slightly prefer UDIV over SDIV. */
11304 + (code == DIV ? 1 : 0));
11305 else
11306 *cost += extra_cost->fp[mode == DFmode].div;
11307 }
11308 return false; /* All arguments need to be in registers. */
11309
11310 case IF_THEN_ELSE:
11311 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11312 XEXP (x, 2), cost, speed);
11313
11314 case EQ:
11315 case NE:
11316 case GT:
11317 case GTU:
11318 case LT:
11319 case LTU:
11320 case GE:
11321 case GEU:
11322 case LE:
11323 case LEU:
11324
11325 return false; /* All arguments must be in registers. */
11326
11327 case FMA:
11328 op0 = XEXP (x, 0);
11329 op1 = XEXP (x, 1);
11330 op2 = XEXP (x, 2);
11331
11332 if (speed)
11333 {
11334 if (VECTOR_MODE_P (mode))
11335 *cost += extra_cost->vect.alu;
11336 else
11337 *cost += extra_cost->fp[mode == DFmode].fma;
11338 }
11339
11340 /* FMSUB, FNMADD, and FNMSUB are free. */
11341 if (GET_CODE (op0) == NEG)
11342 op0 = XEXP (op0, 0);
11343
11344 if (GET_CODE (op2) == NEG)
11345 op2 = XEXP (op2, 0);
11346
11347 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11348 and the by-element operand as operand 0. */
11349 if (GET_CODE (op1) == NEG)
11350 op1 = XEXP (op1, 0);
11351
11352 /* Catch vector-by-element operations. The by-element operand can
11353 either be (vec_duplicate (vec_select (x))) or just
11354 (vec_select (x)), depending on whether we are multiplying by
11355 a vector or a scalar.
11356
11357 Canonicalization is not very good in these cases: FMA4 will put the
11358 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11359 if (GET_CODE (op0) == VEC_DUPLICATE)
11360 op0 = XEXP (op0, 0);
11361 else if (GET_CODE (op1) == VEC_DUPLICATE)
11362 op1 = XEXP (op1, 0);
11363
11364 if (GET_CODE (op0) == VEC_SELECT)
11365 op0 = XEXP (op0, 0);
11366 else if (GET_CODE (op1) == VEC_SELECT)
11367 op1 = XEXP (op1, 0);
11368
11369 /* If the remaining parameters are not registers,
11370 get the cost to put them into registers. */
11371 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11372 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11373 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11374 return true;
11375
11376 case FLOAT:
11377 case UNSIGNED_FLOAT:
11378 if (speed)
11379 *cost += extra_cost->fp[mode == DFmode].fromint;
11380 return false;
11381
11382 case FLOAT_EXTEND:
11383 if (speed)
11384 {
11385 if (VECTOR_MODE_P (mode))
11386 {
11387 /* Vector widening conversion. */
11388 *cost += extra_cost->vect.alu;
11389 }
11390 else
11391 *cost += extra_cost->fp[mode == DFmode].widen;
11392 }
11393 return false;
11394
11395 case FLOAT_TRUNCATE:
11396 if (speed)
11397 {
11398 if (VECTOR_MODE_P (mode))
11399 {
11400 /* Vector conversion. */
11401 *cost += extra_cost->vect.alu;
11402 }
11403 else
11404 *cost += extra_cost->fp[mode == DFmode].narrow;
11405 }
11406 return false;
11407
11408 case FIX:
11409 case UNSIGNED_FIX:
11410 x = XEXP (x, 0);
11411 /* Strip the rounding part. They will all be implemented
11412 by the fcvt* family of instructions anyway. */
11413 if (GET_CODE (x) == UNSPEC)
11414 {
11415 unsigned int uns_code = XINT (x, 1);
11416
11417 if (uns_code == UNSPEC_FRINTA
11418 || uns_code == UNSPEC_FRINTM
11419 || uns_code == UNSPEC_FRINTN
11420 || uns_code == UNSPEC_FRINTP
11421 || uns_code == UNSPEC_FRINTZ)
11422 x = XVECEXP (x, 0, 0);
11423 }
11424
11425 if (speed)
11426 {
11427 if (VECTOR_MODE_P (mode))
11428 *cost += extra_cost->vect.alu;
11429 else
11430 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11431 }
11432
11433 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11434 fixed-point fcvt. */
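/* For example (illustrative only), (fix:SI (mult:SF (reg:SF)
   (const_double 16.0))) can typically become a single
   "fcvtzs w0, s0, #4" instead of an FMUL followed by an FCVTZS. */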
11435 if (GET_CODE (x) == MULT
11436 && ((VECTOR_MODE_P (mode)
11437 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11438 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11439 {
11440 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11441 0, speed);
11442 return true;
11443 }
11444
11445 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11446 return true;
11447
11448 case ABS:
11449 if (VECTOR_MODE_P (mode))
11450 {
11451 /* ABS (vector). */
11452 if (speed)
11453 *cost += extra_cost->vect.alu;
11454 }
11455 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11456 {
11457 op0 = XEXP (x, 0);
11458
11459 /* FABD, which is analogous to FADD. */
11460 if (GET_CODE (op0) == MINUS)
11461 {
11462 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11463 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11464 if (speed)
11465 *cost += extra_cost->fp[mode == DFmode].addsub;
11466
11467 return true;
11468 }
11469 /* Simple FABS is analogous to FNEG. */
11470 if (speed)
11471 *cost += extra_cost->fp[mode == DFmode].neg;
11472 }
11473 else
11474 {
11475 /* Integer ABS will either be split into
11476 two arithmetic instructions, or will be an ABS
11477 (scalar), which we don't model. */
11478 *cost = COSTS_N_INSNS (2);
11479 if (speed)
11480 *cost += 2 * extra_cost->alu.arith;
11481 }
11482 return false;
11483
11484 case SMAX:
11485 case SMIN:
11486 if (speed)
11487 {
11488 if (VECTOR_MODE_P (mode))
11489 *cost += extra_cost->vect.alu;
11490 else
11491 {
11492 /* FMAXNM/FMINNM/FMAX/FMIN.
11493 TODO: This may not be accurate for all implementations, but
11494 we do not model this in the cost tables. */
11495 *cost += extra_cost->fp[mode == DFmode].addsub;
11496 }
11497 }
11498 return false;
11499
11500 case UNSPEC:
11501 /* The floating point round to integer frint* instructions. */
11502 if (aarch64_frint_unspec_p (XINT (x, 1)))
11503 {
11504 if (speed)
11505 *cost += extra_cost->fp[mode == DFmode].roundint;
11506
11507 return false;
11508 }
11509
11510 if (XINT (x, 1) == UNSPEC_RBIT)
11511 {
11512 if (speed)
11513 *cost += extra_cost->alu.rev;
11514
11515 return false;
11516 }
11517 break;
11518
11519 case TRUNCATE:
11520
11521 /* Decompose <su>muldi3_highpart. */
11522 if (/* (truncate:DI */
11523 mode == DImode
11524 /* (lshiftrt:TI */
11525 && GET_MODE (XEXP (x, 0)) == TImode
11526 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11527 /* (mult:TI */
11528 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11529 /* (ANY_EXTEND:TI (reg:DI))
11530 (ANY_EXTEND:TI (reg:DI))) */
11531 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11532 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11533 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11534 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11535 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11536 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11537 /* (const_int 64) */
11538 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11539 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11540 {
11541 /* UMULH/SMULH. */
11542 if (speed)
11543 *cost += extra_cost->mult[mode == DImode].extend;
11544 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11545 mode, MULT, 0, speed);
11546 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11547 mode, MULT, 1, speed);
11548 return true;
11549 }
11550
11551 /* Fall through. */
11552 default:
11553 break;
11554 }
11555
11556 if (dump_file
11557 && flag_aarch64_verbose_cost)
11558 fprintf (dump_file,
11559 "\nFailed to cost RTX. Assuming default cost.\n");
11560
11561 return true;
11562 }
11563
11564 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11565 calculated for X. This cost is stored in *COST. Returns true
11566 if the total cost of X was calculated. */
11567 static bool
11568 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11569 int param, int *cost, bool speed)
11570 {
11571 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11572
11573 if (dump_file
11574 && flag_aarch64_verbose_cost)
11575 {
11576 print_rtl_single (dump_file, x);
11577 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11578 speed ? "Hot" : "Cold",
11579 *cost, result ? "final" : "partial");
11580 }
11581
11582 return result;
11583 }
11584
11585 static int
11586 aarch64_register_move_cost (machine_mode mode,
11587 reg_class_t from_i, reg_class_t to_i)
11588 {
11589 enum reg_class from = (enum reg_class) from_i;
11590 enum reg_class to = (enum reg_class) to_i;
11591 const struct cpu_regmove_cost *regmove_cost
11592 = aarch64_tune_params.regmove_cost;
11593
11594 /* Tail-call address and pointer regs are treated as GENERAL_REGS. */
11595 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11596 to = GENERAL_REGS;
11597
11598 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11599 from = GENERAL_REGS;
11600
11601 /* Moving between a GPR and the stack register costs the same as GP2GP. */
11602 if ((from == GENERAL_REGS && to == STACK_REG)
11603 || (to == GENERAL_REGS && from == STACK_REG))
11604 return regmove_cost->GP2GP;
11605
11606 /* To/From the stack register, we move via the gprs. */
11607 if (to == STACK_REG || from == STACK_REG)
11608 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11609 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11610
11611 if (known_eq (GET_MODE_SIZE (mode), 16))
11612 {
11613 /* 128-bit operations on general registers require 2 instructions. */
11614 if (from == GENERAL_REGS && to == GENERAL_REGS)
11615 return regmove_cost->GP2GP * 2;
11616 else if (from == GENERAL_REGS)
11617 return regmove_cost->GP2FP * 2;
11618 else if (to == GENERAL_REGS)
11619 return regmove_cost->FP2GP * 2;
11620
11621 /* When AdvSIMD instructions are disabled it is not possible to move
11622 a 128-bit value directly between Q registers. This is handled in
11623 secondary reload. A general register is used as a scratch to move
11624 the upper DI value and the lower DI value is moved directly,
11625 hence the cost is the sum of three moves. */
11626 if (! TARGET_SIMD)
11627 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11628
11629 return regmove_cost->FP2FP;
11630 }
11631
11632 if (from == GENERAL_REGS && to == GENERAL_REGS)
11633 return regmove_cost->GP2GP;
11634 else if (from == GENERAL_REGS)
11635 return regmove_cost->GP2FP;
11636 else if (to == GENERAL_REGS)
11637 return regmove_cost->FP2GP;
11638
11639 return regmove_cost->FP2FP;
11640 }
11641
11642 static int
11643 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11644 reg_class_t rclass ATTRIBUTE_UNUSED,
11645 bool in ATTRIBUTE_UNUSED)
11646 {
11647 return aarch64_tune_params.memmov_cost;
11648 }
11649
11650 /* Implement TARGET_INIT_BUILTINS. */
11651 static void
11652 aarch64_init_builtins ()
11653 {
11654 aarch64_general_init_builtins ();
11655 }
11656
11657 /* Implement TARGET_FOLD_BUILTIN. */
11658 static tree
11659 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
11660 {
11661 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11662 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11663 tree type = TREE_TYPE (TREE_TYPE (fndecl));
11664 switch (code & AARCH64_BUILTIN_CLASS)
11665 {
11666 case AARCH64_BUILTIN_GENERAL:
11667 return aarch64_general_fold_builtin (subcode, type, nargs, args);
11668 }
11669 gcc_unreachable ();
11670 }
11671
11672 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
11673 static bool
11674 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
11675 {
11676 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
11677 tree fndecl = gimple_call_fndecl (stmt);
11678 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11679 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11680 gimple *new_stmt = NULL;
11681 switch (code & AARCH64_BUILTIN_CLASS)
11682 {
11683 case AARCH64_BUILTIN_GENERAL:
11684 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
11685 break;
11686 }
11687
11688 if (!new_stmt)
11689 return false;
11690
11691 gsi_replace (gsi, new_stmt, true);
11692 return true;
11693 }
11694
11695 /* Implement TARGET_EXPAND_BUILTIN. */
11696 static rtx
11697 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int)
11698 {
11699 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11700 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11701 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11702 switch (code & AARCH64_BUILTIN_CLASS)
11703 {
11704 case AARCH64_BUILTIN_GENERAL:
11705 return aarch64_general_expand_builtin (subcode, exp, target);
11706 }
11707 gcc_unreachable ();
11708 }
11709
11710 /* Implement TARGET_BUILTIN_DECL. */
11711 static tree
11712 aarch64_builtin_decl (unsigned int code, bool initialize_p)
11713 {
11714 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11715 switch (code & AARCH64_BUILTIN_CLASS)
11716 {
11717 case AARCH64_BUILTIN_GENERAL:
11718 return aarch64_general_builtin_decl (subcode, initialize_p);
11719 }
11720 gcc_unreachable ();
11721 }
11722
11723 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11724 to optimize 1.0/sqrt. */
11725
11726 static bool
11727 use_rsqrt_p (machine_mode mode)
11728 {
11729 return (!flag_trapping_math
11730 && flag_unsafe_math_optimizations
11731 && ((aarch64_tune_params.approx_modes->recip_sqrt
11732 & AARCH64_APPROX_MODE (mode))
11733 || flag_mrecip_low_precision_sqrt));
11734 }
11735
11736 /* Function to decide when to use the approximate reciprocal square root
11737 builtin. */
11738
11739 static tree
11740 aarch64_builtin_reciprocal (tree fndecl)
11741 {
11742 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11743
11744 if (!use_rsqrt_p (mode))
11745 return NULL_TREE;
11746 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11747 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11748 switch (code & AARCH64_BUILTIN_CLASS)
11749 {
11750 case AARCH64_BUILTIN_GENERAL:
11751 return aarch64_general_builtin_rsqrt (subcode);
11752 }
11753 gcc_unreachable ();
11754 }
11755
11756 /* Emit instruction sequence to compute either the approximate square root
11757 or its approximate reciprocal, depending on the flag RECP, and return
11758 whether the sequence was emitted or not. */
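/* Sketch of the scheme used below: FRSQRTE gives an initial estimate
   of 1/sqrt(d), and each FRSQRTS step computes (3 - d * x * x) / 2,
   so one Newton-Raphson iteration is roughly
   x_{n+1} = x_n * (3 - d * x_n^2) / 2; two iterations are used for SF
   and three for DF. */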
11759
11760 bool
11761 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11762 {
11763 machine_mode mode = GET_MODE (dst);
11764
11765 if (GET_MODE_INNER (mode) == HFmode)
11766 {
11767 gcc_assert (!recp);
11768 return false;
11769 }
11770
11771 if (!recp)
11772 {
11773 if (!(flag_mlow_precision_sqrt
11774 || (aarch64_tune_params.approx_modes->sqrt
11775 & AARCH64_APPROX_MODE (mode))))
11776 return false;
11777
11778 if (flag_finite_math_only
11779 || flag_trapping_math
11780 || !flag_unsafe_math_optimizations
11781 || optimize_function_for_size_p (cfun))
11782 return false;
11783 }
11784 else
11785 /* Caller assumes we cannot fail. */
11786 gcc_assert (use_rsqrt_p (mode));
11787
11788 machine_mode mmsk = mode_for_int_vector (mode).require ();
11789 rtx xmsk = gen_reg_rtx (mmsk);
11790 if (!recp)
11791 /* When calculating the approximate square root, compare the
11792 argument with 0.0 and create a mask. */
11793 emit_insn (gen_rtx_SET (xmsk,
11794 gen_rtx_NEG (mmsk,
11795 gen_rtx_EQ (mmsk, src,
11796 CONST0_RTX (mode)))));
11797
11798 /* Estimate the approximate reciprocal square root. */
11799 rtx xdst = gen_reg_rtx (mode);
11800 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11801
11802 /* Iterate over the series twice for SF and thrice for DF. */
11803 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11804
11805 /* Optionally iterate over the series once less for faster performance
11806 at the cost of some accuracy. */
11807 if ((recp && flag_mrecip_low_precision_sqrt)
11808 || (!recp && flag_mlow_precision_sqrt))
11809 iterations--;
11810
11811 /* Iterate over the series to calculate the approximate reciprocal square
11812 root. */
11813 rtx x1 = gen_reg_rtx (mode);
11814 while (iterations--)
11815 {
11816 rtx x2 = gen_reg_rtx (mode);
11817 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11818
11819 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11820
11821 if (iterations > 0)
11822 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11823 }
11824
11825 if (!recp)
11826 {
11827 /* Qualify the approximate reciprocal square root when the argument is
11828 0.0 by squashing the intermediary result to 0.0. */
11829 rtx xtmp = gen_reg_rtx (mmsk);
11830 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11831 gen_rtx_SUBREG (mmsk, xdst, 0)));
11832 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11833
11834 /* Calculate the approximate square root. */
11835 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11836 }
11837
11838 /* Finalize the approximation. */
11839 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11840
11841 return true;
11842 }
11843
11844 /* Emit the instruction sequence to compute the approximation for the division
11845 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
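/* Sketch of the scheme used below: FRECPE gives an initial estimate of
   1/DEN, and each FRECPS step computes (2 - d * x), giving the
   Newton-Raphson iteration x_{n+1} = x_n * (2 - d * x_n); the quotient
   is then formed as NUM * (1/DEN). */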
11846
11847 bool
11848 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11849 {
11850 machine_mode mode = GET_MODE (quo);
11851
11852 if (GET_MODE_INNER (mode) == HFmode)
11853 return false;
11854
11855 bool use_approx_division_p = (flag_mlow_precision_div
11856 || (aarch64_tune_params.approx_modes->division
11857 & AARCH64_APPROX_MODE (mode)));
11858
11859 if (!flag_finite_math_only
11860 || flag_trapping_math
11861 || !flag_unsafe_math_optimizations
11862 || optimize_function_for_size_p (cfun)
11863 || !use_approx_division_p)
11864 return false;
11865
11866 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11867 return false;
11868
11869 /* Estimate the approximate reciprocal. */
11870 rtx xrcp = gen_reg_rtx (mode);
11871 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11872
11873 /* Iterate over the series twice for SF and thrice for DF. */
11874 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11875
11876 /* Optionally iterate over the series once less for faster performance,
11877 at the cost of some accuracy. */
11878 if (flag_mlow_precision_div)
11879 iterations--;
11880
11881 /* Iterate over the series to calculate the approximate reciprocal. */
11882 rtx xtmp = gen_reg_rtx (mode);
11883 while (iterations--)
11884 {
11885 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11886
11887 if (iterations > 0)
11888 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11889 }
11890
11891 if (num != CONST1_RTX (mode))
11892 {
11893 /* As the approximate reciprocal of DEN is already calculated, only
11894 calculate the approximate division when NUM is not 1.0. */
11895 rtx xnum = force_reg (mode, num);
11896 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11897 }
11898
11899 /* Finalize the approximation. */
11900 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11901 return true;
11902 }
11903
11904 /* Return the number of instructions that can be issued per cycle. */
11905 static int
11906 aarch64_sched_issue_rate (void)
11907 {
11908 return aarch64_tune_params.issue_rate;
11909 }
11910
11911 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11912 static int
11913 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
11914 {
11915 if (DEBUG_INSN_P (insn))
11916 return more;
11917
11918 rtx_code code = GET_CODE (PATTERN (insn));
11919 if (code == USE || code == CLOBBER)
11920 return more;
11921
11922 if (get_attr_type (insn) == TYPE_NO_INSN)
11923 return more;
11924
11925 return more - 1;
11926 }
11927
11928 static int
11929 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11930 {
11931 int issue_rate = aarch64_sched_issue_rate ();
11932
11933 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11934 }
11935
11936
11937 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11938 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11939 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11940
11941 static int
11942 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11943 int ready_index)
11944 {
11945 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11946 }
11947
11948
11949 /* Vectorizer cost model target hooks. */
11950
11951 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11952 static int
11953 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11954 tree vectype,
11955 int misalign ATTRIBUTE_UNUSED)
11956 {
11957 unsigned elements;
11958 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11959 bool fp = false;
11960
11961 if (vectype != NULL)
11962 fp = FLOAT_TYPE_P (vectype);
11963
11964 switch (type_of_cost)
11965 {
11966 case scalar_stmt:
11967 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11968
11969 case scalar_load:
11970 return costs->scalar_load_cost;
11971
11972 case scalar_store:
11973 return costs->scalar_store_cost;
11974
11975 case vector_stmt:
11976 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11977
11978 case vector_load:
11979 return costs->vec_align_load_cost;
11980
11981 case vector_store:
11982 return costs->vec_store_cost;
11983
11984 case vec_to_scalar:
11985 return costs->vec_to_scalar_cost;
11986
11987 case scalar_to_vec:
11988 return costs->scalar_to_vec_cost;
11989
11990 case unaligned_load:
11991 case vector_gather_load:
11992 return costs->vec_unalign_load_cost;
11993
11994 case unaligned_store:
11995 case vector_scatter_store:
11996 return costs->vec_unalign_store_cost;
11997
11998 case cond_branch_taken:
11999 return costs->cond_taken_branch_cost;
12000
12001 case cond_branch_not_taken:
12002 return costs->cond_not_taken_branch_cost;
12003
12004 case vec_perm:
12005 return costs->vec_permute_cost;
12006
12007 case vec_promote_demote:
12008 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12009
12010 case vec_construct:
12011 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12012 return elements / 2 + 1;
12013
12014 default:
12015 gcc_unreachable ();
12016 }
12017 }
12018
12019 /* Implement targetm.vectorize.add_stmt_cost. */
12020 static unsigned
12021 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
12022 struct _stmt_vec_info *stmt_info, int misalign,
12023 enum vect_cost_model_location where)
12024 {
12025 unsigned *cost = (unsigned *) data;
12026 unsigned retval = 0;
12027
12028 if (flag_vect_cost_model)
12029 {
12030 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
12031 int stmt_cost =
12032 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
12033
12034 /* Statements in an inner loop relative to the loop being
12035 vectorized are weighted more heavily. The value here is
12036 arbitrary and could potentially be improved with analysis. */
12037 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
12038 count *= 50; /* FIXME */
12039
12040 retval = (unsigned) (count * stmt_cost);
12041 cost[where] += retval;
12042 }
12043
12044 return retval;
12045 }
12046
12047 static void initialize_aarch64_code_model (struct gcc_options *);
12048
12049 /* Parse the TO_PARSE string and put the architecture struct that it
12050 selects into RES and the architectural features into ISA_FLAGS.
12051 Return an aarch64_parse_opt_result describing the parse result.
12052 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12053 When the TO_PARSE string contains an invalid extension,
12054 a copy of the string is created and stored to INVALID_EXTENSION. */
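/* For example (illustrative only), a TO_PARSE string of
   "armv8.2-a+crypto" selects the armv8.2-a entry from
   all_architectures and then folds the "+crypto" suffix into
   ISA_FLAGS via aarch64_parse_extension. */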
12055
12056 static enum aarch64_parse_opt_result
12057 aarch64_parse_arch (const char *to_parse, const struct processor **res,
12058 uint64_t *isa_flags, std::string *invalid_extension)
12059 {
12060 const char *ext;
12061 const struct processor *arch;
12062 size_t len;
12063
12064 ext = strchr (to_parse, '+');
12065
12066 if (ext != NULL)
12067 len = ext - to_parse;
12068 else
12069 len = strlen (to_parse);
12070
12071 if (len == 0)
12072 return AARCH64_PARSE_MISSING_ARG;
12073
12074
12075 /* Loop through the list of supported ARCHes to find a match. */
12076 for (arch = all_architectures; arch->name != NULL; arch++)
12077 {
12078 if (strlen (arch->name) == len
12079 && strncmp (arch->name, to_parse, len) == 0)
12080 {
12081 uint64_t isa_temp = arch->flags;
12082
12083 if (ext != NULL)
12084 {
12085 /* TO_PARSE string contains at least one extension. */
12086 enum aarch64_parse_opt_result ext_res
12087 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12088
12089 if (ext_res != AARCH64_PARSE_OK)
12090 return ext_res;
12091 }
12092 /* Extension parsing was successful. Record the resulting
12093 arch and ISA flags. */
12094 *res = arch;
12095 *isa_flags = isa_temp;
12096 return AARCH64_PARSE_OK;
12097 }
12098 }
12099
12100 /* ARCH name not found in list. */
12101 return AARCH64_PARSE_INVALID_ARG;
12102 }
12103
12104 /* Parse the TO_PARSE string and put the result tuning in RES and the
12105 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12106 describing the parse result. If there is an error parsing, RES and
12107 ISA_FLAGS are left unchanged.
12108 When the TO_PARSE string contains an invalid extension,
12109 a copy of the string is created and stored to INVALID_EXTENSION. */
12110
12111 static enum aarch64_parse_opt_result
12112 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12113 uint64_t *isa_flags, std::string *invalid_extension)
12114 {
12115 const char *ext;
12116 const struct processor *cpu;
12117 size_t len;
12118
12119 ext = strchr (to_parse, '+');
12120
12121 if (ext != NULL)
12122 len = ext - to_parse;
12123 else
12124 len = strlen (to_parse);
12125
12126 if (len == 0)
12127 return AARCH64_PARSE_MISSING_ARG;
12128
12129
12130 /* Loop through the list of supported CPUs to find a match. */
12131 for (cpu = all_cores; cpu->name != NULL; cpu++)
12132 {
12133 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12134 {
12135 uint64_t isa_temp = cpu->flags;
12136
12137
12138 if (ext != NULL)
12139 {
12140 /* TO_PARSE string contains at least one extension. */
12141 enum aarch64_parse_opt_result ext_res
12142 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12143
12144 if (ext_res != AARCH64_PARSE_OK)
12145 return ext_res;
12146 }
12147 /* Extension parsing was successful. Record the resulting
12148 cpu and ISA flags. */
12149 *res = cpu;
12150 *isa_flags = isa_temp;
12151 return AARCH64_PARSE_OK;
12152 }
12153 }
12154
12155 /* CPU name not found in list. */
12156 return AARCH64_PARSE_INVALID_ARG;
12157 }
12158
12159 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12160 Return an aarch64_parse_opt_result describing the parse result.
12161 If the parsing fails the RES does not change. */
12162
12163 static enum aarch64_parse_opt_result
12164 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12165 {
12166 const struct processor *cpu;
12167
12168 /* Loop through the list of supported CPUs to find a match. */
12169 for (cpu = all_cores; cpu->name != NULL; cpu++)
12170 {
12171 if (strcmp (cpu->name, to_parse) == 0)
12172 {
12173 *res = cpu;
12174 return AARCH64_PARSE_OK;
12175 }
12176 }
12177
12178 /* CPU name not found in list. */
12179 return AARCH64_PARSE_INVALID_ARG;
12180 }
12181
12182 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12183 described in FLAG. If it is, return the index bit for that fusion type.
12184 If not, report an error (printing OPTION_NAME) and return zero. */
12185
12186 static unsigned int
12187 aarch64_parse_one_option_token (const char *token,
12188 size_t length,
12189 const struct aarch64_flag_desc *flag,
12190 const char *option_name)
12191 {
12192 for (; flag->name != NULL; flag++)
12193 {
12194 if (length == strlen (flag->name)
12195 && !strncmp (flag->name, token, length))
12196 return flag->flag;
12197 }
12198
12199 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12200 return 0;
12201 }
12202
12203 /* Parse OPTION which is a comma-separated list of flags to enable.
12204 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12205 default state we inherit from the CPU tuning structures. OPTION_NAME
12206 gives the top-level option we are parsing in the -moverride string,
12207 for use in error messages. */
12208
12209 static unsigned int
12210 aarch64_parse_boolean_options (const char *option,
12211 const struct aarch64_flag_desc *flags,
12212 unsigned int initial_state,
12213 const char *option_name)
12214 {
12215 const char separator = '.';
12216 const char* specs = option;
12217 const char* ntoken = option;
12218 unsigned int found_flags = initial_state;
12219
12220 while ((ntoken = strchr (specs, separator)))
12221 {
12222 size_t token_length = ntoken - specs;
12223 unsigned token_ops = aarch64_parse_one_option_token (specs,
12224 token_length,
12225 flags,
12226 option_name);
12227 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12228 in the token stream, reset the supported operations. So:
12229
12230 adrp+add.cmp+branch.none.adrp+add
12231
12232 would have the result of turning on only adrp+add fusion. */
12233 if (!token_ops)
12234 found_flags = 0;
12235
12236 found_flags |= token_ops;
12237 specs = ++ntoken;
12238 }
12239
12240 /* The string ended with a trailing separator, so it is ill-formed. */
12241 if (!(*specs))
12242 {
12243 error ("%s string ill-formed\n", option_name);
12244 return 0;
12245 }
12246
12247 /* We still have one more token to parse. */
12248 size_t token_length = strlen (specs);
12249 unsigned token_ops = aarch64_parse_one_option_token (specs,
12250 token_length,
12251 flags,
12252 option_name);
12253 if (!token_ops)
12254 found_flags = 0;
12255
12256 found_flags |= token_ops;
12257 return found_flags;
12258 }
12259
12260 /* Support for overriding instruction fusion. */
12261
12262 static void
12263 aarch64_parse_fuse_string (const char *fuse_string,
12264 struct tune_params *tune)
12265 {
12266 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12267 aarch64_fusible_pairs,
12268 tune->fusible_ops,
12269 "fuse=");
12270 }
12271
12272 /* Support for overriding other tuning flags. */
12273
12274 static void
12275 aarch64_parse_tune_string (const char *tune_string,
12276 struct tune_params *tune)
12277 {
12278 tune->extra_tuning_flags
12279 = aarch64_parse_boolean_options (tune_string,
12280 aarch64_tuning_flags,
12281 tune->extra_tuning_flags,
12282 "tune=");
12283 }
12284
12285 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
12286 Accept only the valid SVE vector widths allowed by
12287 aarch64_sve_vector_bits_enum and use the value to override sve_width
12288 in TUNE. */
12289
12290 static void
12291 aarch64_parse_sve_width_string (const char *tune_string,
12292 struct tune_params *tune)
12293 {
12294 int width = -1;
12295
12296 int n = sscanf (tune_string, "%d", &width);
12297 if (n == EOF)
12298 {
12299 error ("invalid format for sve_width");
12300 return;
12301 }
12302 switch (width)
12303 {
12304 case SVE_128:
12305 case SVE_256:
12306 case SVE_512:
12307 case SVE_1024:
12308 case SVE_2048:
12309 break;
12310 default:
12311 error ("invalid sve_width value: %d", width);
12312 }
12313 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12314 }
12315
12316 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12317 we understand. If it is, extract the option string and hand it off to
12318 the appropriate function. */
12319
12320 void
12321 aarch64_parse_one_override_token (const char* token,
12322 size_t length,
12323 struct tune_params *tune)
12324 {
12325 const struct aarch64_tuning_override_function *fn
12326 = aarch64_tuning_override_functions;
12327
12328 const char *option_part = strchr (token, '=');
12329 if (!option_part)
12330 {
12331 error ("tuning string missing in option (%s)", token);
12332 return;
12333 }
12334
12335 /* Get the length of the option name. */
12336 length = option_part - token;
12337 /* Skip the '=' to get to the option string. */
12338 option_part++;
12339
12340 for (; fn->name != NULL; fn++)
12341 {
12342 if (!strncmp (fn->name, token, length))
12343 {
12344 fn->parse_override (option_part, tune);
12345 return;
12346 }
12347 }
12348
12349 error ("unknown tuning option (%s)",token);
12350 return;
12351 }
12352
12353 /* Validate and clamp the TLS size according to the code model in OPTS. */
12354
12355 static void
12356 initialize_aarch64_tls_size (struct gcc_options *opts)
12357 {
12358 if (aarch64_tls_size == 0)
12359 aarch64_tls_size = 24;
12360
12361 switch (opts->x_aarch64_cmodel_var)
12362 {
12363 case AARCH64_CMODEL_TINY:
12364 /* Both the default and the maximum TLS size allowed under tiny are 1M,
12365 which needs two instructions to address, so we clamp the size to 24. */
12366 if (aarch64_tls_size > 24)
12367 aarch64_tls_size = 24;
12368 break;
12369 case AARCH64_CMODEL_SMALL:
12370 /* The maximum TLS size allowed under small is 4G. */
12371 if (aarch64_tls_size > 32)
12372 aarch64_tls_size = 32;
12373 break;
12374 case AARCH64_CMODEL_LARGE:
12375 /* The maximum TLS size allowed under large is 16E.
12376 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12377 if (aarch64_tls_size > 48)
12378 aarch64_tls_size = 48;
12379 break;
12380 default:
12381 gcc_unreachable ();
12382 }
12383
12384 return;
12385 }
12386
12387 /* Parse STRING looking for options in the format:
12388 string :: option:string
12389 option :: name=substring
12390 name :: {a-z}
12391 substring :: defined by option. */
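/* For example (illustrative only, using flag names that appear in the
   comments elsewhere in this file):

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   enables two fusion pairs and overrides the assumed SVE width. */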
12392
12393 static void
12394 aarch64_parse_override_string (const char* input_string,
12395 struct tune_params* tune)
12396 {
12397 const char separator = ':';
12398 size_t string_length = strlen (input_string) + 1;
12399 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12400 char *string = string_root;
12401 strncpy (string, input_string, string_length);
12402 string[string_length - 1] = '\0';
12403
12404 char* ntoken = string;
12405
12406 while ((ntoken = strchr (string, separator)))
12407 {
12408 size_t token_length = ntoken - string;
12409 /* Make this substring look like a string. */
12410 *ntoken = '\0';
12411 aarch64_parse_one_override_token (string, token_length, tune);
12412 string = ++ntoken;
12413 }
12414
12415 /* One last option to parse. */
12416 aarch64_parse_one_override_token (string, strlen (string), tune);
12417 free (string_root);
12418 }
12419
12420
12421 static void
12422 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12423 {
12424 if (accepted_branch_protection_string)
12425 {
12426 opts->x_aarch64_branch_protection_string
12427 = xstrdup (accepted_branch_protection_string);
12428 }
12429
12430 /* PR 70044: We have to be careful about being called multiple times for the
12431 same function. This means all changes should be repeatable. */
12432
12433 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12434 Disable the frame pointer flag so the mid-end will not use a frame
12435 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12436 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12437 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12438 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12439 if (opts->x_flag_omit_frame_pointer == 0)
12440 opts->x_flag_omit_frame_pointer = 2;
12441
12442 /* If not optimizing for size, set the default
12443 alignment to what the target wants. */
12444 if (!opts->x_optimize_size)
12445 {
12446 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12447 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12448 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12449 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12450 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12451 opts->x_str_align_functions = aarch64_tune_params.function_align;
12452 }
12453
12454 /* We default to no pc-relative literal loads. */
12455
12456 aarch64_pcrelative_literal_loads = false;
12457
12458 /* If -mpc-relative-literal-loads is set on the command line, this
12459 implies that the user asked for PC relative literal loads. */
12460 if (opts->x_pcrelative_literal_loads == 1)
12461 aarch64_pcrelative_literal_loads = true;
12462
12463 /* In the tiny memory model it makes no sense to disallow PC relative
12464 literal pool loads. */
12465 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12466 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12467 aarch64_pcrelative_literal_loads = true;
12468
12469 /* When enabling the lower precision Newton series for the square root, also
12470 enable it for the reciprocal square root, since the latter is an
12471 intermediary step for the former. */
12472 if (flag_mlow_precision_sqrt)
12473 flag_mrecip_low_precision_sqrt = true;
12474 }
12475
12476 /* 'Unpack' the internal tuning structs and update the options
12477 in OPTS. The caller must have set up selected_tune and selected_arch
12478 as all the other target-specific codegen decisions are
12479 derived from them. */
12480
12481 void
12482 aarch64_override_options_internal (struct gcc_options *opts)
12483 {
12484 aarch64_tune_flags = selected_tune->flags;
12485 aarch64_tune = selected_tune->sched_core;
12486 /* Make a copy of the tuning parameters attached to the core, which
12487 we may later overwrite. */
12488 aarch64_tune_params = *(selected_tune->tune);
12489 aarch64_architecture_version = selected_arch->architecture_version;
12490
12491 if (opts->x_aarch64_override_tune_string)
12492 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12493 &aarch64_tune_params);
12494
12495 /* This target defaults to strict volatile bitfields. */
12496 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12497 opts->x_flag_strict_volatile_bitfields = 1;
12498
12499 if (aarch64_stack_protector_guard == SSP_GLOBAL
12500 && opts->x_aarch64_stack_protector_guard_offset_str)
12501 {
12502 error ("incompatible options %<-mstack-protector-guard=global%> and "
12503 "%<-mstack-protector-guard-offset=%s%>",
12504 aarch64_stack_protector_guard_offset_str);
12505 }
12506
12507 if (aarch64_stack_protector_guard == SSP_SYSREG
12508 && !(opts->x_aarch64_stack_protector_guard_offset_str
12509 && opts->x_aarch64_stack_protector_guard_reg_str))
12510 {
12511 error ("both %<-mstack-protector-guard-offset%> and "
12512 "%<-mstack-protector-guard-reg%> must be used "
12513 "with %<-mstack-protector-guard=sysreg%>");
12514 }
12515
12516 if (opts->x_aarch64_stack_protector_guard_reg_str)
12517 {
12518 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12519 error ("specify a system register with a small string length");
12520 }
12521
12522 if (opts->x_aarch64_stack_protector_guard_offset_str)
12523 {
12524 char *end;
12525 const char *str = aarch64_stack_protector_guard_offset_str;
12526 errno = 0;
12527 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12528 if (!*str || *end || errno)
12529 error ("%qs is not a valid offset in %qs", str,
12530 "-mstack-protector-guard-offset=");
12531 aarch64_stack_protector_guard_offset = offs;
12532 }
12533
12534 initialize_aarch64_code_model (opts);
12535 initialize_aarch64_tls_size (opts);
12536
12537 int queue_depth = 0;
12538 switch (aarch64_tune_params.autoprefetcher_model)
12539 {
12540 case tune_params::AUTOPREFETCHER_OFF:
12541 queue_depth = -1;
12542 break;
12543 case tune_params::AUTOPREFETCHER_WEAK:
12544 queue_depth = 0;
12545 break;
12546 case tune_params::AUTOPREFETCHER_STRONG:
12547 queue_depth = max_insn_queue_index + 1;
12548 break;
12549 default:
12550 gcc_unreachable ();
12551 }
12552
12553 /* We don't mind passing in global_options_set here as we don't use
12554 the *options_set structs anyway. */
12555 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12556 queue_depth,
12557 opts->x_param_values,
12558 global_options_set.x_param_values);
12559
12560 /* Set up parameters to be used in prefetching algorithm. Do not
12561 override the defaults unless we are tuning for a core we have
12562 researched values for. */
12563 if (aarch64_tune_params.prefetch->num_slots > 0)
12564 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12565 aarch64_tune_params.prefetch->num_slots,
12566 opts->x_param_values,
12567 global_options_set.x_param_values);
12568 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12569 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12570 aarch64_tune_params.prefetch->l1_cache_size,
12571 opts->x_param_values,
12572 global_options_set.x_param_values);
12573 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12574 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12575 aarch64_tune_params.prefetch->l1_cache_line_size,
12576 opts->x_param_values,
12577 global_options_set.x_param_values);
12578 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12579 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12580 aarch64_tune_params.prefetch->l2_cache_size,
12581 opts->x_param_values,
12582 global_options_set.x_param_values);
12583 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12584 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12585 0,
12586 opts->x_param_values,
12587 global_options_set.x_param_values);
12588 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12589 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12590 aarch64_tune_params.prefetch->minimum_stride,
12591 opts->x_param_values,
12592 global_options_set.x_param_values);
12593
12594 /* Use the alternative scheduling-pressure algorithm by default. */
12595 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12596 opts->x_param_values,
12597 global_options_set.x_param_values);
12598
12599 /* If the user hasn't changed it via configure then set the default to 64 KB
12600 for the backend. */
12601 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12602 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12603 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12604 opts->x_param_values,
12605 global_options_set.x_param_values);
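  /* Both this parameter and the probing interval below are expressed as a
     power of two, i.e. the base-two logarithm of the size in bytes, so the
     default of 16 used above corresponds to 2^16 bytes = 64 KB, matching
     the comment.  */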
12606
12607 /* Validate the guard size. */
12608 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12609
12610 /* Enforce that the probing interval is the same as the guard size so
12611 that the mid-end does the right thing. */
12612 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12613 guard_size,
12614 opts->x_param_values,
12615 global_options_set.x_param_values);
12616
12617 /* The maybe_set calls won't update the value if the user has explicitly set
12618 one. This means we need to validate that the probing interval and the
12619 guard size are equal. */
12620 int probe_interval
12621 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12622 if (guard_size != probe_interval)
12623 error ("stack clash guard size %<%d%> must be equal to probing interval "
12624 "%<%d%>", guard_size, probe_interval);
12625
12626 /* Enable software prefetching at the specified optimization level for
12627 CPUs that have prefetch. Lower the optimization level threshold by 1
12628 when profiling is enabled. */
12629 if (opts->x_flag_prefetch_loop_arrays < 0
12630 && !opts->x_optimize_size
12631 && aarch64_tune_params.prefetch->default_opt_level >= 0
12632 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12633 opts->x_flag_prefetch_loop_arrays = 1;
12634
12635 if (opts->x_aarch64_arch_string == NULL)
12636 opts->x_aarch64_arch_string = selected_arch->name;
12637 if (opts->x_aarch64_cpu_string == NULL)
12638 opts->x_aarch64_cpu_string = selected_cpu->name;
12639 if (opts->x_aarch64_tune_string == NULL)
12640 opts->x_aarch64_tune_string = selected_tune->name;
12641
12642 aarch64_override_options_after_change_1 (opts);
12643 }
12644
12645 /* Print a hint with a suggestion for a core or architecture name that
12646 most closely resembles what the user passed in STR. ARCH is true if
12647 the user is asking for an architecture name. ARCH is false if the user
12648 is asking for a core name. */
12649
12650 static void
12651 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12652 {
12653 auto_vec<const char *> candidates;
12654 const struct processor *entry = arch ? all_architectures : all_cores;
12655 for (; entry->name != NULL; entry++)
12656 candidates.safe_push (entry->name);
12657
12658 #ifdef HAVE_LOCAL_CPU_DETECT
12659 /* Also add "native" as a possible value. */
12660 if (arch)
12661 candidates.safe_push ("native");
12662 #endif
12663
12664 char *s;
12665 const char *hint = candidates_list_and_hint (str, s, candidates);
12666 if (hint)
12667 inform (input_location, "valid arguments are: %s;"
12668 " did you mean %qs?", s, hint);
12669 else
12670 inform (input_location, "valid arguments are: %s", s);
12671
12672 XDELETEVEC (s);
12673 }
12674
12675 /* Print a hint with a suggestion for a core name that most closely resembles
12676 what the user passed in STR. */
12677
12678 inline static void
12679 aarch64_print_hint_for_core (const char *str)
12680 {
12681 aarch64_print_hint_for_core_or_arch (str, false);
12682 }
12683
12684 /* Print a hint with a suggestion for an architecture name that most closely
12685 resembles what the user passed in STR. */
12686
12687 inline static void
12688 aarch64_print_hint_for_arch (const char *str)
12689 {
12690 aarch64_print_hint_for_core_or_arch (str, true);
12691 }
12692
12693
12694 /* Print a hint with a suggestion for an extension name
12695 that most closely resembles what the user passed in STR. */
12696
12697 void
12698 aarch64_print_hint_for_extensions (const std::string &str)
12699 {
12700 auto_vec<const char *> candidates;
12701 aarch64_get_all_extension_candidates (&candidates);
12702 char *s;
12703 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12704 if (hint)
12705 inform (input_location, "valid arguments are: %s;"
12706 " did you mean %qs?", s, hint);
12707 else
12708 inform (input_location, "valid arguments are: %s", s);
12709
12710 XDELETEVEC (s);
12711 }
12712
12713 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12714 specified in STR and throw errors if appropriate. Put the results,
12715 if they are valid, in RES and ISA_FLAGS. Return whether the option is
12716 valid. */
12717
12718 static bool
12719 aarch64_validate_mcpu (const char *str, const struct processor **res,
12720 uint64_t *isa_flags)
12721 {
12722 std::string invalid_extension;
12723 enum aarch64_parse_opt_result parse_res
12724 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12725
12726 if (parse_res == AARCH64_PARSE_OK)
12727 return true;
12728
12729 switch (parse_res)
12730 {
12731 case AARCH64_PARSE_MISSING_ARG:
12732 error ("missing cpu name in %<-mcpu=%s%>", str);
12733 break;
12734 case AARCH64_PARSE_INVALID_ARG:
12735 error ("unknown value %qs for %<-mcpu%>", str);
12736 aarch64_print_hint_for_core (str);
12737 break;
12738 case AARCH64_PARSE_INVALID_FEATURE:
12739 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12740 invalid_extension.c_str (), str);
12741 aarch64_print_hint_for_extensions (invalid_extension);
12742 break;
12743 default:
12744 gcc_unreachable ();
12745 }
12746
12747 return false;
12748 }
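/* As a concrete example, "-mcpu=cortex-a57+nofp" names a core plus a valid
   feature modifier and parses successfully, whereas a hypothetical
   misspelling such as "-mcpu=cortex-a57+nofpu" takes the
   AARCH64_PARSE_INVALID_FEATURE path above, which reports the bad modifier
   and prints a hint listing valid extension names.  */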
12749
12750 /* Parse CONST_STR for the branch protection features specified in
12751 aarch64_branch_protect_types, and set any global variables required.
12752 Return the parsing result and copy the last processed token from
12753 CONST_STR into LAST_STR so that it can be used for error reporting. */
12754
12755 static enum
12756 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12757 char** last_str)
12758 {
12759 char *str_root = xstrdup (const_str);
12760 char* token_save = NULL;
12761 char *str = strtok_r (str_root, "+", &token_save);
12762 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12763 if (!str)
12764 res = AARCH64_PARSE_MISSING_ARG;
12765 else
12766 {
12767 char *next_str = strtok_r (NULL, "+", &token_save);
12768 /* Reset the branch protection features to their defaults. */
12769 aarch64_handle_no_branch_protection (NULL, NULL);
12770
12771 while (str && res == AARCH64_PARSE_OK)
12772 {
12773 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12774 bool found = false;
12775 /* Search for this type. */
12776 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12777 {
12778 if (strcmp (str, type->name) == 0)
12779 {
12780 found = true;
12781 res = type->handler (str, next_str);
12782 str = next_str;
12783 next_str = strtok_r (NULL, "+", &token_save);
12784 }
12785 else
12786 type++;
12787 }
12788 if (found && res == AARCH64_PARSE_OK)
12789 {
12790 bool found_subtype = true;
12791 /* Loop through each token until we find one that isn't a
12792 subtype. */
12793 while (found_subtype)
12794 {
12795 found_subtype = false;
12796 const aarch64_branch_protect_type *subtype = type->subtypes;
12797 /* Search for the subtype. */
12798 while (str && subtype && subtype->name && !found_subtype
12799 && res == AARCH64_PARSE_OK)
12800 {
12801 if (strcmp (str, subtype->name) == 0)
12802 {
12803 found_subtype = true;
12804 res = subtype->handler (str, next_str);
12805 str = next_str;
12806 next_str = strtok_r (NULL, "+", &token_save);
12807 }
12808 else
12809 subtype++;
12810 }
12811 }
12812 }
12813 else if (!found)
12814 res = AARCH64_PARSE_INVALID_ARG;
12815 }
12816 }
12817 /* Copy the last processed token into the argument to pass it back.
12818 Used by option and attribute validation to print the offending token. */
12819 if (last_str)
12820 {
12821 if (str) strcpy (*last_str, str);
12822 else *last_str = NULL;
12823 }
12824 if (res == AARCH64_PARSE_OK)
12825 {
12826 /* If needed, allocate the accepted string, then copy in const_str.
12827 Used by aarch64_override_options_after_change_1. */
12828 if (!accepted_branch_protection_string)
12829 accepted_branch_protection_string = (char *) xmalloc (
12830 BRANCH_PROTECT_STR_MAX
12831 + 1);
12832 strncpy (accepted_branch_protection_string, const_str,
12833 BRANCH_PROTECT_STR_MAX + 1);
12834 /* Forcibly null-terminate. */
12835 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12836 }
12837 return res;
12838 }
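/* For example, "standard" and "pac-ret+leaf+bti" both parse successfully:
   "pac-ret" matches a top-level type, "leaf" matches one of its subtypes
   and "bti" matches another top-level type, with each handler invoked in
   turn.  A token that matches nothing, such as the hypothetical typo
   "pac-rte", yields AARCH64_PARSE_INVALID_ARG and is passed back through
   LAST_STR for error reporting.  */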
12839
12840 static bool
12841 aarch64_validate_mbranch_protection (const char *const_str)
12842 {
12843 char *str = (char *) xmalloc (strlen (const_str) + 1);
12844 enum aarch64_parse_opt_result res =
12845 aarch64_parse_branch_protection (const_str, &str);
12846 if (res == AARCH64_PARSE_INVALID_ARG)
12847 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12848 else if (res == AARCH64_PARSE_MISSING_ARG)
12849 error ("missing argument for %<-mbranch-protection=%>");
12850 free (str);
12851 return res == AARCH64_PARSE_OK;
12852 }
12853
12854 /* Validate a command-line -march option. Parse the arch and extensions
12855 (if any) specified in STR and throw errors if appropriate. Put the
12856 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12857 option is valid. */
12858
12859 static bool
12860 aarch64_validate_march (const char *str, const struct processor **res,
12861 uint64_t *isa_flags)
12862 {
12863 std::string invalid_extension;
12864 enum aarch64_parse_opt_result parse_res
12865 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12866
12867 if (parse_res == AARCH64_PARSE_OK)
12868 return true;
12869
12870 switch (parse_res)
12871 {
12872 case AARCH64_PARSE_MISSING_ARG:
12873 error ("missing arch name in %<-march=%s%>", str);
12874 break;
12875 case AARCH64_PARSE_INVALID_ARG:
12876 error ("unknown value %qs for %<-march%>", str);
12877 aarch64_print_hint_for_arch (str);
12878 break;
12879 case AARCH64_PARSE_INVALID_FEATURE:
12880 error ("invalid feature modifier %qs in %<-march=%s%>",
12881 invalid_extension.c_str (), str);
12882 aarch64_print_hint_for_extensions (invalid_extension);
12883 break;
12884 default:
12885 gcc_unreachable ();
12886 }
12887
12888 return false;
12889 }
12890
12891 /* Validate a command-line -mtune option. Parse the cpu
12892 specified in STR and throw errors if appropriate. Put the
12893 result, if it is valid, in RES. Return whether the option is
12894 valid. */
12895
12896 static bool
12897 aarch64_validate_mtune (const char *str, const struct processor **res)
12898 {
12899 enum aarch64_parse_opt_result parse_res
12900 = aarch64_parse_tune (str, res);
12901
12902 if (parse_res == AARCH64_PARSE_OK)
12903 return true;
12904
12905 switch (parse_res)
12906 {
12907 case AARCH64_PARSE_MISSING_ARG:
12908 error ("missing cpu name in %<-mtune=%s%>", str);
12909 break;
12910 case AARCH64_PARSE_INVALID_ARG:
12911 error ("unknown value %qs for %<-mtune%>", str);
12912 aarch64_print_hint_for_core (str);
12913 break;
12914 default:
12915 gcc_unreachable ();
12916 }
12917 return false;
12918 }
12919
12920 /* Return the CPU corresponding to the enum CPU.
12921 If it doesn't specify a cpu, return the default. */
12922
12923 static const struct processor *
12924 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12925 {
12926 if (cpu != aarch64_none)
12927 return &all_cores[cpu];
12928
12929 /* The & 0x3f is to extract the bottom 6 bits that encode the
12930 default cpu as selected by the --with-cpu GCC configure option
12931 in config.gcc.
12932 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12933 flags mechanism should be reworked to make it more sane. */
12934 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12935 }
12936
12937 /* Return the architecture corresponding to the enum ARCH.
12938 If it doesn't specify a valid architecture, return the default. */
12939
12940 static const struct processor *
12941 aarch64_get_arch (enum aarch64_arch arch)
12942 {
12943 if (arch != aarch64_no_arch)
12944 return &all_architectures[arch];
12945
12946 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12947
12948 return &all_architectures[cpu->arch];
12949 }
12950
12951 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12952
12953 static poly_uint16
12954 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12955 {
12956 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12957 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12958 deciding which .md file patterns to use and when deciding whether
12959 something is a legitimate address or constant. */
12960 if (value == SVE_SCALABLE || value == SVE_128)
12961 return poly_uint16 (2, 2);
12962 else
12963 return (int) value / 64;
12964 }
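/* Worked examples of the mapping above: -msve-vector-bits=256 gives
   256 / 64 = 4, i.e. a fixed VG of four 64-bit granules (32-byte vectors),
   and -msve-vector-bits=512 gives VG 8, while both "scalable" and 128
   give the length-agnostic poly_uint16 (2, 2).  */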
12965
12966 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12967 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
12968 tuning structs. In particular it must set selected_tune and
12969 aarch64_isa_flags that define the available ISA features and tuning
12970 decisions. It must also set selected_arch as this will be used to
12971 output the .arch asm tags for each function. */
12972
12973 static void
12974 aarch64_override_options (void)
12975 {
12976 uint64_t cpu_isa = 0;
12977 uint64_t arch_isa = 0;
12978 aarch64_isa_flags = 0;
12979
12980 bool valid_cpu = true;
12981 bool valid_tune = true;
12982 bool valid_arch = true;
12983
12984 selected_cpu = NULL;
12985 selected_arch = NULL;
12986 selected_tune = NULL;
12987
12988 if (aarch64_branch_protection_string)
12989 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12990
12991 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12992 If either of -march or -mtune is given, they override their
12993 respective component of -mcpu. */
12994 if (aarch64_cpu_string)
12995 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12996 &cpu_isa);
12997
12998 if (aarch64_arch_string)
12999 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13000 &arch_isa);
13001
13002 if (aarch64_tune_string)
13003 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13004
13005 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13006 SUBTARGET_OVERRIDE_OPTIONS;
13007 #endif
13008
13009 /* If the user did not specify a processor, choose the default
13010 one for them. This will be the CPU set during configuration using
13011 --with-cpu, otherwise it is "generic". */
13012 if (!selected_cpu)
13013 {
13014 if (selected_arch)
13015 {
13016 selected_cpu = &all_cores[selected_arch->ident];
13017 aarch64_isa_flags = arch_isa;
13018 explicit_arch = selected_arch->arch;
13019 }
13020 else
13021 {
13022 /* Get default configure-time CPU. */
13023 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
13024 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
13025 }
13026
13027 if (selected_tune)
13028 explicit_tune_core = selected_tune->ident;
13029 }
13030 /* If both -mcpu and -march are specified check that they are architecturally
13031 compatible, warn if they're not and prefer the -march ISA flags. */
13032 else if (selected_arch)
13033 {
13034 if (selected_arch->arch != selected_cpu->arch)
13035 {
13036 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13037 all_architectures[selected_cpu->arch].name,
13038 selected_arch->name);
13039 }
13040 aarch64_isa_flags = arch_isa;
13041 explicit_arch = selected_arch->arch;
13042 explicit_tune_core = selected_tune ? selected_tune->ident
13043 : selected_cpu->ident;
13044 }
13045 else
13046 {
13047 /* -mcpu but no -march. */
13048 aarch64_isa_flags = cpu_isa;
13049 explicit_tune_core = selected_tune ? selected_tune->ident
13050 : selected_cpu->ident;
13051 gcc_assert (selected_cpu);
13052 selected_arch = &all_architectures[selected_cpu->arch];
13053 explicit_arch = selected_arch->arch;
13054 }
13055
13056 /* Set the arch as well, as we will need it when outputting
13057 the .arch directive in assembly. */
13058 if (!selected_arch)
13059 {
13060 gcc_assert (selected_cpu);
13061 selected_arch = &all_architectures[selected_cpu->arch];
13062 }
13063
13064 if (!selected_tune)
13065 selected_tune = selected_cpu;
13066
13067 if (aarch64_enable_bti == 2)
13068 {
13069 #ifdef TARGET_ENABLE_BTI
13070 aarch64_enable_bti = 1;
13071 #else
13072 aarch64_enable_bti = 0;
13073 #endif
13074 }
13075
13076 /* Return address signing is currently not supported for ILP32 targets. For
13077 LP64 targets use the configured option in the absence of a command-line
13078 option for -mbranch-protection. */
13079 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
13080 {
13081 #ifdef TARGET_ENABLE_PAC_RET
13082 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13083 #else
13084 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13085 #endif
13086 }
13087
13088 #ifndef HAVE_AS_MABI_OPTION
13089 /* The compiler may have been configured with 2.23.* binutils, which does
13090 not have support for ILP32. */
13091 if (TARGET_ILP32)
13092 error ("assembler does not support %<-mabi=ilp32%>");
13093 #endif
13094
13095 /* Convert -msve-vector-bits to a VG count. */
13096 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13097
13098 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13099 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13100
13101 /* Make sure we properly set up the explicit options. */
13102 if ((aarch64_cpu_string && valid_cpu)
13103 || (aarch64_tune_string && valid_tune))
13104 gcc_assert (explicit_tune_core != aarch64_none);
13105
13106 if ((aarch64_cpu_string && valid_cpu)
13107 || (aarch64_arch_string && valid_arch))
13108 gcc_assert (explicit_arch != aarch64_no_arch);
13109
13110 /* The pass to insert speculation tracking runs before
13111 shrink-wrapping and the latter does not know how to update the
13112 tracking status. So disable it in this case. */
13113 if (aarch64_track_speculation)
13114 flag_shrink_wrap = 0;
13115
13116 aarch64_override_options_internal (&global_options);
13117
13118 /* Save these options as the default ones in case we push and pop them later
13119 while processing functions with potential target attributes. */
13120 target_option_default_node = target_option_current_node
13121 = build_target_option_node (&global_options);
13122 }
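/* To make the precedence above concrete: "-mcpu=cortex-a57" on its own
   selects both Cortex-A57 tuning and its Armv8-A architecture; adding
   "-mtune=cortex-a72" keeps the -mcpu architecture but overrides the
   tuning; and combining "-mcpu=cortex-a57" with "-march=armv8.2-a" keeps
   the -march ISA flags and emits the architectural-mismatch warning
   above.  */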
13123
13124 /* Implement targetm.override_options_after_change. */
13125
13126 static void
13127 aarch64_override_options_after_change (void)
13128 {
13129 aarch64_override_options_after_change_1 (&global_options);
13130 }
13131
13132 static struct machine_function *
13133 aarch64_init_machine_status (void)
13134 {
13135 struct machine_function *machine;
13136 machine = ggc_cleared_alloc<machine_function> ();
13137 return machine;
13138 }
13139
13140 void
13141 aarch64_init_expanders (void)
13142 {
13143 init_machine_status = aarch64_init_machine_status;
13144 }
13145
13146 /* Initialize aarch64_cmodel from OPTS, checking that the requested code model is compatible with PIC. */
13147 static void
13148 initialize_aarch64_code_model (struct gcc_options *opts)
13149 {
13150 if (opts->x_flag_pic)
13151 {
13152 switch (opts->x_aarch64_cmodel_var)
13153 {
13154 case AARCH64_CMODEL_TINY:
13155 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13156 break;
13157 case AARCH64_CMODEL_SMALL:
13158 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13159 aarch64_cmodel = (flag_pic == 2
13160 ? AARCH64_CMODEL_SMALL_PIC
13161 : AARCH64_CMODEL_SMALL_SPIC);
13162 #else
13163 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13164 #endif
13165 break;
13166 case AARCH64_CMODEL_LARGE:
13167 sorry ("code model %qs with %<-f%s%>", "large",
13168 opts->x_flag_pic > 1 ? "PIC" : "pic");
13169 break;
13170 default:
13171 gcc_unreachable ();
13172 }
13173 }
13174 else
13175 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13176 }
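/* For example, with the default small code model, -fpic (flag_pic == 1)
   selects AARCH64_CMODEL_SMALL_SPIC and -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC, provided the assembler supports the small PIC
   relocations; requesting the large code model together with -fpic or
   -fPIC is rejected with the "sorry" above.  */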
13177
13178 /* Implement TARGET_OPTION_SAVE. */
13179
13180 static void
13181 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13182 {
13183 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13184 ptr->x_aarch64_branch_protection_string
13185 = opts->x_aarch64_branch_protection_string;
13186 }
13187
13188 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13189 using the information saved in PTR. */
13190
13191 static void
13192 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13193 {
13194 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13195 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13196 opts->x_explicit_arch = ptr->x_explicit_arch;
13197 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13198 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13199 opts->x_aarch64_branch_protection_string
13200 = ptr->x_aarch64_branch_protection_string;
13201 if (opts->x_aarch64_branch_protection_string)
13202 {
13203 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13204 NULL);
13205 }
13206
13207 aarch64_override_options_internal (opts);
13208 }
13209
13210 /* Implement TARGET_OPTION_PRINT. */
13211
13212 static void
13213 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13214 {
13215 const struct processor *cpu
13216 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13217 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13218 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13219 std::string extension
13220 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13221
13222 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13223 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13224 arch->name, extension.c_str ());
13225 }
13226
13227 static GTY(()) tree aarch64_previous_fndecl;
13228
13229 void
13230 aarch64_reset_previous_fndecl (void)
13231 {
13232 aarch64_previous_fndecl = NULL;
13233 }
13234
13235 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13236 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13237 make sure optab availability predicates are recomputed when necessary. */
13238
13239 void
13240 aarch64_save_restore_target_globals (tree new_tree)
13241 {
13242 if (TREE_TARGET_GLOBALS (new_tree))
13243 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13244 else if (new_tree == target_option_default_node)
13245 restore_target_globals (&default_target_globals);
13246 else
13247 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13248 }
13249
13250 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13251 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13252 of the function, if such exists. This function may be called multiple
13253 times on a single function so use aarch64_previous_fndecl to avoid
13254 setting up identical state. */
13255
13256 static void
13257 aarch64_set_current_function (tree fndecl)
13258 {
13259 if (!fndecl || fndecl == aarch64_previous_fndecl)
13260 return;
13261
13262 tree old_tree = (aarch64_previous_fndecl
13263 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13264 : NULL_TREE);
13265
13266 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13267
13268 /* If current function has no attributes but the previous one did,
13269 use the default node. */
13270 if (!new_tree && old_tree)
13271 new_tree = target_option_default_node;
13272
13273 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13274 the default have been handled by aarch64_save_restore_target_globals from
13275 aarch64_pragma_target_parse. */
13276 if (old_tree == new_tree)
13277 return;
13278
13279 aarch64_previous_fndecl = fndecl;
13280
13281 /* First set the target options. */
13282 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13283
13284 aarch64_save_restore_target_globals (new_tree);
13285 }
13286
13287 /* Enum describing the various ways we can handle attributes.
13288 In many cases we can reuse the generic option handling machinery. */
13289
13290 enum aarch64_attr_opt_type
13291 {
13292 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13293 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13294 aarch64_attr_enum, /* Attribute sets an enum variable. */
13295 aarch64_attr_custom /* Attribute requires a custom handling function. */
13296 };
13297
13298 /* All the information needed to handle a target attribute.
13299 NAME is the name of the attribute.
13300 ATTR_TYPE specifies the type of behavior of the attribute as described
13301 in the definition of enum aarch64_attr_opt_type.
13302 ALLOW_NEG is true if the attribute supports a "no-" form.
13303 HANDLER is the function that takes the attribute string as an argument.
13304 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13305 OPT_NUM is the enum specifying the option that the attribute modifies.
13306 This is needed for attributes that mirror the behavior of a command-line
13307 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13308 aarch64_attr_enum. */
13309
13310 struct aarch64_attribute_info
13311 {
13312 const char *name;
13313 enum aarch64_attr_opt_type attr_type;
13314 bool allow_neg;
13315 bool (*handler) (const char *);
13316 enum opt_code opt_num;
13317 };
13318
13319 /* Handle the ARCH_STR argument to the arch= target attribute. */
13320
13321 static bool
13322 aarch64_handle_attr_arch (const char *str)
13323 {
13324 const struct processor *tmp_arch = NULL;
13325 std::string invalid_extension;
13326 enum aarch64_parse_opt_result parse_res
13327 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13328
13329 if (parse_res == AARCH64_PARSE_OK)
13330 {
13331 gcc_assert (tmp_arch);
13332 selected_arch = tmp_arch;
13333 explicit_arch = selected_arch->arch;
13334 return true;
13335 }
13336
13337 switch (parse_res)
13338 {
13339 case AARCH64_PARSE_MISSING_ARG:
13340 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13341 break;
13342 case AARCH64_PARSE_INVALID_ARG:
13343 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13344 aarch64_print_hint_for_arch (str);
13345 break;
13346 case AARCH64_PARSE_INVALID_FEATURE:
13347 error ("invalid feature modifier %s of value (\"%s\") in "
13348 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13349 aarch64_print_hint_for_extensions (invalid_extension);
13350 break;
13351 default:
13352 gcc_unreachable ();
13353 }
13354
13355 return false;
13356 }
13357
13358 /* Handle the argument CPU_STR to the cpu= target attribute. */
13359
13360 static bool
13361 aarch64_handle_attr_cpu (const char *str)
13362 {
13363 const struct processor *tmp_cpu = NULL;
13364 std::string invalid_extension;
13365 enum aarch64_parse_opt_result parse_res
13366 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13367
13368 if (parse_res == AARCH64_PARSE_OK)
13369 {
13370 gcc_assert (tmp_cpu);
13371 selected_tune = tmp_cpu;
13372 explicit_tune_core = selected_tune->ident;
13373
13374 selected_arch = &all_architectures[tmp_cpu->arch];
13375 explicit_arch = selected_arch->arch;
13376 return true;
13377 }
13378
13379 switch (parse_res)
13380 {
13381 case AARCH64_PARSE_MISSING_ARG:
13382 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13383 break;
13384 case AARCH64_PARSE_INVALID_ARG:
13385 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13386 aarch64_print_hint_for_core (str);
13387 break;
13388 case AARCH64_PARSE_INVALID_FEATURE:
13389 error ("invalid feature modifier %s of value (\"%s\") in "
13390 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13391 aarch64_print_hint_for_extensions (invalid_extension);
13392 break;
13393 default:
13394 gcc_unreachable ();
13395 }
13396
13397 return false;
13398 }
13399
13400 /* Handle the argument STR to the branch-protection= attribute. */
13401
13402 static bool
13403 aarch64_handle_attr_branch_protection (const char* str)
13404 {
13405 char *err_str = (char *) xmalloc (strlen (str) + 1);
13406 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13407 &err_str);
13408 bool success = false;
13409 switch (res)
13410 {
13411 case AARCH64_PARSE_MISSING_ARG:
13412 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13413 " attribute");
13414 break;
13415 case AARCH64_PARSE_INVALID_ARG:
13416 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13417 "=\")%> pragma or attribute", err_str);
13418 break;
13419 case AARCH64_PARSE_OK:
13420 success = true;
13421 /* Fall through. */
13422 case AARCH64_PARSE_INVALID_FEATURE:
13423 break;
13424 default:
13425 gcc_unreachable ();
13426 }
13427 free (err_str);
13428 return success;
13429 }
13430
13431 /* Handle the argument STR to the tune= target attribute. */
13432
13433 static bool
13434 aarch64_handle_attr_tune (const char *str)
13435 {
13436 const struct processor *tmp_tune = NULL;
13437 enum aarch64_parse_opt_result parse_res
13438 = aarch64_parse_tune (str, &tmp_tune);
13439
13440 if (parse_res == AARCH64_PARSE_OK)
13441 {
13442 gcc_assert (tmp_tune);
13443 selected_tune = tmp_tune;
13444 explicit_tune_core = selected_tune->ident;
13445 return true;
13446 }
13447
13448 switch (parse_res)
13449 {
13450 case AARCH64_PARSE_INVALID_ARG:
13451 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13452 aarch64_print_hint_for_core (str);
13453 break;
13454 default:
13455 gcc_unreachable ();
13456 }
13457
13458 return false;
13459 }
13460
13461 /* Parse an architecture extensions target attribute string specified in STR.
13462 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13463 if successful. Update aarch64_isa_flags to reflect the ISA features
13464 modified. */
13465
13466 static bool
13467 aarch64_handle_attr_isa_flags (char *str)
13468 {
13469 enum aarch64_parse_opt_result parse_res;
13470 uint64_t isa_flags = aarch64_isa_flags;
13471
13472 /* We allow "+nothing" in the beginning to clear out all architectural
13473 features if the user wants to handpick specific features. */
13474 if (strncmp ("+nothing", str, 8) == 0)
13475 {
13476 isa_flags = 0;
13477 str += 8;
13478 }
13479
13480 std::string invalid_extension;
13481 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13482
13483 if (parse_res == AARCH64_PARSE_OK)
13484 {
13485 aarch64_isa_flags = isa_flags;
13486 return true;
13487 }
13488
13489 switch (parse_res)
13490 {
13491 case AARCH64_PARSE_MISSING_ARG:
13492 error ("missing value in %<target()%> pragma or attribute");
13493 break;
13494
13495 case AARCH64_PARSE_INVALID_FEATURE:
13496 error ("invalid feature modifier %s of value (\"%s\") in "
13497 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13498 break;
13499
13500 default:
13501 gcc_unreachable ();
13502 }
13503
13504 return false;
13505 }
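/* For example, the attribute string "+nothing+simd" first clears
   aarch64_isa_flags and then enables Advanced SIMD (together with the
   features it implies), whereas "+nofp" starts from the current flags and
   removes the FP extension along with anything that depends on it.  */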
13506
13507 /* The target attributes that we support. On top of these we also support just
13508 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13509 handled explicitly in aarch64_process_one_target_attr. */
13510
13511 static const struct aarch64_attribute_info aarch64_attributes[] =
13512 {
13513 { "general-regs-only", aarch64_attr_mask, false, NULL,
13514 OPT_mgeneral_regs_only },
13515 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13516 OPT_mfix_cortex_a53_835769 },
13517 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13518 OPT_mfix_cortex_a53_843419 },
13519 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13520 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13521 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13522 OPT_momit_leaf_frame_pointer },
13523 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13524 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13525 OPT_march_ },
13526 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13527 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13528 OPT_mtune_ },
13529 { "branch-protection", aarch64_attr_custom, false,
13530 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13531 { "sign-return-address", aarch64_attr_enum, false, NULL,
13532 OPT_msign_return_address_ },
13533 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13534 };
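/* For instance, the table above accepts uses such as
   __attribute__ ((target ("arch=armv8.2-a+crc"))) through the custom
   "arch" handler, __attribute__ ((target ("no-strict-align"))) through the
   aarch64_attr_mask entry whose allow_neg field is set, and
   __attribute__ ((target ("cmodel=tiny"))) through the aarch64_attr_enum
   machinery.  */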
13535
13536 /* Parse ARG_STR which contains the definition of one target attribute.
13537 Show appropriate errors if any or return true if the attribute is valid. */
13538
13539 static bool
13540 aarch64_process_one_target_attr (char *arg_str)
13541 {
13542 bool invert = false;
13543
13544 size_t len = strlen (arg_str);
13545
13546 if (len == 0)
13547 {
13548 error ("malformed %<target()%> pragma or attribute");
13549 return false;
13550 }
13551
13552 char *str_to_check = (char *) alloca (len + 1);
13553 strcpy (str_to_check, arg_str);
13554
13555 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13556 It is easier to detect and handle it explicitly here rather than going
13557 through the machinery for the rest of the target attributes in this
13558 function. */
13559 if (*str_to_check == '+')
13560 return aarch64_handle_attr_isa_flags (str_to_check);
13561
13562 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13563 {
13564 invert = true;
13565 str_to_check += 3;
13566 }
13567 char *arg = strchr (str_to_check, '=');
13568
13569 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13570 and point ARG to "foo". */
13571 if (arg)
13572 {
13573 *arg = '\0';
13574 arg++;
13575 }
13576 const struct aarch64_attribute_info *p_attr;
13577 bool found = false;
13578 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13579 {
13580 /* If the names don't match up, or the user has given an argument
13581 to an attribute that doesn't accept one, or didn't give an argument
13582 to an attribute that expects one, fail to match. */
13583 if (strcmp (str_to_check, p_attr->name) != 0)
13584 continue;
13585
13586 found = true;
13587 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13588 || p_attr->attr_type == aarch64_attr_enum;
13589
13590 if (attr_need_arg_p ^ (arg != NULL))
13591 {
13592 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13593 return false;
13594 }
13595
13596 /* If the name matches but the attribute does not allow "no-" versions
13597 then we can't match. */
13598 if (invert && !p_attr->allow_neg)
13599 {
13600 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13601 return false;
13602 }
13603
13604 switch (p_attr->attr_type)
13605 {
13606 /* Has a custom handler registered.
13607 For example, cpu=, arch=, tune=. */
13608 case aarch64_attr_custom:
13609 gcc_assert (p_attr->handler);
13610 if (!p_attr->handler (arg))
13611 return false;
13612 break;
13613
13614 /* Either set or unset a boolean option. */
13615 case aarch64_attr_bool:
13616 {
13617 struct cl_decoded_option decoded;
13618
13619 generate_option (p_attr->opt_num, NULL, !invert,
13620 CL_TARGET, &decoded);
13621 aarch64_handle_option (&global_options, &global_options_set,
13622 &decoded, input_location);
13623 break;
13624 }
13625 /* Set or unset a bit in the target_flags. aarch64_handle_option
13626 should know what mask to apply given the option number. */
13627 case aarch64_attr_mask:
13628 {
13629 struct cl_decoded_option decoded;
13630 /* We only need to specify the option number.
13631 aarch64_handle_option will know which mask to apply. */
13632 decoded.opt_index = p_attr->opt_num;
13633 decoded.value = !invert;
13634 aarch64_handle_option (&global_options, &global_options_set,
13635 &decoded, input_location);
13636 break;
13637 }
13638 /* Use the option setting machinery to set an option to an enum. */
13639 case aarch64_attr_enum:
13640 {
13641 gcc_assert (arg);
13642 bool valid;
13643 int value;
13644 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13645 &value, CL_TARGET);
13646 if (valid)
13647 {
13648 set_option (&global_options, NULL, p_attr->opt_num, value,
13649 NULL, DK_UNSPECIFIED, input_location,
13650 global_dc);
13651 }
13652 else
13653 {
13654 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13655 }
13656 break;
13657 }
13658 default:
13659 gcc_unreachable ();
13660 }
13661 }
13662
13663 /* If we reached here we either have found an attribute and validated
13664 it or didn't match any. If we matched an attribute but its arguments
13665 were malformed we will have returned false already. */
13666 return found;
13667 }
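/* Tracing two example inputs through the function above:
   "no-omit-leaf-frame-pointer" sets INVERT, strips the "no-" prefix and
   matches the aarch64_attr_bool entry, so the option is switched off;
   "tune=cortex-a53" is split at '=' into the name "tune" and the argument
   "cortex-a53", which is handed to aarch64_handle_attr_tune.  */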
13668
13669 /* Count how many times the character C appears in
13670 NULL-terminated string STR. */
13671
13672 static unsigned int
13673 num_occurences_in_str (char c, char *str)
13674 {
13675 unsigned int res = 0;
13676 while (*str != '\0')
13677 {
13678 if (*str == c)
13679 res++;
13680
13681 str++;
13682 }
13683
13684 return res;
13685 }
13686
13687 /* Parse the tree in ARGS that contains the target attribute information
13688 and update the global target options space. */
13689
13690 bool
13691 aarch64_process_target_attr (tree args)
13692 {
13693 if (TREE_CODE (args) == TREE_LIST)
13694 {
13695 do
13696 {
13697 tree head = TREE_VALUE (args);
13698 if (head)
13699 {
13700 if (!aarch64_process_target_attr (head))
13701 return false;
13702 }
13703 args = TREE_CHAIN (args);
13704 } while (args);
13705
13706 return true;
13707 }
13708
13709 if (TREE_CODE (args) != STRING_CST)
13710 {
13711 error ("attribute %<target%> argument not a string");
13712 return false;
13713 }
13714
13715 size_t len = strlen (TREE_STRING_POINTER (args));
13716 char *str_to_check = (char *) alloca (len + 1);
13717 strcpy (str_to_check, TREE_STRING_POINTER (args));
13718
13719 if (len == 0)
13720 {
13721 error ("malformed %<target()%> pragma or attribute");
13722 return false;
13723 }
13724
13725 /* Used to catch empty spaces between commas i.e.
13726 attribute ((target ("attr1,,attr2"))). */
13727 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13728
13729 /* Handle multiple target attributes separated by ','. */
13730 char *token = strtok_r (str_to_check, ",", &str_to_check);
13731
13732 unsigned int num_attrs = 0;
13733 while (token)
13734 {
13735 num_attrs++;
13736 if (!aarch64_process_one_target_attr (token))
13737 {
13738 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13739 return false;
13740 }
13741
13742 token = strtok_r (NULL, ",", &str_to_check);
13743 }
13744
13745 if (num_attrs != num_commas + 1)
13746 {
13747 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13748 return false;
13749 }
13750
13751 return true;
13752 }
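/* For example, target ("arch=armv8-a+crc,no-strict-align") is tokenized
   into two comma-separated attributes, each handled by
   aarch64_process_one_target_attr, whereas a stray empty token as in
   "attr1,,attr2" makes the attribute count differ from NUM_COMMAS + 1 and
   is rejected as malformed.  */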
13753
13754 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13755 process attribute ((target ("..."))). */
13756
13757 static bool
13758 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13759 {
13760 struct cl_target_option cur_target;
13761 bool ret;
13762 tree old_optimize;
13763 tree new_target, new_optimize;
13764 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13765
13766 /* If what we're processing is the current pragma string then the
13767 target option node is already stored in target_option_current_node
13768 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13769 having to re-parse the string. This is especially useful to keep
13770 arm_neon.h compile times down since that header contains a lot
13771 of intrinsics enclosed in pragmas. */
13772 if (!existing_target && args == current_target_pragma)
13773 {
13774 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13775 return true;
13776 }
13777 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13778
13779 old_optimize = build_optimization_node (&global_options);
13780 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13781
13782 /* If the function changed the optimization levels as well as setting
13783 target options, start with the optimizations specified. */
13784 if (func_optimize && func_optimize != old_optimize)
13785 cl_optimization_restore (&global_options,
13786 TREE_OPTIMIZATION (func_optimize));
13787
13788 /* Save the current target options to restore at the end. */
13789 cl_target_option_save (&cur_target, &global_options);
13790
13791 /* If fndecl already has some target attributes applied to it, unpack
13792 them so that we add this attribute on top of them, rather than
13793 overwriting them. */
13794 if (existing_target)
13795 {
13796 struct cl_target_option *existing_options
13797 = TREE_TARGET_OPTION (existing_target);
13798
13799 if (existing_options)
13800 cl_target_option_restore (&global_options, existing_options);
13801 }
13802 else
13803 cl_target_option_restore (&global_options,
13804 TREE_TARGET_OPTION (target_option_current_node));
13805
13806 ret = aarch64_process_target_attr (args);
13807
13808 /* Set up any additional state. */
13809 if (ret)
13810 {
13811 aarch64_override_options_internal (&global_options);
13812 /* Initialize SIMD builtins if we haven't already.
13813 Set current_target_pragma to NULL for the duration so that
13814 the builtin initialization code doesn't try to tag the functions
13815 being built with the attributes specified by any current pragma, thus
13816 going into an infinite recursion. */
13817 if (TARGET_SIMD)
13818 {
13819 tree saved_current_target_pragma = current_target_pragma;
13820 current_target_pragma = NULL;
13821 aarch64_init_simd_builtins ();
13822 current_target_pragma = saved_current_target_pragma;
13823 }
13824 new_target = build_target_option_node (&global_options);
13825 }
13826 else
13827 new_target = NULL;
13828
13829 new_optimize = build_optimization_node (&global_options);
13830
13831 if (fndecl && ret)
13832 {
13833 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13834
13835 if (old_optimize != new_optimize)
13836 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13837 }
13838
13839 cl_target_option_restore (&global_options, &cur_target);
13840
13841 if (old_optimize != new_optimize)
13842 cl_optimization_restore (&global_options,
13843 TREE_OPTIMIZATION (old_optimize));
13844 return ret;
13845 }
13846
13847 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13848 tri-bool options (yes, no, don't care) and the default value is
13849 DEF, determine whether to reject inlining. */
13850
13851 static bool
13852 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13853 int dont_care, int def)
13854 {
13855 /* If the callee doesn't care, always allow inlining. */
13856 if (callee == dont_care)
13857 return true;
13858
13859 /* If the caller doesn't care, always allow inlining. */
13860 if (caller == dont_care)
13861 return true;
13862
13863 /* Otherwise, allow inlining if either the callee and caller values
13864 agree, or if the callee is using the default value. */
13865 return (callee == caller || callee == def);
13866 }
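/* Two concrete rows of the resulting truth table, with "don't care"
   encoded as 2: caller == 1, callee == 2 allows inlining because the
   callee doesn't care; caller == 0, callee == 1 with DEF == 0 rejects it,
   since the callee explicitly requests the non-default setting that the
   caller does not use.  */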
13867
13868 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13869 to inline CALLEE into CALLER based on target-specific info.
13870 Make sure that the caller and callee have compatible architectural
13871 features. Then go through the other possible target attributes
13872 and see if they can block inlining. Try not to reject always_inline
13873 callees unless they are incompatible architecturally. */
13874
13875 static bool
13876 aarch64_can_inline_p (tree caller, tree callee)
13877 {
13878 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13879 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13880
13881 struct cl_target_option *caller_opts
13882 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13883 : target_option_default_node);
13884
13885 struct cl_target_option *callee_opts
13886 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13887 : target_option_default_node);
13888
13889 /* Callee's ISA flags should be a subset of the caller's. */
13890 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13891 != callee_opts->x_aarch64_isa_flags)
13892 return false;
13893
13894 /* Allow non-strict aligned functions inlining into strict
13895 aligned ones. */
13896 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13897 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13898 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13899 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13900 return false;
13901
13902 bool always_inline = lookup_attribute ("always_inline",
13903 DECL_ATTRIBUTES (callee));
13904
13905 /* If the architectural features match up and the callee is always_inline
13906 then the other attributes don't matter. */
13907 if (always_inline)
13908 return true;
13909
13910 if (caller_opts->x_aarch64_cmodel_var
13911 != callee_opts->x_aarch64_cmodel_var)
13912 return false;
13913
13914 if (caller_opts->x_aarch64_tls_dialect
13915 != callee_opts->x_aarch64_tls_dialect)
13916 return false;
13917
13918 /* Honour explicit requests to work around errata. */
13919 if (!aarch64_tribools_ok_for_inlining_p (
13920 caller_opts->x_aarch64_fix_a53_err835769,
13921 callee_opts->x_aarch64_fix_a53_err835769,
13922 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13923 return false;
13924
13925 if (!aarch64_tribools_ok_for_inlining_p (
13926 caller_opts->x_aarch64_fix_a53_err843419,
13927 callee_opts->x_aarch64_fix_a53_err843419,
13928 2, TARGET_FIX_ERR_A53_843419))
13929 return false;
13930
13931 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13932 caller and callee and they don't match up, reject inlining. */
13933 if (!aarch64_tribools_ok_for_inlining_p (
13934 caller_opts->x_flag_omit_leaf_frame_pointer,
13935 callee_opts->x_flag_omit_leaf_frame_pointer,
13936 2, 1))
13937 return false;
13938
13939 /* If the callee has specific tuning overrides, respect them. */
13940 if (callee_opts->x_aarch64_override_tune_string != NULL
13941 && caller_opts->x_aarch64_override_tune_string == NULL)
13942 return false;
13943
13944 /* If the user specified tuning override strings for the
13945 caller and callee and they don't match up, reject inlining.
13946 We just do a string compare here, we don't analyze the meaning
13947 of the string, as it would be too costly for little gain. */
13948 if (callee_opts->x_aarch64_override_tune_string
13949 && caller_opts->x_aarch64_override_tune_string
13950 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13951 caller_opts->x_aarch64_override_tune_string) != 0))
13952 return false;
13953
13954 return true;
13955 }
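/* As an illustration of the ISA-subset rule above: a callee carrying
   target ("+sve") cannot be inlined into a caller whose ISA flags lack
   SVE, while a callee with no extra ISA requirements can be inlined into
   an SVE-enabled caller as far as the ISA check is concerned (the later
   checks may still reject it).  */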
13956
13957 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
13958 hasn't been initialized already. */
13959
13960 unsigned int
13961 aarch64_tlsdesc_abi_id ()
13962 {
13963 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
13964 if (!tlsdesc_abi.initialized_p ())
13965 {
13966 HARD_REG_SET full_reg_clobbers;
13967 CLEAR_HARD_REG_SET (full_reg_clobbers);
13968 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
13969 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
13970 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
13971 SET_HARD_REG_BIT (full_reg_clobbers, regno);
13972 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
13973 }
13974 return tlsdesc_abi.id ();
13975 }
13976
13977 /* Return true if SYMBOL_REF X binds locally. */
13978
13979 static bool
13980 aarch64_symbol_binds_local_p (const_rtx x)
13981 {
13982 return (SYMBOL_REF_DECL (x)
13983 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13984 : SYMBOL_REF_LOCAL_P (x));
13985 }
13986
13987 /* Return true if SYMBOL_REF X is thread local. */
13988 static bool
13989 aarch64_tls_symbol_p (rtx x)
13990 {
13991 if (! TARGET_HAVE_TLS)
13992 return false;
13993
13994 if (GET_CODE (x) != SYMBOL_REF)
13995 return false;
13996
13997 return SYMBOL_REF_TLS_MODEL (x) != 0;
13998 }
13999
14000 /* Classify a TLS symbol into one of the TLS kinds. */
14001 enum aarch64_symbol_type
14002 aarch64_classify_tls_symbol (rtx x)
14003 {
14004 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14005
14006 switch (tls_kind)
14007 {
14008 case TLS_MODEL_GLOBAL_DYNAMIC:
14009 case TLS_MODEL_LOCAL_DYNAMIC:
14010 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
14011
14012 case TLS_MODEL_INITIAL_EXEC:
14013 switch (aarch64_cmodel)
14014 {
14015 case AARCH64_CMODEL_TINY:
14016 case AARCH64_CMODEL_TINY_PIC:
14017 return SYMBOL_TINY_TLSIE;
14018 default:
14019 return SYMBOL_SMALL_TLSIE;
14020 }
14021
14022 case TLS_MODEL_LOCAL_EXEC:
14023 if (aarch64_tls_size == 12)
14024 return SYMBOL_TLSLE12;
14025 else if (aarch64_tls_size == 24)
14026 return SYMBOL_TLSLE24;
14027 else if (aarch64_tls_size == 32)
14028 return SYMBOL_TLSLE32;
14029 else if (aarch64_tls_size == 48)
14030 return SYMBOL_TLSLE48;
14031 else
14032 gcc_unreachable ();
14033
14034 case TLS_MODEL_EMULATED:
14035 case TLS_MODEL_NONE:
14036 return SYMBOL_FORCE_TO_MEM;
14037
14038 default:
14039 gcc_unreachable ();
14040 }
14041 }
14042
14043 /* Return the correct method for accessing X + OFFSET, where X is either
14044 a SYMBOL_REF or LABEL_REF. */
14045
14046 enum aarch64_symbol_type
14047 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
14048 {
14049 if (GET_CODE (x) == LABEL_REF)
14050 {
14051 switch (aarch64_cmodel)
14052 {
14053 case AARCH64_CMODEL_LARGE:
14054 return SYMBOL_FORCE_TO_MEM;
14055
14056 case AARCH64_CMODEL_TINY_PIC:
14057 case AARCH64_CMODEL_TINY:
14058 return SYMBOL_TINY_ABSOLUTE;
14059
14060 case AARCH64_CMODEL_SMALL_SPIC:
14061 case AARCH64_CMODEL_SMALL_PIC:
14062 case AARCH64_CMODEL_SMALL:
14063 return SYMBOL_SMALL_ABSOLUTE;
14064
14065 default:
14066 gcc_unreachable ();
14067 }
14068 }
14069
14070 if (GET_CODE (x) == SYMBOL_REF)
14071 {
14072 if (aarch64_tls_symbol_p (x))
14073 return aarch64_classify_tls_symbol (x);
14074
14075 switch (aarch64_cmodel)
14076 {
14077 case AARCH64_CMODEL_TINY:
14078 /* When we retrieve symbol + offset address, we have to make sure
14079 the offset does not cause overflow of the final address. But
14080 we have no way of knowing the address of symbol at compile time
14081 so we can't accurately say if the distance between the PC and
14082 symbol + offset is outside the addressable range of +/-1M in the
14083 TINY code model. So we rely on images not being greater than 1M,
14084 cap the offset at 1M, and require anything beyond that to be
14085 loaded using an alternative mechanism. Furthermore, if the
14086 symbol is a weak reference to something that isn't known to
14087 resolve to a symbol in this module, then force to memory. */
14088 if ((SYMBOL_REF_WEAK (x)
14089 && !aarch64_symbol_binds_local_p (x))
14090 || !IN_RANGE (offset, -1048575, 1048575))
14091 return SYMBOL_FORCE_TO_MEM;
14092 return SYMBOL_TINY_ABSOLUTE;
14093
14094 case AARCH64_CMODEL_SMALL:
14095 /* Same reasoning as the tiny code model, but the offset cap here is
14096 4G. */
14097 if ((SYMBOL_REF_WEAK (x)
14098 && !aarch64_symbol_binds_local_p (x))
14099 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
14100 HOST_WIDE_INT_C (4294967264)))
14101 return SYMBOL_FORCE_TO_MEM;
14102 return SYMBOL_SMALL_ABSOLUTE;
14103
14104 case AARCH64_CMODEL_TINY_PIC:
14105 if (!aarch64_symbol_binds_local_p (x))
14106 return SYMBOL_TINY_GOT;
14107 return SYMBOL_TINY_ABSOLUTE;
14108
14109 case AARCH64_CMODEL_SMALL_SPIC:
14110 case AARCH64_CMODEL_SMALL_PIC:
14111 if (!aarch64_symbol_binds_local_p (x))
14112 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14113 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14114 return SYMBOL_SMALL_ABSOLUTE;
14115
14116 case AARCH64_CMODEL_LARGE:
14117 /* This is alright even in PIC code as the constant
14118 pool reference is always PC relative and within
14119 the same translation unit. */
14120 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14121 return SYMBOL_SMALL_ABSOLUTE;
14122 else
14123 return SYMBOL_FORCE_TO_MEM;
14124
14125 default:
14126 gcc_unreachable ();
14127 }
14128 }
14129
14130 /* By default push everything into the constant pool. */
14131 return SYMBOL_FORCE_TO_MEM;
14132 }
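/* As an illustrative (non-normative) example of the offset capping above,
   consider a hypothetical non-PIC translation unit built with -mcmodel=tiny:

     extern int table[];
     int *p = &table[1000];	/* offset 4000: SYMBOL_TINY_ABSOLUTE.  */
     int *q = &table[400000];	/* offset 1600000 > 1M: SYMBOL_FORCE_TO_MEM.  */

   The symbol name is made up; only the offset ranges come from the
   classification above.  */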
14133
14134 bool
14135 aarch64_constant_address_p (rtx x)
14136 {
14137 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14138 }
14139
14140 bool
14141 aarch64_legitimate_pic_operand_p (rtx x)
14142 {
14143 if (GET_CODE (x) == SYMBOL_REF
14144 || (GET_CODE (x) == CONST
14145 && GET_CODE (XEXP (x, 0)) == PLUS
14146 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14147 return false;
14148
14149 return true;
14150 }
14151
14152 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14153 that should be rematerialized rather than spilled. */
14154
14155 static bool
14156 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14157 {
14158 /* Support CSE and rematerialization of common constants. */
14159 if (CONST_INT_P (x)
14160 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14161 || GET_CODE (x) == CONST_VECTOR)
14162 return true;
14163
14164 /* Do not allow vector struct mode constants for Advanced SIMD.
14165 We could support 0 and -1 easily, but they need support in
14166 aarch64-simd.md. */
14167 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14168 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14169 return false;
14170
14171 /* Only accept variable-length vector constants if they can be
14172 handled directly.
14173
14174 ??? It would be possible to handle rematerialization of other
14175 constants via secondary reloads. */
14176 if (vec_flags & VEC_ANY_SVE)
14177 return aarch64_simd_valid_immediate (x, NULL);
14178
14179 if (GET_CODE (x) == HIGH)
14180 x = XEXP (x, 0);
14181
14182 /* Accept polynomial constants that can be calculated by using the
14183 destination of a move as the sole temporary. Constants that
14184 require a second temporary cannot be rematerialized (they can't be
14185 forced to memory and also aren't legitimate constants). */
14186 poly_int64 offset;
14187 if (poly_int_rtx_p (x, &offset))
14188 return aarch64_offset_temporaries (false, offset) <= 1;
14189
14190 /* If an offset is being added to something else, we need to allow the
14191 base to be moved into the destination register, meaning that there
14192 are no free temporaries for the offset. */
14193 x = strip_offset (x, &offset);
14194 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14195 return false;
14196
14197 /* Do not allow const (plus (anchor_symbol, const_int)). */
14198 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14199 return false;
14200
14201 /* Treat symbols as constants. Avoid TLS symbols, which are complex;
14202 spilling them is better than rematerializing them. */
14203 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14204 return true;
14205
14206 /* Label references are always constant. */
14207 if (GET_CODE (x) == LABEL_REF)
14208 return true;
14209
14210 return false;
14211 }
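/* For example (an illustration, not part of the compiler): given

     extern int counter;
     __thread int tls_counter;

   the address &counter is treated as a legitimate constant and can be
   rematerialized, whereas &tls_counter has a TLS model and is spilled
   instead, per the SYMBOL_REF_TLS_MODEL check above.  */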
14212
14213 rtx
14214 aarch64_load_tp (rtx target)
14215 {
14216 if (!target
14217 || GET_MODE (target) != Pmode
14218 || !register_operand (target, Pmode))
14219 target = gen_reg_rtx (Pmode);
14220
14221 /* Can return in any reg. */
14222 emit_insn (gen_aarch64_load_tp_hard (target));
14223 return target;
14224 }
14225
14226 /* On AAPCS systems, this is the "struct __va_list". */
14227 static GTY(()) tree va_list_type;
14228
14229 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14230 Return the type to use as __builtin_va_list.
14231
14232 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14233
14234 struct __va_list
14235 {
14236 void *__stack;
14237 void *__gr_top;
14238 void *__vr_top;
14239 int __gr_offs;
14240 int __vr_offs;
14241 }; */
14242
14243 static tree
14244 aarch64_build_builtin_va_list (void)
14245 {
14246 tree va_list_name;
14247 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14248
14249 /* Create the type. */
14250 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14251 /* Give it the required name. */
14252 va_list_name = build_decl (BUILTINS_LOCATION,
14253 TYPE_DECL,
14254 get_identifier ("__va_list"),
14255 va_list_type);
14256 DECL_ARTIFICIAL (va_list_name) = 1;
14257 TYPE_NAME (va_list_type) = va_list_name;
14258 TYPE_STUB_DECL (va_list_type) = va_list_name;
14259
14260 /* Create the fields. */
14261 f_stack = build_decl (BUILTINS_LOCATION,
14262 FIELD_DECL, get_identifier ("__stack"),
14263 ptr_type_node);
14264 f_grtop = build_decl (BUILTINS_LOCATION,
14265 FIELD_DECL, get_identifier ("__gr_top"),
14266 ptr_type_node);
14267 f_vrtop = build_decl (BUILTINS_LOCATION,
14268 FIELD_DECL, get_identifier ("__vr_top"),
14269 ptr_type_node);
14270 f_groff = build_decl (BUILTINS_LOCATION,
14271 FIELD_DECL, get_identifier ("__gr_offs"),
14272 integer_type_node);
14273 f_vroff = build_decl (BUILTINS_LOCATION,
14274 FIELD_DECL, get_identifier ("__vr_offs"),
14275 integer_type_node);
14276
14277 /* Tell tree-stdarg pass about our internal offset fields.
14278 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14279 purposes, to identify whether the code updates the va_list internal
14280 offset fields in an irregular way. */
14281 va_list_gpr_counter_field = f_groff;
14282 va_list_fpr_counter_field = f_vroff;
14283
14284 DECL_ARTIFICIAL (f_stack) = 1;
14285 DECL_ARTIFICIAL (f_grtop) = 1;
14286 DECL_ARTIFICIAL (f_vrtop) = 1;
14287 DECL_ARTIFICIAL (f_groff) = 1;
14288 DECL_ARTIFICIAL (f_vroff) = 1;
14289
14290 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14291 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14292 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14293 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14294 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14295
14296 TYPE_FIELDS (va_list_type) = f_stack;
14297 DECL_CHAIN (f_stack) = f_grtop;
14298 DECL_CHAIN (f_grtop) = f_vrtop;
14299 DECL_CHAIN (f_vrtop) = f_groff;
14300 DECL_CHAIN (f_groff) = f_vroff;
14301
14302 /* Compute its layout. */
14303 layout_type (va_list_type);
14304
14305 return va_list_type;
14306 }
14307
14308 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14309 static void
14310 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14311 {
14312 const CUMULATIVE_ARGS *cum;
14313 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14314 tree stack, grtop, vrtop, groff, vroff;
14315 tree t;
14316 int gr_save_area_size = cfun->va_list_gpr_size;
14317 int vr_save_area_size = cfun->va_list_fpr_size;
14318 int vr_offset;
14319
14320 cum = &crtl->args.info;
14321 if (cfun->va_list_gpr_size)
14322 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14323 cfun->va_list_gpr_size);
14324 if (cfun->va_list_fpr_size)
14325 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14326 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14327
14328 if (!TARGET_FLOAT)
14329 {
14330 gcc_assert (cum->aapcs_nvrn == 0);
14331 vr_save_area_size = 0;
14332 }
14333
14334 f_stack = TYPE_FIELDS (va_list_type_node);
14335 f_grtop = DECL_CHAIN (f_stack);
14336 f_vrtop = DECL_CHAIN (f_grtop);
14337 f_groff = DECL_CHAIN (f_vrtop);
14338 f_vroff = DECL_CHAIN (f_groff);
14339
14340 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14341 NULL_TREE);
14342 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14343 NULL_TREE);
14344 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14345 NULL_TREE);
14346 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14347 NULL_TREE);
14348 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14349 NULL_TREE);
14350
14351 /* Emit code to initialize STACK, which points to the next varargs stack
14352 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14353 by named arguments. STACK is 8-byte aligned. */
14354 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14355 if (cum->aapcs_stack_size > 0)
14356 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14357 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14358 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14359
14360 /* Emit code to initialize GRTOP, the top of the GR save area.
14361 virtual_incoming_args_rtx should have been 16 byte aligned. */
14362 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14363 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14364 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14365
14366 /* Emit code to initialize VRTOP, the top of the VR save area.
14367 This address is gr_save_area_bytes below GRTOP, rounded
14368 down to the next 16-byte boundary. */
14369 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14370 vr_offset = ROUND_UP (gr_save_area_size,
14371 STACK_BOUNDARY / BITS_PER_UNIT);
14372
14373 if (vr_offset)
14374 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14375 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14376 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14377
14378 /* Emit code to initialize GROFF, the offset from GRTOP of the
14379 next GPR argument. */
14380 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14381 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14382 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14383
14384 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14385 of the next VR argument. */
14386 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14387 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14388 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14389 }
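/* A worked example (hypothetical, assuming TARGET_FLOAT and the default
   va_list sizes): for

     void f (int a, ...);

   one GP register (w0) is used by the named argument, so at va_start:

     __stack   = incoming argument pointer (no named stack arguments)
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - 64		(56-byte GR area rounded up to 16)
     __gr_offs = -(8 - 1) * 8  = -56
     __vr_offs = -(8 - 0) * 16 = -128  */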
14390
14391 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14392
14393 static tree
14394 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14395 gimple_seq *post_p ATTRIBUTE_UNUSED)
14396 {
14397 tree addr;
14398 bool indirect_p;
14399 bool is_ha; /* is HFA or HVA. */
14400 bool dw_align; /* double-word align. */
14401 machine_mode ag_mode = VOIDmode;
14402 int nregs;
14403 machine_mode mode;
14404
14405 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14406 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14407 HOST_WIDE_INT size, rsize, adjust, align;
14408 tree t, u, cond1, cond2;
14409
14410 indirect_p = pass_va_arg_by_reference (type);
14411 if (indirect_p)
14412 type = build_pointer_type (type);
14413
14414 mode = TYPE_MODE (type);
14415
14416 f_stack = TYPE_FIELDS (va_list_type_node);
14417 f_grtop = DECL_CHAIN (f_stack);
14418 f_vrtop = DECL_CHAIN (f_grtop);
14419 f_groff = DECL_CHAIN (f_vrtop);
14420 f_vroff = DECL_CHAIN (f_groff);
14421
14422 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14423 f_stack, NULL_TREE);
14424 size = int_size_in_bytes (type);
14425
14426 bool abi_break;
14427 align
14428 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14429
14430 dw_align = false;
14431 adjust = 0;
14432 if (aarch64_vfp_is_call_or_return_candidate (mode,
14433 type,
14434 &ag_mode,
14435 &nregs,
14436 &is_ha))
14437 {
14438 /* No frontends can create types with variable-sized modes, so we
14439 shouldn't be asked to pass or return them. */
14440 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14441
14442 /* TYPE passed in fp/simd registers. */
14443 if (!TARGET_FLOAT)
14444 aarch64_err_no_fpadvsimd (mode);
14445
14446 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14447 unshare_expr (valist), f_vrtop, NULL_TREE);
14448 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14449 unshare_expr (valist), f_vroff, NULL_TREE);
14450
14451 rsize = nregs * UNITS_PER_VREG;
14452
14453 if (is_ha)
14454 {
14455 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14456 adjust = UNITS_PER_VREG - ag_size;
14457 }
14458 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14459 && size < UNITS_PER_VREG)
14460 {
14461 adjust = UNITS_PER_VREG - size;
14462 }
14463 }
14464 else
14465 {
14466 /* TYPE passed in general registers. */
14467 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14468 unshare_expr (valist), f_grtop, NULL_TREE);
14469 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14470 unshare_expr (valist), f_groff, NULL_TREE);
14471 rsize = ROUND_UP (size, UNITS_PER_WORD);
14472 nregs = rsize / UNITS_PER_WORD;
14473
14474 if (align > 8)
14475 {
14476 if (abi_break && warn_psabi)
14477 inform (input_location, "parameter passing for argument of type "
14478 "%qT changed in GCC 9.1", type);
14479 dw_align = true;
14480 }
14481
14482 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14483 && size < UNITS_PER_WORD)
14484 {
14485 adjust = UNITS_PER_WORD - size;
14486 }
14487 }
14488
14489 /* Get a local temporary for the field value. */
14490 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14491
14492 /* Emit code to branch if off >= 0. */
14493 t = build2 (GE_EXPR, boolean_type_node, off,
14494 build_int_cst (TREE_TYPE (off), 0));
14495 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14496
14497 if (dw_align)
14498 {
14499 /* Emit: offs = (offs + 15) & -16. */
14500 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14501 build_int_cst (TREE_TYPE (off), 15));
14502 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14503 build_int_cst (TREE_TYPE (off), -16));
14504 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14505 }
14506 else
14507 roundup = NULL;
14508
14509 /* Update ap.__[g|v]r_offs */
14510 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14511 build_int_cst (TREE_TYPE (off), rsize));
14512 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14513
14514 /* String up: chain ROUNDUP (if any) before the offset update. */
14515 if (roundup)
14516 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14517
14518 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14519 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14520 build_int_cst (TREE_TYPE (f_off), 0));
14521 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14522
14523 /* String up: make sure the assignment happens before the use. */
14524 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14525 COND_EXPR_ELSE (cond1) = t;
14526
14527 /* Prepare the trees handling the argument that is passed on the stack;
14528 the top level node will be stored in ON_STACK. */
14529 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14530 if (align > 8)
14531 {
14532 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14533 t = fold_build_pointer_plus_hwi (arg, 15);
14534 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14535 build_int_cst (TREE_TYPE (t), -16));
14536 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14537 }
14538 else
14539 roundup = NULL;
14540 /* Advance ap.__stack */
14541 t = fold_build_pointer_plus_hwi (arg, size + 7);
14542 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14543 build_int_cst (TREE_TYPE (t), -8));
14544 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14545 /* String up roundup and advance. */
14546 if (roundup)
14547 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14548 /* String up with arg */
14549 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14550 /* Big-endianness related address adjustment. */
14551 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14552 && size < UNITS_PER_WORD)
14553 {
14554 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14555 size_int (UNITS_PER_WORD - size));
14556 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14557 }
14558
14559 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14560 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14561
14562 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14563 t = off;
14564 if (adjust)
14565 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14566 build_int_cst (TREE_TYPE (off), adjust));
14567
14568 t = fold_convert (sizetype, t);
14569 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14570
14571 if (is_ha)
14572 {
14573 /* type ha; // treat as "struct {ftype field[n];}"
14574 ... [computing offs]
14575 for (i = 0; i <nregs; ++i, offs += 16)
14576 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14577 return ha; */
14578 int i;
14579 tree tmp_ha, field_t, field_ptr_t;
14580
14581 /* Declare a local variable. */
14582 tmp_ha = create_tmp_var_raw (type, "ha");
14583 gimple_add_tmp_var (tmp_ha);
14584
14585 /* Establish the base type. */
14586 switch (ag_mode)
14587 {
14588 case E_SFmode:
14589 field_t = float_type_node;
14590 field_ptr_t = float_ptr_type_node;
14591 break;
14592 case E_DFmode:
14593 field_t = double_type_node;
14594 field_ptr_t = double_ptr_type_node;
14595 break;
14596 case E_TFmode:
14597 field_t = long_double_type_node;
14598 field_ptr_t = long_double_ptr_type_node;
14599 break;
14600 case E_HFmode:
14601 field_t = aarch64_fp16_type_node;
14602 field_ptr_t = aarch64_fp16_ptr_type_node;
14603 break;
14604 case E_V2SImode:
14605 case E_V4SImode:
14606 {
14607 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14608 field_t = build_vector_type_for_mode (innertype, ag_mode);
14609 field_ptr_t = build_pointer_type (field_t);
14610 }
14611 break;
14612 default:
14613 gcc_assert (0);
14614 }
14615
14616 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14617 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14618 addr = t;
14619 t = fold_convert (field_ptr_t, addr);
14620 t = build2 (MODIFY_EXPR, field_t,
14621 build1 (INDIRECT_REF, field_t, tmp_ha),
14622 build1 (INDIRECT_REF, field_t, t));
14623
14624 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14625 for (i = 1; i < nregs; ++i)
14626 {
14627 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14628 u = fold_convert (field_ptr_t, addr);
14629 u = build2 (MODIFY_EXPR, field_t,
14630 build2 (MEM_REF, field_t, tmp_ha,
14631 build_int_cst (field_ptr_t,
14632 (i *
14633 int_size_in_bytes (field_t)))),
14634 build1 (INDIRECT_REF, field_t, u));
14635 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14636 }
14637
14638 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14639 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14640 }
14641
14642 COND_EXPR_ELSE (cond2) = t;
14643 addr = fold_convert (build_pointer_type (type), cond1);
14644 addr = build_va_arg_indirect_ref (addr);
14645
14646 if (indirect_p)
14647 addr = build_va_arg_indirect_ref (addr);
14648
14649 return addr;
14650 }
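/* Roughly, the trees built above expand to the following pseudocode for a
   general-register candidate (an illustrative sketch, not the exact gimple):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;			/* register area already exhausted  */
     ap.__gr_offs = off + rsize;	/* after any 16-byte rounding of OFF  */
     if (ap.__gr_offs > 0)
       goto on_stack;			/* this argument did not fit  */
     addr = ap.__gr_top + off;		/* plus any big-endian adjustment  */
     goto done;
   on_stack:
     addr = ap.__stack;			/* plus any 16-byte rounding  */
     ap.__stack = (addr + size + 7) & -8;
   done:
     result = *(type *) addr;

   FP/SIMD candidates use __vr_top/__vr_offs instead, with the extra
   homogeneous-aggregate copy shown in the is_ha block above.  */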
14651
14652 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14653
14654 static void
14655 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14656 const function_arg_info &arg,
14657 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14658 {
14659 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14660 CUMULATIVE_ARGS local_cum;
14661 int gr_saved = cfun->va_list_gpr_size;
14662 int vr_saved = cfun->va_list_fpr_size;
14663
14664 /* The caller has advanced CUM up to, but not beyond, the last named
14665 argument. Advance a local copy of CUM past the last "real" named
14666 argument, to find out how many registers are left over. */
14667 local_cum = *cum;
14668 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14669
14670 /* Find out how many registers we need to save.
14671 Honor tree-stdarg analysis results. */
14672 if (cfun->va_list_gpr_size)
14673 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14674 cfun->va_list_gpr_size / UNITS_PER_WORD);
14675 if (cfun->va_list_fpr_size)
14676 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14677 cfun->va_list_fpr_size / UNITS_PER_VREG);
14678
14679 if (!TARGET_FLOAT)
14680 {
14681 gcc_assert (local_cum.aapcs_nvrn == 0);
14682 vr_saved = 0;
14683 }
14684
14685 if (!no_rtl)
14686 {
14687 if (gr_saved > 0)
14688 {
14689 rtx ptr, mem;
14690
14691 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14692 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14693 - gr_saved * UNITS_PER_WORD);
14694 mem = gen_frame_mem (BLKmode, ptr);
14695 set_mem_alias_set (mem, get_varargs_alias_set ());
14696
14697 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14698 mem, gr_saved);
14699 }
14700 if (vr_saved > 0)
14701 {
14702 /* We can't use move_block_from_reg, because it will use
14703 the wrong mode, storing D regs only. */
14704 machine_mode mode = TImode;
14705 int off, i, vr_start;
14706
14707 /* Set OFF to the offset from virtual_incoming_args_rtx of
14708 the first vector register. The VR save area lies below
14709 the GR one, and is aligned to 16 bytes. */
14710 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14711 STACK_BOUNDARY / BITS_PER_UNIT);
14712 off -= vr_saved * UNITS_PER_VREG;
14713
14714 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14715 for (i = 0; i < vr_saved; ++i)
14716 {
14717 rtx ptr, mem;
14718
14719 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14720 mem = gen_frame_mem (mode, ptr);
14721 set_mem_alias_set (mem, get_varargs_alias_set ());
14722 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14723 off += UNITS_PER_VREG;
14724 }
14725 }
14726 }
14727
14728 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14729 any complication of having crtl->args.pretend_args_size changed. */
14730 cfun->machine->frame.saved_varargs_size
14731 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14732 STACK_BOUNDARY / BITS_PER_UNIT)
14733 + vr_saved * UNITS_PER_VREG);
14734 }
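/* For the one-named-GP-argument example above (vector registers available),
   the register save area laid out here looks like this, relative to
   virtual_incoming_args_rtx (an illustration only):

     [ -56,   0)   x1-x7 (GR save area)
     [-192, -64)   q0-q7 (VR save area, each saved as TImode)

   with 8 bytes of alignment padding between the two areas, and
   frame.saved_varargs_size set to 64 + 128 = 192 bytes.  */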
14735
14736 static void
14737 aarch64_conditional_register_usage (void)
14738 {
14739 int i;
14740 if (!TARGET_FLOAT)
14741 {
14742 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14743 {
14744 fixed_regs[i] = 1;
14745 call_used_regs[i] = 1;
14746 }
14747 }
14748 if (!TARGET_SVE)
14749 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14750 {
14751 fixed_regs[i] = 1;
14752 call_used_regs[i] = 1;
14753 }
14754
14755 /* When tracking speculation, we need a couple of call-clobbered registers
14756 to track the speculation state. It would be nice to just use
14757 IP0 and IP1, but currently there are numerous places that just
14758 assume these registers are free for other uses (eg pointer
14759 authentication). */
14760 if (aarch64_track_speculation)
14761 {
14762 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14763 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14764 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14765 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14766 }
14767 }
14768
14769 /* Walk down the type tree of TYPE counting consecutive base elements.
14770 If *MODEP is VOIDmode, then set it to the first valid floating point
14771 type. If a non-floating point type is found, or if a floating point
14772 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14773 otherwise return the count in the sub-tree. */
14774 static int
14775 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14776 {
14777 machine_mode mode;
14778 HOST_WIDE_INT size;
14779
14780 switch (TREE_CODE (type))
14781 {
14782 case REAL_TYPE:
14783 mode = TYPE_MODE (type);
14784 if (mode != DFmode && mode != SFmode
14785 && mode != TFmode && mode != HFmode)
14786 return -1;
14787
14788 if (*modep == VOIDmode)
14789 *modep = mode;
14790
14791 if (*modep == mode)
14792 return 1;
14793
14794 break;
14795
14796 case COMPLEX_TYPE:
14797 mode = TYPE_MODE (TREE_TYPE (type));
14798 if (mode != DFmode && mode != SFmode
14799 && mode != TFmode && mode != HFmode)
14800 return -1;
14801
14802 if (*modep == VOIDmode)
14803 *modep = mode;
14804
14805 if (*modep == mode)
14806 return 2;
14807
14808 break;
14809
14810 case VECTOR_TYPE:
14811 /* Use V2SImode and V4SImode as representatives of all 64-bit
14812 and 128-bit vector types. */
14813 size = int_size_in_bytes (type);
14814 switch (size)
14815 {
14816 case 8:
14817 mode = V2SImode;
14818 break;
14819 case 16:
14820 mode = V4SImode;
14821 break;
14822 default:
14823 return -1;
14824 }
14825
14826 if (*modep == VOIDmode)
14827 *modep = mode;
14828
14829 /* Vector modes are considered to be opaque: two vectors are
14830 equivalent for the purposes of being homogeneous aggregates
14831 if they are the same size. */
14832 if (*modep == mode)
14833 return 1;
14834
14835 break;
14836
14837 case ARRAY_TYPE:
14838 {
14839 int count;
14840 tree index = TYPE_DOMAIN (type);
14841
14842 /* Can't handle incomplete types nor sizes that are not
14843 fixed. */
14844 if (!COMPLETE_TYPE_P (type)
14845 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14846 return -1;
14847
14848 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14849 if (count == -1
14850 || !index
14851 || !TYPE_MAX_VALUE (index)
14852 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14853 || !TYPE_MIN_VALUE (index)
14854 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14855 || count < 0)
14856 return -1;
14857
14858 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14859 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14860
14861 /* There must be no padding. */
14862 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14863 count * GET_MODE_BITSIZE (*modep)))
14864 return -1;
14865
14866 return count;
14867 }
14868
14869 case RECORD_TYPE:
14870 {
14871 int count = 0;
14872 int sub_count;
14873 tree field;
14874
14875 /* Can't handle incomplete types nor sizes that are not
14876 fixed. */
14877 if (!COMPLETE_TYPE_P (type)
14878 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14879 return -1;
14880
14881 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14882 {
14883 if (TREE_CODE (field) != FIELD_DECL)
14884 continue;
14885
14886 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14887 if (sub_count < 0)
14888 return -1;
14889 count += sub_count;
14890 }
14891
14892 /* There must be no padding. */
14893 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14894 count * GET_MODE_BITSIZE (*modep)))
14895 return -1;
14896
14897 return count;
14898 }
14899
14900 case UNION_TYPE:
14901 case QUAL_UNION_TYPE:
14902 {
14903 /* These aren't very interesting except in a degenerate case. */
14904 int count = 0;
14905 int sub_count;
14906 tree field;
14907
14908 /* Can't handle incomplete types nor sizes that are not
14909 fixed. */
14910 if (!COMPLETE_TYPE_P (type)
14911 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14912 return -1;
14913
14914 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14915 {
14916 if (TREE_CODE (field) != FIELD_DECL)
14917 continue;
14918
14919 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14920 if (sub_count < 0)
14921 return -1;
14922 count = count > sub_count ? count : sub_count;
14923 }
14924
14925 /* There must be no padding. */
14926 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14927 count * GET_MODE_BITSIZE (*modep)))
14928 return -1;
14929
14930 return count;
14931 }
14932
14933 default:
14934 break;
14935 }
14936
14937 return -1;
14938 }
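/* Some illustrative inputs (hypothetical types, not from this file):

     struct hfa { float a, b, c; };		returns 3, *modep == SFmode
     struct hva { int32x4_t a, b; };		returns 2, *modep == V4SImode
     struct mix { float a; double b; };		returns -1: element modes differ

   int32x4_t (from arm_neon.h) stands for any 16-byte vector type here;
   64-bit and 128-bit vectors are represented by V2SImode and V4SImode
   respectively, as noted above.  */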
14939
14940 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14941 type as described in AAPCS64 \S 4.1.2.
14942
14943 See the comment above aarch64_composite_type_p for the notes on MODE. */
14944
14945 static bool
14946 aarch64_short_vector_p (const_tree type,
14947 machine_mode mode)
14948 {
14949 poly_int64 size = -1;
14950
14951 if (type && TREE_CODE (type) == VECTOR_TYPE)
14952 size = int_size_in_bytes (type);
14953 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14954 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14955 size = GET_MODE_SIZE (mode);
14956
14957 return known_eq (size, 8) || known_eq (size, 16);
14958 }
14959
14960 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14961 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14962 array types. The C99 floating-point complex types are also considered
14963 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14964 types, which are GCC extensions and out of the scope of AAPCS64, are
14965 treated as composite types here as well.
14966
14967 Note that MODE itself is not sufficient in determining whether a type
14968 is such a composite type or not. This is because
14969 stor-layout.c:compute_record_mode may have already changed the MODE
14970 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14971 structure with only one field may have its MODE set to the mode of the
14972 field. Also an integer mode whose size matches the size of the
14973 RECORD_TYPE type may be used to substitute the original mode
14974 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14975 solely relied on. */
14976
14977 static bool
14978 aarch64_composite_type_p (const_tree type,
14979 machine_mode mode)
14980 {
14981 if (aarch64_short_vector_p (type, mode))
14982 return false;
14983
14984 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14985 return true;
14986
14987 if (mode == BLKmode
14988 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14989 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14990 return true;
14991
14992 return false;
14993 }
14994
14995 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14996 shall be passed or returned in simd/fp register(s) (providing these
14997 parameter passing registers are available).
14998
14999 Upon successful return, *COUNT returns the number of needed registers,
15000 *BASE_MODE returns the mode of the individual register and, when IS_HA
15001 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
15002 floating-point aggregate or a homogeneous short-vector aggregate. */
15003
15004 static bool
15005 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
15006 const_tree type,
15007 machine_mode *base_mode,
15008 int *count,
15009 bool *is_ha)
15010 {
15011 machine_mode new_mode = VOIDmode;
15012 bool composite_p = aarch64_composite_type_p (type, mode);
15013
15014 if (is_ha != NULL) *is_ha = false;
15015
15016 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
15017 || aarch64_short_vector_p (type, mode))
15018 {
15019 *count = 1;
15020 new_mode = mode;
15021 }
15022 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
15023 {
15024 if (is_ha != NULL) *is_ha = true;
15025 *count = 2;
15026 new_mode = GET_MODE_INNER (mode);
15027 }
15028 else if (type && composite_p)
15029 {
15030 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
15031
15032 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
15033 {
15034 if (is_ha != NULL) *is_ha = true;
15035 *count = ag_count;
15036 }
15037 else
15038 return false;
15039 }
15040 else
15041 return false;
15042
15043 *base_mode = new_mode;
15044 return true;
15045 }
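/* For instance (illustrative only): a _Complex double argument takes the
   MODE_COMPLEX_FLOAT path above and yields *count = 2, *base_mode = DFmode
   and *is_ha = true, i.e. it is passed in two consecutive D registers when
   they are available.  */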
15046
15047 /* Implement TARGET_STRUCT_VALUE_RTX. */
15048
15049 static rtx
15050 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
15051 int incoming ATTRIBUTE_UNUSED)
15052 {
15053 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
15054 }
15055
15056 /* Implements target hook vector_mode_supported_p. */
15057 static bool
15058 aarch64_vector_mode_supported_p (machine_mode mode)
15059 {
15060 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15061 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
15062 }
15063
15064 /* Return the full-width SVE vector mode for element mode MODE, if one
15065 exists. */
15066 opt_machine_mode
15067 aarch64_full_sve_mode (scalar_mode mode)
15068 {
15069 switch (mode)
15070 {
15071 case E_DFmode:
15072 return VNx2DFmode;
15073 case E_SFmode:
15074 return VNx4SFmode;
15075 case E_HFmode:
15076 return VNx8HFmode;
15077 case E_DImode:
15078 return VNx2DImode;
15079 case E_SImode:
15080 return VNx4SImode;
15081 case E_HImode:
15082 return VNx8HImode;
15083 case E_QImode:
15084 return VNx16QImode;
15085 default:
15086 return opt_machine_mode ();
15087 }
15088 }
15089
15090 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15091 if it exists. */
15092 opt_machine_mode
15093 aarch64_vq_mode (scalar_mode mode)
15094 {
15095 switch (mode)
15096 {
15097 case E_DFmode:
15098 return V2DFmode;
15099 case E_SFmode:
15100 return V4SFmode;
15101 case E_HFmode:
15102 return V8HFmode;
15103 case E_SImode:
15104 return V4SImode;
15105 case E_HImode:
15106 return V8HImode;
15107 case E_QImode:
15108 return V16QImode;
15109 case E_DImode:
15110 return V2DImode;
15111 default:
15112 return opt_machine_mode ();
15113 }
15114 }
15115
15116 /* Return appropriate SIMD container
15117 for MODE within a vector of WIDTH bits. */
15118 static machine_mode
15119 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15120 {
15121 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15122 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15123
15124 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15125 if (TARGET_SIMD)
15126 {
15127 if (known_eq (width, 128))
15128 return aarch64_vq_mode (mode).else_mode (word_mode);
15129 else
15130 switch (mode)
15131 {
15132 case E_SFmode:
15133 return V2SFmode;
15134 case E_HFmode:
15135 return V4HFmode;
15136 case E_SImode:
15137 return V2SImode;
15138 case E_HImode:
15139 return V4HImode;
15140 case E_QImode:
15141 return V8QImode;
15142 default:
15143 break;
15144 }
15145 }
15146 return word_mode;
15147 }
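/* A few example mappings (illustrative): SFmode with WIDTH == 128 gives
   V4SFmode and with WIDTH == 64 gives V2SFmode; with TARGET_SVE and
   WIDTH == BITS_PER_SVE_VECTOR, SFmode gives VNx4SFmode. DImode with
   WIDTH == 64 has no 64-bit container and falls back to word_mode.  */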
15148
15149 /* Return the preferred SIMD mode for MODE: a full SVE vector mode if SVE is available, otherwise the 128-bit Advanced SIMD container. */
15150 static machine_mode
15151 aarch64_preferred_simd_mode (scalar_mode mode)
15152 {
15153 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15154 return aarch64_simd_container_mode (mode, bits);
15155 }
15156
15157 /* Return a list of possible vector sizes for the vectorizer
15158 to iterate over. */
15159 static void
15160 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15161 {
15162 if (TARGET_SVE)
15163 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15164 sizes->safe_push (16);
15165 sizes->safe_push (8);
15166 }
15167
15168 /* Implement TARGET_MANGLE_TYPE. */
15169
15170 static const char *
15171 aarch64_mangle_type (const_tree type)
15172 {
15173 /* The AArch64 ABI documents say that "__va_list" has to be
15174 mangled as if it is in the "std" namespace. */
15175 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15176 return "St9__va_list";
15177
15178 /* Half-precision float. */
15179 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15180 return "Dh";
15181
15182 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15183 builtin types. */
15184 if (TYPE_NAME (type) != NULL)
15185 return aarch64_general_mangle_builtin_type (type);
15186
15187 /* Use the default mangling. */
15188 return NULL;
15189 }
15190
15191 /* Find the first rtx_insn before insn that will generate an assembly
15192 instruction. */
15193
15194 static rtx_insn *
15195 aarch64_prev_real_insn (rtx_insn *insn)
15196 {
15197 if (!insn)
15198 return NULL;
15199
15200 do
15201 {
15202 insn = prev_real_insn (insn);
15203 }
15204 while (insn && recog_memoized (insn) < 0);
15205
15206 return insn;
15207 }
15208
15209 static bool
15210 is_madd_op (enum attr_type t1)
15211 {
15212 unsigned int i;
15213 /* A number of these may be AArch32 only. */
15214 enum attr_type mlatypes[] = {
15215 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15216 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15217 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15218 };
15219
15220 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15221 {
15222 if (t1 == mlatypes[i])
15223 return true;
15224 }
15225
15226 return false;
15227 }
15228
15229 /* Check if there is a register dependency between a load and the insn
15230 for which we hold recog_data. */
15231
15232 static bool
15233 dep_between_memop_and_curr (rtx memop)
15234 {
15235 rtx load_reg;
15236 int opno;
15237
15238 gcc_assert (GET_CODE (memop) == SET);
15239
15240 if (!REG_P (SET_DEST (memop)))
15241 return false;
15242
15243 load_reg = SET_DEST (memop);
15244 for (opno = 1; opno < recog_data.n_operands; opno++)
15245 {
15246 rtx operand = recog_data.operand[opno];
15247 if (REG_P (operand)
15248 && reg_overlap_mentioned_p (load_reg, operand))
15249 return true;
15250
15251 }
15252 return false;
15253 }
15254
15255
15256 /* When working around the Cortex-A53 erratum 835769,
15257 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15258 instruction and has a preceding memory instruction such that a NOP
15259 should be inserted between them. */
15260
15261 bool
15262 aarch64_madd_needs_nop (rtx_insn* insn)
15263 {
15264 enum attr_type attr_type;
15265 rtx_insn *prev;
15266 rtx body;
15267
15268 if (!TARGET_FIX_ERR_A53_835769)
15269 return false;
15270
15271 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15272 return false;
15273
15274 attr_type = get_attr_type (insn);
15275 if (!is_madd_op (attr_type))
15276 return false;
15277
15278 prev = aarch64_prev_real_insn (insn);
15279 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15280 Restore recog state to INSN to avoid state corruption. */
15281 extract_constrain_insn_cached (insn);
15282
15283 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15284 return false;
15285
15286 body = single_set (prev);
15287
15288 /* If the previous insn is a memory op and there is no dependency between
15289 it and the DImode madd, emit a NOP between them. If body is NULL then we
15290 have a complex memory operation, probably a load/store pair.
15291 Be conservative for now and emit a NOP. */
15292 if (GET_MODE (recog_data.operand[0]) == DImode
15293 && (!body || !dep_between_memop_and_curr (body)))
15294 return true;
15295
15296 return false;
15297
15298 }
15299
15300
15301 /* Implement FINAL_PRESCAN_INSN. */
15302
15303 void
15304 aarch64_final_prescan_insn (rtx_insn *insn)
15305 {
15306 if (aarch64_madd_needs_nop (insn))
15307 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15308 }
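/* With -mfix-cortex-a53-835769 the effect is, for a hypothetical sequence
   such as

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   to emit

	ldr	x1, [x2]
	nop	// between mem op and mult-accumulate
	madd	x0, x3, x4, x5

   since the 64-bit multiply-accumulate directly follows a memory operation
   and has no register dependency on the loaded value.  */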
15309
15310
15311 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15312 instruction. */
15313
15314 bool
15315 aarch64_sve_index_immediate_p (rtx base_or_step)
15316 {
15317 return (CONST_INT_P (base_or_step)
15318 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15319 }
15320
15321 /* Return true if X is a valid immediate for the SVE ADD and SUB
15322 instructions. Negate X first if NEGATE_P is true. */
15323
15324 bool
15325 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15326 {
15327 rtx elt;
15328
15329 if (!const_vec_duplicate_p (x, &elt)
15330 || !CONST_INT_P (elt))
15331 return false;
15332
15333 HOST_WIDE_INT val = INTVAL (elt);
15334 if (negate_p)
15335 val = -val;
15336 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15337
15338 if (val & 0xff)
15339 return IN_RANGE (val, 0, 0xff);
15340 return IN_RANGE (val, 0, 0xff00);
15341 }
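/* So, for example, a vector of 16-bit elements all equal to 7 or to 0x1100
   is accepted (encodable as #7 and #0x11, LSL #8 respectively), whereas a
   vector of 257s is rejected because 257 needs both bytes.  (Illustrative
   values only.)  */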
15342
15343 /* Return true if X is a valid immediate operand for an SVE logical
15344 instruction such as AND. */
15345
15346 bool
15347 aarch64_sve_bitmask_immediate_p (rtx x)
15348 {
15349 rtx elt;
15350
15351 return (const_vec_duplicate_p (x, &elt)
15352 && CONST_INT_P (elt)
15353 && aarch64_bitmask_imm (INTVAL (elt),
15354 GET_MODE_INNER (GET_MODE (x))));
15355 }
15356
15357 /* Return true if X is a valid immediate for the SVE DUP and CPY
15358 instructions. */
15359
15360 bool
15361 aarch64_sve_dup_immediate_p (rtx x)
15362 {
15363 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15364 if (!CONST_INT_P (x))
15365 return false;
15366
15367 HOST_WIDE_INT val = INTVAL (x);
15368 if (val & 0xff)
15369 return IN_RANGE (val, -0x80, 0x7f);
15370 return IN_RANGE (val, -0x8000, 0x7f00);
15371 }
15372
15373 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15374 SIGNED_P says whether the operand is signed rather than unsigned. */
15375
15376 bool
15377 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15378 {
15379 rtx elt;
15380
15381 return (const_vec_duplicate_p (x, &elt)
15382 && CONST_INT_P (elt)
15383 && (signed_p
15384 ? IN_RANGE (INTVAL (elt), -16, 15)
15385 : IN_RANGE (INTVAL (elt), 0, 127)));
15386 }
15387
15388 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15389 instruction. Negate X first if NEGATE_P is true. */
15390
15391 bool
15392 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15393 {
15394 rtx elt;
15395 REAL_VALUE_TYPE r;
15396
15397 if (!const_vec_duplicate_p (x, &elt)
15398 || GET_CODE (elt) != CONST_DOUBLE)
15399 return false;
15400
15401 r = *CONST_DOUBLE_REAL_VALUE (elt);
15402
15403 if (negate_p)
15404 r = real_value_negate (&r);
15405
15406 if (real_equal (&r, &dconst1))
15407 return true;
15408 if (real_equal (&r, &dconsthalf))
15409 return true;
15410 return false;
15411 }
15412
15413 /* Return true if X is a valid immediate operand for an SVE FMUL
15414 instruction. */
15415
15416 bool
15417 aarch64_sve_float_mul_immediate_p (rtx x)
15418 {
15419 rtx elt;
15420
15421 return (const_vec_duplicate_p (x, &elt)
15422 && GET_CODE (elt) == CONST_DOUBLE
15423 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15424 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15425 }
15426
15427 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15428 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15429 is nonnull, use it to describe valid immediates. */
15430 static bool
15431 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15432 simd_immediate_info *info,
15433 enum simd_immediate_check which,
15434 simd_immediate_info::insn_type insn)
15435 {
15436 /* Try a 4-byte immediate with LSL. */
15437 for (unsigned int shift = 0; shift < 32; shift += 8)
15438 if ((val32 & (0xff << shift)) == val32)
15439 {
15440 if (info)
15441 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15442 simd_immediate_info::LSL, shift);
15443 return true;
15444 }
15445
15446 /* Try a 2-byte immediate with LSL. */
15447 unsigned int imm16 = val32 & 0xffff;
15448 if (imm16 == (val32 >> 16))
15449 for (unsigned int shift = 0; shift < 16; shift += 8)
15450 if ((imm16 & (0xff << shift)) == imm16)
15451 {
15452 if (info)
15453 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15454 simd_immediate_info::LSL, shift);
15455 return true;
15456 }
15457
15458 /* Try a 4-byte immediate with MSL, except for cases that MVN
15459 can handle. */
15460 if (which == AARCH64_CHECK_MOV)
15461 for (unsigned int shift = 8; shift < 24; shift += 8)
15462 {
15463 unsigned int low = (1 << shift) - 1;
15464 if (((val32 & (0xff << shift)) | low) == val32)
15465 {
15466 if (info)
15467 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15468 simd_immediate_info::MSL, shift);
15469 return true;
15470 }
15471 }
15472
15473 return false;
15474 }
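/* Example classifications (illustrative): VAL32 == 0x0000ab00 is matched by
   the 4-byte loop as (SImode, 0xab, LSL #8); VAL32 == 0xab00ab00 is a
   replicated 16-bit value and becomes (HImode, 0xab, LSL #8); and for a MOV
   check, VAL32 == 0x0012ffff is matched by the MSL loop as
   (SImode, 0x12, MSL #16).  */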
15475
15476 /* Return true if replicating VAL64 is a valid immediate for the
15477 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15478 use it to describe valid immediates. */
15479 static bool
15480 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15481 simd_immediate_info *info,
15482 enum simd_immediate_check which)
15483 {
15484 unsigned int val32 = val64 & 0xffffffff;
15485 unsigned int val16 = val64 & 0xffff;
15486 unsigned int val8 = val64 & 0xff;
15487
15488 if (val32 == (val64 >> 32))
15489 {
15490 if ((which & AARCH64_CHECK_ORR) != 0
15491 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15492 simd_immediate_info::MOV))
15493 return true;
15494
15495 if ((which & AARCH64_CHECK_BIC) != 0
15496 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15497 simd_immediate_info::MVN))
15498 return true;
15499
15500 /* Try using a replicated byte. */
15501 if (which == AARCH64_CHECK_MOV
15502 && val16 == (val32 >> 16)
15503 && val8 == (val16 >> 8))
15504 {
15505 if (info)
15506 *info = simd_immediate_info (QImode, val8);
15507 return true;
15508 }
15509 }
15510
15511 /* Try using a bit-to-bytemask. */
15512 if (which == AARCH64_CHECK_MOV)
15513 {
15514 unsigned int i;
15515 for (i = 0; i < 64; i += 8)
15516 {
15517 unsigned char byte = (val64 >> i) & 0xff;
15518 if (byte != 0 && byte != 0xff)
15519 break;
15520 }
15521 if (i == 64)
15522 {
15523 if (info)
15524 *info = simd_immediate_info (DImode, val64);
15525 return true;
15526 }
15527 }
15528 return false;
15529 }
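/* Two more illustrative cases: VAL64 == 0x4242424242424242 is accepted as a
   replicated byte (QImode, 0x42), and VAL64 == 0xff0000ffff0000ff is
   accepted by the bit-to-bytemask check because every byte is either 0x00
   or 0xff.  */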
15530
15531 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15532 instruction. If INFO is nonnull, use it to describe valid immediates. */
15533
15534 static bool
15535 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15536 simd_immediate_info *info)
15537 {
15538 scalar_int_mode mode = DImode;
15539 unsigned int val32 = val64 & 0xffffffff;
15540 if (val32 == (val64 >> 32))
15541 {
15542 mode = SImode;
15543 unsigned int val16 = val32 & 0xffff;
15544 if (val16 == (val32 >> 16))
15545 {
15546 mode = HImode;
15547 unsigned int val8 = val16 & 0xff;
15548 if (val8 == (val16 >> 8))
15549 mode = QImode;
15550 }
15551 }
15552 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15553 if (IN_RANGE (val, -0x80, 0x7f))
15554 {
15555 /* DUP with no shift. */
15556 if (info)
15557 *info = simd_immediate_info (mode, val);
15558 return true;
15559 }
15560 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15561 {
15562 /* DUP with LSL #8. */
15563 if (info)
15564 *info = simd_immediate_info (mode, val);
15565 return true;
15566 }
15567 if (aarch64_bitmask_imm (val64, mode))
15568 {
15569 /* DUPM. */
15570 if (info)
15571 *info = simd_immediate_info (mode, val);
15572 return true;
15573 }
15574 return false;
15575 }
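/* For example (illustrative values): VAL64 == 0x2a2a2a2a2a2a2a2a narrows to
   QImode and is a DUP #42; VAL64 == 0xfe00fe00fe00fe00 narrows to HImode,
   where it is -512, a DUP #-2, LSL #8; and VAL64 == 0x00ff00ff00ff00ff
   narrows to HImode, where 0x00ff is outside DUP range but is a valid
   bitmask immediate, so it is a DUPM.  */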
15576
15577 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15578 it to describe valid immediates. */
15579
15580 static bool
15581 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15582 {
15583 if (x == CONST0_RTX (GET_MODE (x)))
15584 {
15585 if (info)
15586 *info = simd_immediate_info (DImode, 0);
15587 return true;
15588 }
15589
15590 /* Analyze the value as a VNx16BImode. This should be relatively
15591 efficient, since rtx_vector_builder has enough built-in capacity
15592 to store all VLA predicate constants without needing the heap. */
15593 rtx_vector_builder builder;
15594 if (!aarch64_get_sve_pred_bits (builder, x))
15595 return false;
15596
15597 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15598 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15599 {
15600 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15601 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15602 if (pattern != AARCH64_NUM_SVPATTERNS)
15603 {
15604 if (info)
15605 {
15606 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15607 *info = simd_immediate_info (int_mode, pattern);
15608 }
15609 return true;
15610 }
15611 }
15612 return false;
15613 }
15614
15615 /* Return true if OP is a valid SIMD immediate for the operation
15616 described by WHICH. If INFO is nonnull, use it to describe valid
15617 immediates. */
15618 bool
15619 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15620 enum simd_immediate_check which)
15621 {
15622 machine_mode mode = GET_MODE (op);
15623 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15624 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15625 return false;
15626
15627 if (vec_flags & VEC_SVE_PRED)
15628 return aarch64_sve_pred_valid_immediate (op, info);
15629
15630 scalar_mode elt_mode = GET_MODE_INNER (mode);
15631 rtx base, step;
15632 unsigned int n_elts;
15633 if (GET_CODE (op) == CONST_VECTOR
15634 && CONST_VECTOR_DUPLICATE_P (op))
15635 n_elts = CONST_VECTOR_NPATTERNS (op);
15636 else if ((vec_flags & VEC_SVE_DATA)
15637 && const_vec_series_p (op, &base, &step))
15638 {
15639 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15640 if (!aarch64_sve_index_immediate_p (base)
15641 || !aarch64_sve_index_immediate_p (step))
15642 return false;
15643
15644 if (info)
15645 *info = simd_immediate_info (elt_mode, base, step);
15646 return true;
15647 }
15648 else if (GET_CODE (op) == CONST_VECTOR
15649 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15650 /* N_ELTS set above. */;
15651 else
15652 return false;
15653
15654 scalar_float_mode elt_float_mode;
15655 if (n_elts == 1
15656 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15657 {
15658 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15659 if (aarch64_float_const_zero_rtx_p (elt)
15660 || aarch64_float_const_representable_p (elt))
15661 {
15662 if (info)
15663 *info = simd_immediate_info (elt_float_mode, elt);
15664 return true;
15665 }
15666 }
15667
15668 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15669 if (elt_size > 8)
15670 return false;
15671
15672 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15673
15674 /* Expand the vector constant out into a byte vector, with the least
15675 significant byte of the register first. */
15676 auto_vec<unsigned char, 16> bytes;
15677 bytes.reserve (n_elts * elt_size);
15678 for (unsigned int i = 0; i < n_elts; i++)
15679 {
15680 /* The vector is provided in gcc endian-neutral fashion.
15681 For aarch64_be Advanced SIMD, it must be laid out in the vector
15682 register in reverse order. */
15683 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15684 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15685
15686 if (elt_mode != elt_int_mode)
15687 elt = gen_lowpart (elt_int_mode, elt);
15688
15689 if (!CONST_INT_P (elt))
15690 return false;
15691
15692 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15693 for (unsigned int byte = 0; byte < elt_size; byte++)
15694 {
15695 bytes.quick_push (elt_val & 0xff);
15696 elt_val >>= BITS_PER_UNIT;
15697 }
15698 }
15699
15700 /* The immediate must repeat every eight bytes. */
15701 unsigned int nbytes = bytes.length ();
15702 for (unsigned i = 8; i < nbytes; ++i)
15703 if (bytes[i] != bytes[i - 8])
15704 return false;
15705
15706 /* Get the repeating 8-byte value as an integer. No endian correction
15707 is needed here because bytes is already in lsb-first order. */
15708 unsigned HOST_WIDE_INT val64 = 0;
15709 for (unsigned int i = 0; i < 8; i++)
15710 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15711 << (i * BITS_PER_UNIT));
15712
15713 if (vec_flags & VEC_SVE_DATA)
15714 return aarch64_sve_valid_immediate (val64, info);
15715 else
15716 return aarch64_advsimd_valid_immediate (val64, info, which);
15717 }
15718
15719 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15720 has a step in the range of the SVE INDEX instruction. Return the step
15721 if so, otherwise return null. */
15722 rtx
15723 aarch64_check_zero_based_sve_index_immediate (rtx x)
15724 {
15725 rtx base, step;
15726 if (const_vec_series_p (x, &base, &step)
15727 && base == const0_rtx
15728 && aarch64_sve_index_immediate_p (step))
15729 return step;
15730 return NULL_RTX;
15731 }
15732
15733 /* Check if immediate shift constants are within range. */
15734 bool
15735 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15736 {
15737 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15738 if (left)
15739 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15740 else
15741 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15742 }
15743
15744 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15745 operation of width WIDTH at bit position POS. */
15746
15747 rtx
15748 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15749 {
15750 gcc_assert (CONST_INT_P (width));
15751 gcc_assert (CONST_INT_P (pos));
15752
15753 unsigned HOST_WIDE_INT mask
15754 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15755 return GEN_INT (mask << UINTVAL (pos));
15756 }
15757
15758 bool
15759 aarch64_mov_operand_p (rtx x, machine_mode mode)
15760 {
15761 if (GET_CODE (x) == HIGH
15762 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15763 return true;
15764
15765 if (CONST_INT_P (x))
15766 return true;
15767
15768 if (VECTOR_MODE_P (GET_MODE (x)))
15769 {
15770 /* Require predicate constants to be VNx16BI before RA, so that we
15771 force everything to have a canonical form. */
15772 if (!lra_in_progress
15773 && !reload_completed
15774 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15775 && GET_MODE (x) != VNx16BImode)
15776 return false;
15777
15778 return aarch64_simd_valid_immediate (x, NULL);
15779 }
15780
15781 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15782 return true;
15783
15784 if (aarch64_sve_cnt_immediate_p (x))
15785 return true;
15786
15787 return aarch64_classify_symbolic_expression (x)
15788 == SYMBOL_TINY_ABSOLUTE;
15789 }
15790
15791 /* Return a const_int vector of VAL. */
15792 rtx
15793 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15794 {
15795 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15796 return gen_const_vec_duplicate (mode, c);
15797 }
15798
15799 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15800
15801 bool
15802 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15803 {
15804 machine_mode vmode;
15805
15806 vmode = aarch64_simd_container_mode (mode, 64);
15807 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15808 return aarch64_simd_valid_immediate (op_v, NULL);
15809 }
15810
15811 /* Construct and return a PARALLEL RTX vector with elements numbering the
15812 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15813 the vector - from the perspective of the architecture. This does not
15814 line up with GCC's perspective on lane numbers, so we end up with
15815 different masks depending on our target endian-ness. The diagram
15816 below may help. We must draw the distinction when building masks
15817 which select one half of the vector. An instruction selecting
15818 architectural low-lanes for a big-endian target, must be described using
15819 a mask selecting GCC high-lanes.
15820
15821              Big-Endian             Little-Endian
15822
15823 GCC            0   1   2   3          3   2   1   0
15824              | x | x | x | x |      | x | x | x | x |
15825 Architecture   3   2   1   0          3   2   1   0
15826
15827 Low Mask:        { 2, 3 }               { 0, 1 }
15828 High Mask:       { 0, 1 }               { 2, 3 }
15829
15830 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15831
15832 rtx
15833 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15834 {
15835 rtvec v = rtvec_alloc (nunits / 2);
15836 int high_base = nunits / 2;
15837 int low_base = 0;
15838 int base;
15839 rtx t1;
15840 int i;
15841
15842 if (BYTES_BIG_ENDIAN)
15843 base = high ? low_base : high_base;
15844 else
15845 base = high ? high_base : low_base;
15846
15847 for (i = 0; i < nunits / 2; i++)
15848 RTVEC_ELT (v, i) = GEN_INT (base + i);
15849
15850 t1 = gen_rtx_PARALLEL (mode, v);
15851 return t1;
15852 }
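/* For instance, with MODE == V4SImode and NUNITS == 4 (an illustrative
   call): HIGH == false yields (parallel [0 1]) on little-endian and
   (parallel [2 3]) on big-endian, matching the Low Mask row of the diagram
   above; HIGH == true yields the opposite pair.  */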
15853
15854 /* Check OP for validity as a PARALLEL RTX vector with elements
15855 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15856 from the perspective of the architecture. See the diagram above
15857 aarch64_simd_vect_par_cnst_half for more details. */
15858
15859 bool
15860 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15861 bool high)
15862 {
15863 int nelts;
15864 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15865 return false;
15866
15867 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15868 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15869 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15870 int i = 0;
15871
15872 if (count_op != count_ideal)
15873 return false;
15874
15875 for (i = 0; i < count_ideal; i++)
15876 {
15877 rtx elt_op = XVECEXP (op, 0, i);
15878 rtx elt_ideal = XVECEXP (ideal, 0, i);
15879
15880 if (!CONST_INT_P (elt_op)
15881 || INTVAL (elt_ideal) != INTVAL (elt_op))
15882 return false;
15883 }
15884 return true;
15885 }
15886
15887 /* Return a PARALLEL containing NELTS elements, with element I equal
15888 to BASE + I * STEP. */
15889
15890 rtx
15891 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15892 {
15893 rtvec vec = rtvec_alloc (nelts);
15894 for (unsigned int i = 0; i < nelts; ++i)
15895 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15896 return gen_rtx_PARALLEL (VOIDmode, vec);
15897 }
15898
15899 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15900 series with step STEP. */
15901
15902 bool
15903 aarch64_stepped_int_parallel_p (rtx op, int step)
15904 {
15905 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15906 return false;
15907
15908 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15909 for (int i = 1; i < XVECLEN (op, 0); ++i)
15910 if (!CONST_INT_P (XVECEXP (op, 0, i))
15911 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15912 return false;
15913
15914 return true;
15915 }
15916
15917 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15918 HIGH (exclusive). */
15919 void
15920 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15921 const_tree exp)
15922 {
15923 HOST_WIDE_INT lane;
15924 gcc_assert (CONST_INT_P (operand));
15925 lane = INTVAL (operand);
15926
15927 if (lane < low || lane >= high)
15928 {
15929 if (exp)
15930 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15931 else
15932 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15933 }
15934 }
15935
15936 /* Perform endian correction on lane number N, which indexes a vector
15937 of mode MODE, and return the result as an SImode rtx. */
15938
15939 rtx
15940 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15941 {
15942 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15943 }
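
/* For example, GCC lane 1 of a V4SImode vector stays lane 1 on a
   little-endian target but maps to architectural lane 2 on a big-endian
   target, since the lane numbering is reversed there (see the diagram
   above aarch64_simd_vect_par_cnst_half).  */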
15944
15945 /* Return TRUE if OP is a valid vector addressing mode. */
15946
15947 bool
15948 aarch64_simd_mem_operand_p (rtx op)
15949 {
15950 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15951 || REG_P (XEXP (op, 0)));
15952 }
15953
15954 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15955
15956 bool
15957 aarch64_sve_ld1r_operand_p (rtx op)
15958 {
15959 struct aarch64_address_info addr;
15960 scalar_mode mode;
15961
15962 return (MEM_P (op)
15963 && is_a <scalar_mode> (GET_MODE (op), &mode)
15964 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15965 && addr.type == ADDRESS_REG_IMM
15966 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15967 }
15968
15969 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15970 bool
15971 aarch64_sve_ld1rq_operand_p (rtx op)
15972 {
15973 struct aarch64_address_info addr;
15974 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15975 if (!MEM_P (op)
15976 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15977 return false;
15978
15979 if (addr.type == ADDRESS_REG_IMM)
15980 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15981
15982 if (addr.type == ADDRESS_REG_REG)
15983 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15984
15985 return false;
15986 }
15987
15988 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15989 The conditions for STR are the same. */
15990 bool
15991 aarch64_sve_ldr_operand_p (rtx op)
15992 {
15993 struct aarch64_address_info addr;
15994
15995 return (MEM_P (op)
15996 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15997 false, ADDR_QUERY_ANY)
15998 && addr.type == ADDRESS_REG_IMM);
15999 }
16000
16001 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
16002 We need to be able to access the individual pieces, so the range
16003 is different from LD[234] and ST[234]. */
16004 bool
16005 aarch64_sve_struct_memory_operand_p (rtx op)
16006 {
16007 if (!MEM_P (op))
16008 return false;
16009
16010 machine_mode mode = GET_MODE (op);
16011 struct aarch64_address_info addr;
16012 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
16013 ADDR_QUERY_ANY)
16014 || addr.type != ADDRESS_REG_IMM)
16015 return false;
16016
16017 poly_int64 first = addr.const_offset;
16018 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
16019 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
16020 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
16021 }
16022
16023 /* Emit a register copy from operand to operand, taking care not to
16024 early-clobber source registers in the process.
16025
16026 COUNT is the number of components into which the copy needs to be
16027 decomposed. */
16028 void
16029 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
16030 unsigned int count)
16031 {
16032 unsigned int i;
16033 int rdest = REGNO (operands[0]);
16034 int rsrc = REGNO (operands[1]);
16035
16036 if (!reg_overlap_mentioned_p (operands[0], operands[1])
16037 || rdest < rsrc)
16038 for (i = 0; i < count; i++)
16039 emit_move_insn (gen_rtx_REG (mode, rdest + i),
16040 gen_rtx_REG (mode, rsrc + i));
16041 else
16042 for (i = 0; i < count; i++)
16043 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
16044 gen_rtx_REG (mode, rsrc + count - i - 1));
16045 }
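
/* For example, when the destination block starts at V0 and the overlapping
   source block starts at V1, the forward loop (V0 <- V1, then V1 <- V2)
   reads each source register before it is clobbered; when the destination
   starts at the higher register number, the backward loop is used for the
   same reason.  */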
16046
16047 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16048 one of the VSTRUCT modes: OI, CI, or XI. */
16049 int
16050 aarch64_simd_attr_length_rglist (machine_mode mode)
16051 {
16052 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16053 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16054 }
16055
16056 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16057 alignment of a vector to 128 bits. SVE predicates have an alignment of
16058 16 bits. */
16059 static HOST_WIDE_INT
16060 aarch64_simd_vector_alignment (const_tree type)
16061 {
16062 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16063 be set for non-predicate vectors of booleans. Modes are the most
16064 direct way we have of identifying real SVE predicate types. */
16065 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
16066 return 16;
16067 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16068 return 128;
16069 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
16070 }
16071
16072 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16073 static poly_uint64
16074 aarch64_vectorize_preferred_vector_alignment (const_tree type)
16075 {
16076 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
16077 {
16078 /* If the length of the vector is fixed, try to align to that length,
16079 otherwise don't try to align at all. */
16080 HOST_WIDE_INT result;
16081 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
16082 result = TYPE_ALIGN (TREE_TYPE (type));
16083 return result;
16084 }
16085 return TYPE_ALIGN (type);
16086 }
16087
16088 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16089 static bool
16090 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
16091 {
16092 if (is_packed)
16093 return false;
16094
16095 /* For fixed-length vectors, check that the vectorizer will aim for
16096 full-vector alignment. This isn't true for generic GCC vectors
16097 that are wider than the ABI maximum of 128 bits. */
16098 poly_uint64 preferred_alignment =
16099 aarch64_vectorize_preferred_vector_alignment (type);
16100 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16101 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16102 preferred_alignment))
16103 return false;
16104
16105 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16106 return true;
16107 }
16108
16109 /* Return true if the vector misalignment factor is supported by the
16110 target. */
16111 static bool
16112 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16113 const_tree type, int misalignment,
16114 bool is_packed)
16115 {
16116 if (TARGET_SIMD && STRICT_ALIGNMENT)
16117 {
16118 /* Return if movmisalign pattern is not supported for this mode. */
16119 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16120 return false;
16121
16122 /* Misalignment factor is unknown at compile time. */
16123 if (misalignment == -1)
16124 return false;
16125 }
16126 return default_builtin_support_vector_misalignment (mode, type, misalignment,
16127 is_packed);
16128 }
16129
16130 /* If VALS is a vector constant that can be loaded into a register
16131 using DUP, generate instructions to do so and return an RTX to
16132 assign to the register. Otherwise return NULL_RTX. */
16133 static rtx
16134 aarch64_simd_dup_constant (rtx vals)
16135 {
16136 machine_mode mode = GET_MODE (vals);
16137 machine_mode inner_mode = GET_MODE_INNER (mode);
16138 rtx x;
16139
16140 if (!const_vec_duplicate_p (vals, &x))
16141 return NULL_RTX;
16142
16143 /* We can load this constant by using DUP and a constant in a
16144 single ARM register. This will be cheaper than a vector
16145 load. */
16146 x = copy_to_mode_reg (inner_mode, x);
16147 return gen_vec_duplicate (mode, x);
16148 }
16149
16150
16151 /* Generate code to load VALS, which is a PARALLEL containing only
16152 constants (for vec_init) or CONST_VECTOR, efficiently into a
16153 register. Returns an RTX to copy into the register, or NULL_RTX
16154 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16155 static rtx
16156 aarch64_simd_make_constant (rtx vals)
16157 {
16158 machine_mode mode = GET_MODE (vals);
16159 rtx const_dup;
16160 rtx const_vec = NULL_RTX;
16161 int n_const = 0;
16162 int i;
16163
16164 if (GET_CODE (vals) == CONST_VECTOR)
16165 const_vec = vals;
16166 else if (GET_CODE (vals) == PARALLEL)
16167 {
16168 /* A CONST_VECTOR must contain only CONST_INTs and
16169 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16170 Only store valid constants in a CONST_VECTOR. */
16171 int n_elts = XVECLEN (vals, 0);
16172 for (i = 0; i < n_elts; ++i)
16173 {
16174 rtx x = XVECEXP (vals, 0, i);
16175 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16176 n_const++;
16177 }
16178 if (n_const == n_elts)
16179 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16180 }
16181 else
16182 gcc_unreachable ();
16183
16184 if (const_vec != NULL_RTX
16185 && aarch64_simd_valid_immediate (const_vec, NULL))
16186 /* Load using MOVI/MVNI. */
16187 return const_vec;
16188 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16189 /* Loaded using DUP. */
16190 return const_dup;
16191 else if (const_vec != NULL_RTX)
16192 /* Load from constant pool. We cannot take advantage of single-cycle
16193 LD1 because we need a PC-relative addressing mode. */
16194 return const_vec;
16195 else
16196 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16197 We cannot construct an initializer. */
16198 return NULL_RTX;
16199 }
16200
16201 /* Expand a vector initialisation sequence, such that TARGET is
16202 initialised to contain VALS. */
16203
16204 void
16205 aarch64_expand_vector_init (rtx target, rtx vals)
16206 {
16207 machine_mode mode = GET_MODE (target);
16208 scalar_mode inner_mode = GET_MODE_INNER (mode);
16209 /* The number of vector elements. */
16210 int n_elts = XVECLEN (vals, 0);
16211 /* The number of vector elements which are not constant. */
16212 int n_var = 0;
16213 rtx any_const = NULL_RTX;
16214 /* The first element of vals. */
16215 rtx v0 = XVECEXP (vals, 0, 0);
16216 bool all_same = true;
16217
16218 /* This is a special vec_init<M><N> where N is not an element mode but a
16219 vector mode with half the elements of M. We expect to find two entries
16220 of mode N in VALS and we must put their concatenation into TARGET. */
16221 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16222 {
16223 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16224 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16225 rtx lo = XVECEXP (vals, 0, 0);
16226 rtx hi = XVECEXP (vals, 0, 1);
16227 machine_mode narrow_mode = GET_MODE (lo);
16228 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16229 gcc_assert (narrow_mode == GET_MODE (hi));
16230
16231 /* When we want to concatenate a half-width vector with zeroes we can
16232 use the aarch64_combinez[_be] patterns. Just make sure that the
16233 zeroes are in the right half. */
16234 if (BYTES_BIG_ENDIAN
16235 && aarch64_simd_imm_zero (lo, narrow_mode)
16236 && general_operand (hi, narrow_mode))
16237 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16238 else if (!BYTES_BIG_ENDIAN
16239 && aarch64_simd_imm_zero (hi, narrow_mode)
16240 && general_operand (lo, narrow_mode))
16241 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16242 else
16243 {
16244 /* Else create the two half-width registers and combine them. */
16245 if (!REG_P (lo))
16246 lo = force_reg (GET_MODE (lo), lo);
16247 if (!REG_P (hi))
16248 hi = force_reg (GET_MODE (hi), hi);
16249
16250 if (BYTES_BIG_ENDIAN)
16251 std::swap (lo, hi);
16252 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16253 }
16254 return;
16255 }
16256
16257 /* Count the number of variable elements to initialise. */
16258 for (int i = 0; i < n_elts; ++i)
16259 {
16260 rtx x = XVECEXP (vals, 0, i);
16261 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16262 ++n_var;
16263 else
16264 any_const = x;
16265
16266 all_same &= rtx_equal_p (x, v0);
16267 }
16268
16269 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16270 how best to handle this. */
16271 if (n_var == 0)
16272 {
16273 rtx constant = aarch64_simd_make_constant (vals);
16274 if (constant != NULL_RTX)
16275 {
16276 emit_move_insn (target, constant);
16277 return;
16278 }
16279 }
16280
16281 /* Splat a single non-constant element if we can. */
16282 if (all_same)
16283 {
16284 rtx x = copy_to_mode_reg (inner_mode, v0);
16285 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16286 return;
16287 }
16288
16289 enum insn_code icode = optab_handler (vec_set_optab, mode);
16290 gcc_assert (icode != CODE_FOR_nothing);
16291
16292 /* If there are only variable elements, try to optimize
16293 the insertion using dup for the most common element
16294 followed by insertions. */
16295
16296 /* The algorithm will fill matches[*][0] with the earliest matching element,
16297 and matches[X][1] with the count of duplicate elements (if X is the
16298 earliest element which has duplicates). */
16299
16300 if (n_var == n_elts && n_elts <= 16)
16301 {
16302 int matches[16][2] = {0};
16303 for (int i = 0; i < n_elts; i++)
16304 {
16305 for (int j = 0; j <= i; j++)
16306 {
16307 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16308 {
16309 matches[i][0] = j;
16310 matches[j][1]++;
16311 break;
16312 }
16313 }
16314 }
16315 int maxelement = 0;
16316 int maxv = 0;
16317 for (int i = 0; i < n_elts; i++)
16318 if (matches[i][1] > maxv)
16319 {
16320 maxelement = i;
16321 maxv = matches[i][1];
16322 }
16323
16324 /* Create a duplicate of the most common element, unless all elements
16325 are equally useless to us, in which case just immediately set the
16326 vector register using the first element. */
16327
16328 if (maxv == 1)
16329 {
16330 /* For vectors of two 64-bit elements, we can do even better. */
16331 if (n_elts == 2
16332 && (inner_mode == E_DImode
16333 || inner_mode == E_DFmode))
16334
16335 {
16336 rtx x0 = XVECEXP (vals, 0, 0);
16337 rtx x1 = XVECEXP (vals, 0, 1);
16338 /* Combine can pick up this case, but handling it directly
16339 here leaves clearer RTL.
16340
16341 This is load_pair_lanes<mode>, and also gives us a clean-up
16342 for store_pair_lanes<mode>. */
16343 if (memory_operand (x0, inner_mode)
16344 && memory_operand (x1, inner_mode)
16345 && !STRICT_ALIGNMENT
16346 && rtx_equal_p (XEXP (x1, 0),
16347 plus_constant (Pmode,
16348 XEXP (x0, 0),
16349 GET_MODE_SIZE (inner_mode))))
16350 {
16351 rtx t;
16352 if (inner_mode == DFmode)
16353 t = gen_load_pair_lanesdf (target, x0, x1);
16354 else
16355 t = gen_load_pair_lanesdi (target, x0, x1);
16356 emit_insn (t);
16357 return;
16358 }
16359 }
16360 /* The subreg-move sequence below will move into lane zero of the
16361 vector register. For big-endian we want that position to hold
16362 the last element of VALS. */
16363 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16364 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16365 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16366 }
16367 else
16368 {
16369 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16370 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16371 }
16372
16373 /* Insert the rest. */
16374 for (int i = 0; i < n_elts; i++)
16375 {
16376 rtx x = XVECEXP (vals, 0, i);
16377 if (matches[i][0] == maxelement)
16378 continue;
16379 x = copy_to_mode_reg (inner_mode, x);
16380 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16381 }
16382 return;
16383 }
16384
16385 /* Initialise a vector which is part-variable. We want to first try
16386 to build those lanes which are constant in the most efficient way we
16387 can. */
16388 if (n_var != n_elts)
16389 {
16390 rtx copy = copy_rtx (vals);
16391
16392 /* Load constant part of vector. We really don't care what goes into the
16393 parts we will overwrite, but we're more likely to be able to load the
16394 constant efficiently if it has fewer, larger, repeating parts
16395 (see aarch64_simd_valid_immediate). */
16396 for (int i = 0; i < n_elts; i++)
16397 {
16398 rtx x = XVECEXP (vals, 0, i);
16399 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16400 continue;
16401 rtx subst = any_const;
16402 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16403 {
16404 /* Look in the copied vector, as more elements are const. */
16405 rtx test = XVECEXP (copy, 0, i ^ bit);
16406 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16407 {
16408 subst = test;
16409 break;
16410 }
16411 }
16412 XVECEXP (copy, 0, i) = subst;
16413 }
16414 aarch64_expand_vector_init (target, copy);
16415 }
16416
16417 /* Insert the variable lanes directly. */
16418 for (int i = 0; i < n_elts; i++)
16419 {
16420 rtx x = XVECEXP (vals, 0, i);
16421 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16422 continue;
16423 x = copy_to_mode_reg (inner_mode, x);
16424 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16425 }
16426 }
16427
16428 /* Emit RTL corresponding to:
16429 insr TARGET, ELEM. */
16430
16431 static void
16432 emit_insr (rtx target, rtx elem)
16433 {
16434 machine_mode mode = GET_MODE (target);
16435 scalar_mode elem_mode = GET_MODE_INNER (mode);
16436 elem = force_reg (elem_mode, elem);
16437
16438 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16439 gcc_assert (icode != CODE_FOR_nothing);
16440 emit_insn (GEN_FCN (icode) (target, target, elem));
16441 }
16442
16443 /* Subroutine of aarch64_sve_expand_vector_init for handling
16444 trailing constants.
16445 This function works as follows:
16446 (a) Create a new vector consisting of trailing constants.
16447 (b) Initialize TARGET with the constant vector using emit_move_insn.
16448 (c) Insert remaining elements in TARGET using insr.
16449 NELTS is the total number of elements in the original vector, while
16450 NELTS_REQD is the number of elements that are actually
16451 significant.
16452
16453 ??? The heuristic used is to do the above only if the number of constants
16454 is at least half the total number of elements. May need fine-tuning.
16455
16456 static bool
16457 aarch64_sve_expand_vector_init_handle_trailing_constants
16458 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16459 {
16460 machine_mode mode = GET_MODE (target);
16461 scalar_mode elem_mode = GET_MODE_INNER (mode);
16462 int n_trailing_constants = 0;
16463
16464 for (int i = nelts_reqd - 1;
16465 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16466 i--)
16467 n_trailing_constants++;
16468
16469 if (n_trailing_constants >= nelts_reqd / 2)
16470 {
16471 rtx_vector_builder v (mode, 1, nelts);
16472 for (int i = 0; i < nelts; i++)
16473 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16474 rtx const_vec = v.build ();
16475 emit_move_insn (target, const_vec);
16476
16477 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16478 emit_insr (target, builder.elt (i));
16479
16480 return true;
16481 }
16482
16483 return false;
16484 }
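
/* For example, with NELTS_REQD == 4 and BUILDER == { x, y, 1, 2 } (where x
   and y are non-constant), there are two trailing constants, so TARGET is
   first set to a constant vector whose leading lanes are { 1, 2, ... } and
   then
     insr TARGET, y
     insr TARGET, x
   shift those constants right, leaving the significant lanes of TARGET
   equal to { x, y, 1, 2 }.  */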
16485
16486 /* Subroutine of aarch64_sve_expand_vector_init.
16487 Works as follows:
16488 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16489 (b) Skip trailing elements from BUILDER, which are the same as
16490 element NELTS_REQD - 1.
16491 (c) Insert earlier elements in reverse order in TARGET using insr. */
16492
16493 static void
16494 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16495 const rtx_vector_builder &builder,
16496 int nelts_reqd)
16497 {
16498 machine_mode mode = GET_MODE (target);
16499 scalar_mode elem_mode = GET_MODE_INNER (mode);
16500
16501 struct expand_operand ops[2];
16502 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16503 gcc_assert (icode != CODE_FOR_nothing);
16504
16505 create_output_operand (&ops[0], target, mode);
16506 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16507 expand_insn (icode, 2, ops);
16508
16509 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16510 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16511 emit_insr (target, builder.elt (i));
16512 }
16513
16514 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16515 when all trailing elements of builder are same.
16516 This works as follows:
16517 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16518 (b) Insert remaining elements in TARGET using insr.
16519
16520 ??? The heuristic used is to do the above if the number of identical
16521 trailing elements is at least 3/4 of the total number of elements,
16522 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning.
16523
16524 static bool
16525 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16526 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16527 {
16528 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16529 if (ndups >= (3 * nelts_reqd) / 4)
16530 {
16531 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16532 nelts_reqd - ndups + 1);
16533 return true;
16534 }
16535
16536 return false;
16537 }
16538
16539 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16540 of elements in BUILDER.
16541
16542 The function tries to initialize TARGET from BUILDER if it fits one
16543 of the special cases outlined below.
16544
16545 Failing that, the function divides BUILDER into two sub-vectors:
16546 v_even = even elements of BUILDER;
16547 v_odd = odd elements of BUILDER;
16548
16549 and recursively calls itself with v_even and v_odd.
16550
16551 if (recursive call succeeded for v_even or v_odd)
16552 TARGET = zip (v_even, v_odd)
16553
16554 The function returns true if it managed to build TARGET from BUILDER
16555 with one of the special cases, false otherwise.
16556
16557 Example: {a, 1, b, 2, c, 3, d, 4}
16558
16559 The vector gets divided into:
16560 v_even = {a, b, c, d}
16561 v_odd = {1, 2, 3, 4}
16562
16563 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16564 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16565
16566 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16567 4 elements, so we construct tmp1 from v_even using insr:
16568 tmp1 = dup(d)
16569 insr tmp1, c
16570 insr tmp1, b
16571 insr tmp1, a
16572
16573 And finally:
16574 TARGET = zip (tmp1, tmp2)
16575 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16576
16577 static bool
16578 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16579 int nelts, int nelts_reqd)
16580 {
16581 machine_mode mode = GET_MODE (target);
16582
16583 /* Case 1: Vector contains trailing constants. */
16584
16585 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16586 (target, builder, nelts, nelts_reqd))
16587 return true;
16588
16589 /* Case 2: Vector contains leading constants. */
16590
16591 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16592 for (int i = 0; i < nelts_reqd; i++)
16593 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16594 rev_builder.finalize ();
16595
16596 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16597 (target, rev_builder, nelts, nelts_reqd))
16598 {
16599 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16600 return true;
16601 }
16602
16603 /* Case 3: Vector contains trailing same element. */
16604
16605 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16606 (target, builder, nelts_reqd))
16607 return true;
16608
16609 /* Case 4: Vector contains leading same element. */
16610
16611 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16612 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16613 {
16614 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16615 return true;
16616 }
16617
16618 /* Avoid recursing below 4 elements.
16619 ??? The threshold 4 may need fine-tuning. */
16620
16621 if (nelts_reqd <= 4)
16622 return false;
16623
16624 rtx_vector_builder v_even (mode, 1, nelts);
16625 rtx_vector_builder v_odd (mode, 1, nelts);
16626
16627 for (int i = 0; i < nelts * 2; i += 2)
16628 {
16629 v_even.quick_push (builder.elt (i));
16630 v_odd.quick_push (builder.elt (i + 1));
16631 }
16632
16633 v_even.finalize ();
16634 v_odd.finalize ();
16635
16636 rtx tmp1 = gen_reg_rtx (mode);
16637 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16638 nelts, nelts_reqd / 2);
16639
16640 rtx tmp2 = gen_reg_rtx (mode);
16641 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16642 nelts, nelts_reqd / 2);
16643
16644 if (!did_even_p && !did_odd_p)
16645 return false;
16646
16647 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16648 special cases and zip v_even, v_odd. */
16649
16650 if (!did_even_p)
16651 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16652
16653 if (!did_odd_p)
16654 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16655
16656 rtvec v = gen_rtvec (2, tmp1, tmp2);
16657 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16658 return true;
16659 }
16660
16661 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16662
16663 void
16664 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16665 {
16666 machine_mode mode = GET_MODE (target);
16667 int nelts = XVECLEN (vals, 0);
16668
16669 rtx_vector_builder v (mode, 1, nelts);
16670 for (int i = 0; i < nelts; i++)
16671 v.quick_push (XVECEXP (vals, 0, i));
16672 v.finalize ();
16673
16674 /* If neither sub-vector of v could be initialized specially,
16675 then use INSR to insert all elements from v into TARGET.
16676 ??? This might not be optimal for vectors with large
16677 initializers like 16-element or above.
16678 For nelts < 4, it probably isn't useful to handle specially. */
16679
16680 if (nelts < 4
16681 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16682 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16683 }
16684
16685 /* Check whether VALUE is a vector constant in which every element
16686 is either a power of 2 or a negated power of 2. If so, return
16687 a constant vector of log2s, and flip CODE between PLUS and MINUS
16688 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16689
16690 static rtx
16691 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16692 {
16693 if (GET_CODE (value) != CONST_VECTOR)
16694 return NULL_RTX;
16695
16696 rtx_vector_builder builder;
16697 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16698 return NULL_RTX;
16699
16700 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16701 /* 1 if the result of the multiplication must be negated,
16702 0 if it mustn't, or -1 if we don't yet care. */
16703 int negate = -1;
16704 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16705 for (unsigned int i = 0; i < encoded_nelts; ++i)
16706 {
16707 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16708 if (!CONST_SCALAR_INT_P (elt))
16709 return NULL_RTX;
16710 rtx_mode_t val (elt, int_mode);
16711 wide_int pow2 = wi::neg (val);
16712 if (val != pow2)
16713 {
16714 /* It matters whether we negate or not. Make that choice,
16715 and make sure that it's consistent with previous elements. */
16716 if (negate == !wi::neg_p (val))
16717 return NULL_RTX;
16718 negate = wi::neg_p (val);
16719 if (!negate)
16720 pow2 = val;
16721 }
16722 /* POW2 is now the value that we want to be a power of 2. */
16723 int shift = wi::exact_log2 (pow2);
16724 if (shift < 0)
16725 return NULL_RTX;
16726 builder.quick_push (gen_int_mode (shift, int_mode));
16727 }
16728 if (negate == -1)
16729 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16730 code = PLUS;
16731 else if (negate == 1)
16732 code = code == PLUS ? MINUS : PLUS;
16733 return builder.build ();
16734 }
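
/* For example, a CONST_VECTOR of { 4, 4, ... } is converted to the shift
   vector { 2, 2, ... } with CODE left unchanged, while { -4, -4, ... }
   yields the same shift vector but flips CODE between PLUS and MINUS, so
   that x * -4 + y is emitted as y - (x << 2) by the callers below.  */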
16735
16736 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16737 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16738 operands array, in the same order as for fma_optab. Return true if
16739 the function emitted all the necessary instructions, false if the caller
16740 should generate the pattern normally with the new OPERANDS array. */
16741
16742 bool
16743 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16744 {
16745 machine_mode mode = GET_MODE (operands[0]);
16746 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16747 {
16748 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16749 NULL_RTX, true, OPTAB_DIRECT);
16750 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16751 operands[3], product, operands[0], true,
16752 OPTAB_DIRECT);
16753 return true;
16754 }
16755 operands[2] = force_reg (mode, operands[2]);
16756 return false;
16757 }
16758
16759 /* Likewise, but for a conditional pattern. */
16760
16761 bool
16762 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16763 {
16764 machine_mode mode = GET_MODE (operands[0]);
16765 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16766 {
16767 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16768 NULL_RTX, true, OPTAB_DIRECT);
16769 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16770 operands[4], product, operands[5]));
16771 return true;
16772 }
16773 operands[3] = force_reg (mode, operands[3]);
16774 return false;
16775 }
16776
16777 static unsigned HOST_WIDE_INT
16778 aarch64_shift_truncation_mask (machine_mode mode)
16779 {
16780 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16781 return 0;
16782 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16783 }
16784
16785 /* Select a format to encode pointers in exception handling data. */
16786 int
16787 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16788 {
16789 int type;
16790 switch (aarch64_cmodel)
16791 {
16792 case AARCH64_CMODEL_TINY:
16793 case AARCH64_CMODEL_TINY_PIC:
16794 case AARCH64_CMODEL_SMALL:
16795 case AARCH64_CMODEL_SMALL_PIC:
16796 case AARCH64_CMODEL_SMALL_SPIC:
16797 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16798 for everything. */
16799 type = DW_EH_PE_sdata4;
16800 break;
16801 default:
16802 /* No assumptions here. 8-byte relocs required. */
16803 type = DW_EH_PE_sdata8;
16804 break;
16805 }
16806 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16807 }
16808
16809 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16810
16811 static void
16812 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16813 {
16814 if (aarch64_simd_decl_p (decl))
16815 {
16816 fprintf (stream, "\t.variant_pcs\t");
16817 assemble_name (stream, name);
16818 fprintf (stream, "\n");
16819 }
16820 }
16821
16822 /* The last .arch and .tune assembly strings that we printed. */
16823 static std::string aarch64_last_printed_arch_string;
16824 static std::string aarch64_last_printed_tune_string;
16825
16826 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16827 by the function fndecl. */
16828
16829 void
16830 aarch64_declare_function_name (FILE *stream, const char* name,
16831 tree fndecl)
16832 {
16833 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16834
16835 struct cl_target_option *targ_options;
16836 if (target_parts)
16837 targ_options = TREE_TARGET_OPTION (target_parts);
16838 else
16839 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16840 gcc_assert (targ_options);
16841
16842 const struct processor *this_arch
16843 = aarch64_get_arch (targ_options->x_explicit_arch);
16844
16845 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16846 std::string extension
16847 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16848 this_arch->flags);
16849 /* Only update the assembler .arch string if it is distinct from the last
16850 such string we printed. */
16851 std::string to_print = this_arch->name + extension;
16852 if (to_print != aarch64_last_printed_arch_string)
16853 {
16854 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16855 aarch64_last_printed_arch_string = to_print;
16856 }
16857
16858 /* Print the cpu name we're tuning for in the comments; it might be
16859 useful to readers of the generated asm. Do it only when it changes
16860 from function to function and verbose assembly is requested. */
16861 const struct processor *this_tune
16862 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16863
16864 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16865 {
16866 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16867 this_tune->name);
16868 aarch64_last_printed_tune_string = this_tune->name;
16869 }
16870
16871 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16872
16873 /* Don't forget the type directive for ELF. */
16874 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16875 ASM_OUTPUT_LABEL (stream, name);
16876 }
16877
16878 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16879
16880 void
16881 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16882 {
16883 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16884 const char *value = IDENTIFIER_POINTER (target);
16885 aarch64_asm_output_variant_pcs (stream, decl, name);
16886 ASM_OUTPUT_DEF (stream, name, value);
16887 }
16888
16889 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16890 function symbol references. */
16891
16892 void
16893 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16894 {
16895 default_elf_asm_output_external (stream, decl, name);
16896 aarch64_asm_output_variant_pcs (stream, decl, name);
16897 }
16898
16899 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16900 Used to output the .cfi_b_key_frame directive when signing the current
16901 function with the B key. */
16902
16903 void
16904 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16905 {
16906 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16907 && aarch64_ra_sign_key == AARCH64_KEY_B)
16908 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16909 }
16910
16911 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16912
16913 static void
16914 aarch64_start_file (void)
16915 {
16916 struct cl_target_option *default_options
16917 = TREE_TARGET_OPTION (target_option_default_node);
16918
16919 const struct processor *default_arch
16920 = aarch64_get_arch (default_options->x_explicit_arch);
16921 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16922 std::string extension
16923 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16924 default_arch->flags);
16925
16926 aarch64_last_printed_arch_string = default_arch->name + extension;
16927 aarch64_last_printed_tune_string = "";
16928 asm_fprintf (asm_out_file, "\t.arch %s\n",
16929 aarch64_last_printed_arch_string.c_str ());
16930
16931 default_file_start ();
16932 }
16933
16934 /* Emit load exclusive. */
16935
16936 static void
16937 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16938 rtx mem, rtx model_rtx)
16939 {
16940 if (mode == TImode)
16941 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
16942 gen_highpart (DImode, rval),
16943 mem, model_rtx));
16944 else
16945 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16946 }
16947
16948 /* Emit store exclusive. */
16949
16950 static void
16951 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16952 rtx mem, rtx rval, rtx model_rtx)
16953 {
16954 if (mode == TImode)
16955 emit_insn (gen_aarch64_store_exclusive_pair
16956 (bval, mem, operand_subword (rval, 0, 0, TImode),
16957 operand_subword (rval, 1, 0, TImode), model_rtx));
16958 else
16959 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16960 }
16961
16962 /* Mark the previous jump instruction as unlikely. */
16963
16964 static void
16965 aarch64_emit_unlikely_jump (rtx insn)
16966 {
16967 rtx_insn *jump = emit_jump_insn (insn);
16968 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16969 }
16970
16971 /* We store the names of the various atomic helpers in a 5x4 array.
16972 Return the libcall function given MODE, MODEL and NAMES. */
16973
16974 rtx
16975 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
16976 const atomic_ool_names *names)
16977 {
16978 memmodel model = memmodel_base (INTVAL (model_rtx));
16979 int mode_idx, model_idx;
16980
16981 switch (mode)
16982 {
16983 case E_QImode:
16984 mode_idx = 0;
16985 break;
16986 case E_HImode:
16987 mode_idx = 1;
16988 break;
16989 case E_SImode:
16990 mode_idx = 2;
16991 break;
16992 case E_DImode:
16993 mode_idx = 3;
16994 break;
16995 case E_TImode:
16996 mode_idx = 4;
16997 break;
16998 default:
16999 gcc_unreachable ();
17000 }
17001
17002 switch (model)
17003 {
17004 case MEMMODEL_RELAXED:
17005 model_idx = 0;
17006 break;
17007 case MEMMODEL_CONSUME:
17008 case MEMMODEL_ACQUIRE:
17009 model_idx = 1;
17010 break;
17011 case MEMMODEL_RELEASE:
17012 model_idx = 2;
17013 break;
17014 case MEMMODEL_ACQ_REL:
17015 case MEMMODEL_SEQ_CST:
17016 model_idx = 3;
17017 break;
17018 default:
17019 gcc_unreachable ();
17020 }
17021
17022 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
17023 VISIBILITY_HIDDEN);
17024 }
17025
17026 #define DEF0(B, N) \
17027 { "__aarch64_" #B #N "_relax", \
17028 "__aarch64_" #B #N "_acq", \
17029 "__aarch64_" #B #N "_rel", \
17030 "__aarch64_" #B #N "_acq_rel" }
17031
17032 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17033 { NULL, NULL, NULL, NULL }
17034 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17035
17036 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
17037 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
17038 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
17039 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
17040 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
17041 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
17042
17043 #undef DEF0
17044 #undef DEF4
17045 #undef DEF5
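
/* For example, aarch64_atomic_ool_func (SImode, GEN_INT (MEMMODEL_ACQUIRE),
   &aarch64_ool_cas_names) selects "__aarch64_cas4_acq": SImode gives
   mode_idx 2 (the 4-byte entry) and ACQUIRE gives model_idx 1.  */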
17046
17047 /* Expand a compare and swap pattern. */
17048
17049 void
17050 aarch64_expand_compare_and_swap (rtx operands[])
17051 {
17052 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
17053 machine_mode mode, r_mode;
17054
17055 bval = operands[0];
17056 rval = operands[1];
17057 mem = operands[2];
17058 oldval = operands[3];
17059 newval = operands[4];
17060 is_weak = operands[5];
17061 mod_s = operands[6];
17062 mod_f = operands[7];
17063 mode = GET_MODE (mem);
17064
17065 /* Normally the succ memory model must be stronger than fail, but in the
17066 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17067 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17068 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
17069 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
17070 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
17071
17072 r_mode = mode;
17073 if (mode == QImode || mode == HImode)
17074 {
17075 r_mode = SImode;
17076 rval = gen_reg_rtx (r_mode);
17077 }
17078
17079 if (TARGET_LSE)
17080 {
17081 /* The CAS insn requires oldval and rval overlap, but we need to
17082 have a copy of oldval saved across the operation to tell if
17083 the operation is successful. */
17084 if (reg_overlap_mentioned_p (rval, oldval))
17085 rval = copy_to_mode_reg (r_mode, oldval);
17086 else
17087 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
17088
17089 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
17090 newval, mod_s));
17091 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17092 }
17093 else if (TARGET_OUTLINE_ATOMICS)
17094 {
17095 /* Oldval must satisfy compare afterward. */
17096 if (!aarch64_plus_operand (oldval, mode))
17097 oldval = force_reg (mode, oldval);
17098 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
17099 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
17100 oldval, mode, newval, mode,
17101 XEXP (mem, 0), Pmode);
17102 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17103 }
17104 else
17105 {
17106 /* The oldval predicate varies by mode. Test it and force to reg. */
17107 insn_code code = code_for_aarch64_compare_and_swap (mode);
17108 if (!insn_data[code].operand[2].predicate (oldval, mode))
17109 oldval = force_reg (mode, oldval);
17110
17111 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17112 is_weak, mod_s, mod_f));
17113 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17114 }
17115
17116 if (r_mode != mode)
17117 rval = gen_lowpart (mode, rval);
17118 emit_move_insn (operands[1], rval);
17119
17120 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
17121 emit_insn (gen_rtx_SET (bval, x));
17122 }
17123
17124 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17125 sequence implementing an atomic operation. */
17126
17127 static void
17128 aarch64_emit_post_barrier (enum memmodel model)
17129 {
17130 const enum memmodel base_model = memmodel_base (model);
17131
17132 if (is_mm_sync (model)
17133 && (base_model == MEMMODEL_ACQUIRE
17134 || base_model == MEMMODEL_ACQ_REL
17135 || base_model == MEMMODEL_SEQ_CST))
17136 {
17137 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
17138 }
17139 }
17140
17141 /* Split a compare and swap pattern. */
17142
17143 void
17144 aarch64_split_compare_and_swap (rtx operands[])
17145 {
17146 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
17147 machine_mode mode;
17148 bool is_weak;
17149 rtx_code_label *label1, *label2;
17150 enum memmodel model;
17151
17152 rval = operands[0];
17153 mem = operands[1];
17154 oldval = operands[2];
17155 newval = operands[3];
17156 is_weak = (operands[4] != const0_rtx);
17157 model_rtx = operands[5];
17158 scratch = operands[7];
17159 mode = GET_MODE (mem);
17160 model = memmodel_from_int (INTVAL (model_rtx));
17161
17162 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17163 loop:
17164 .label1:
17165 LD[A]XR rval, [mem]
17166 CBNZ rval, .label2
17167 ST[L]XR scratch, newval, [mem]
17168 CBNZ scratch, .label1
17169 .label2:
17170 CMP rval, 0. */
17171 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
17172 && oldval == const0_rtx && mode != TImode);
17173
17174 label1 = NULL;
17175 if (!is_weak)
17176 {
17177 label1 = gen_label_rtx ();
17178 emit_label (label1);
17179 }
17180 label2 = gen_label_rtx ();
17181
17182 /* The initial load can be relaxed for a __sync operation since a final
17183 barrier will be emitted to stop code hoisting. */
17184 if (is_mm_sync (model))
17185 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
17186 else
17187 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
17188
17189 if (strong_zero_p)
17190 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17191 else
17192 {
17193 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17194 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17195 }
17196 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17197 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
17198 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17199
17200 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
17201
17202 if (!is_weak)
17203 {
17204 if (aarch64_track_speculation)
17205 {
17206 /* Emit an explicit compare instruction, so that we can correctly
17207 track the condition codes. */
17208 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17209 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17210 }
17211 else
17212 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
17213
17214 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17215 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
17216 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17217 }
17218 else
17219 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17220
17221 emit_label (label2);
17222
17223 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17224 to set the condition flags. If this is not used it will be removed by
17225 later passes. */
17226 if (strong_zero_p)
17227 aarch64_gen_compare_reg (NE, rval, const0_rtx);
17228
17229 /* Emit any final barrier needed for a __sync operation. */
17230 if (is_mm_sync (model))
17231 aarch64_emit_post_barrier (model);
17232 }
17233
17234 /* Split an atomic operation. */
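/* For a typical operation such as an atomic add this expands to roughly:
     .label:
       ld[a]xr   old, [mem]
       add       new, old, value
       st[l]xr   scratch, new, [mem]
       cbnz      scratch, .label
   followed by any final barrier needed for a __sync operation.  */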
17235
17236 void
17237 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17238 rtx value, rtx model_rtx, rtx cond)
17239 {
17240 machine_mode mode = GET_MODE (mem);
17241 machine_mode wmode = (mode == DImode ? DImode : SImode);
17242 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17243 const bool is_sync = is_mm_sync (model);
17244 rtx_code_label *label;
17245 rtx x;
17246
17247 /* Split the atomic operation into a sequence. */
17248 label = gen_label_rtx ();
17249 emit_label (label);
17250
17251 if (new_out)
17252 new_out = gen_lowpart (wmode, new_out);
17253 if (old_out)
17254 old_out = gen_lowpart (wmode, old_out);
17255 else
17256 old_out = new_out;
17257 value = simplify_gen_subreg (wmode, value, mode, 0);
17258
17259 /* The initial load can be relaxed for a __sync operation since a final
17260 barrier will be emitted to stop code hoisting. */
17261 if (is_sync)
17262 aarch64_emit_load_exclusive (mode, old_out, mem,
17263 GEN_INT (MEMMODEL_RELAXED));
17264 else
17265 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17266
17267 switch (code)
17268 {
17269 case SET:
17270 new_out = value;
17271 break;
17272
17273 case NOT:
17274 x = gen_rtx_AND (wmode, old_out, value);
17275 emit_insn (gen_rtx_SET (new_out, x));
17276 x = gen_rtx_NOT (wmode, new_out);
17277 emit_insn (gen_rtx_SET (new_out, x));
17278 break;
17279
17280 case MINUS:
17281 if (CONST_INT_P (value))
17282 {
17283 value = GEN_INT (-INTVAL (value));
17284 code = PLUS;
17285 }
17286 /* Fall through. */
17287
17288 default:
17289 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17290 emit_insn (gen_rtx_SET (new_out, x));
17291 break;
17292 }
17293
17294 aarch64_emit_store_exclusive (mode, cond, mem,
17295 gen_lowpart (mode, new_out), model_rtx);
17296
17297 if (aarch64_track_speculation)
17298 {
17299 /* Emit an explicit compare instruction, so that we can correctly
17300 track the condition codes. */
17301 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17302 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17303 }
17304 else
17305 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17306
17307 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17308 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17309 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17310
17311 /* Emit any final barrier needed for a __sync operation. */
17312 if (is_sync)
17313 aarch64_emit_post_barrier (model);
17314 }
17315
17316 static void
17317 aarch64_init_libfuncs (void)
17318 {
17319 /* Half-precision float operations. The compiler handles all operations
17320 with NULL libfuncs by converting to SFmode. */
17321
17322 /* Conversions. */
17323 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17324 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17325
17326 /* Arithmetic. */
17327 set_optab_libfunc (add_optab, HFmode, NULL);
17328 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17329 set_optab_libfunc (smul_optab, HFmode, NULL);
17330 set_optab_libfunc (neg_optab, HFmode, NULL);
17331 set_optab_libfunc (sub_optab, HFmode, NULL);
17332
17333 /* Comparisons. */
17334 set_optab_libfunc (eq_optab, HFmode, NULL);
17335 set_optab_libfunc (ne_optab, HFmode, NULL);
17336 set_optab_libfunc (lt_optab, HFmode, NULL);
17337 set_optab_libfunc (le_optab, HFmode, NULL);
17338 set_optab_libfunc (ge_optab, HFmode, NULL);
17339 set_optab_libfunc (gt_optab, HFmode, NULL);
17340 set_optab_libfunc (unord_optab, HFmode, NULL);
17341 }
17342
17343 /* Target hook for c_mode_for_suffix. */
17344 static machine_mode
17345 aarch64_c_mode_for_suffix (char suffix)
17346 {
17347 if (suffix == 'q')
17348 return TFmode;
17349
17350 return VOIDmode;
17351 }
17352
17353 /* We can only represent floating point constants which will fit in
17354 "quarter-precision" values. These values are characterised by
17355 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17356 by:
17357
17358 (-1)^s * (n/16) * 2^r
17359
17360 Where:
17361 's' is the sign bit.
17362 'n' is an integer in the range 16 <= n <= 31.
17363 'r' is an integer in the range -3 <= r <= 4. */
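
/* For example, 1.0 (16/16 * 2^0), 0.125 (16/16 * 2^-3) and 31.0
   (31/16 * 2^4) are all representable, whereas 0.0, 0.1 and 1.0/3.0
   are not.  */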
17364
17365 /* Return true iff X can be represented by a quarter-precision
17366 floating point immediate operand. Note, we cannot represent 0.0. */
17367 bool
17368 aarch64_float_const_representable_p (rtx x)
17369 {
17370 /* This represents our current view of how many bits
17371 make up the mantissa. */
17372 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17373 int exponent;
17374 unsigned HOST_WIDE_INT mantissa, mask;
17375 REAL_VALUE_TYPE r, m;
17376 bool fail;
17377
17378 x = unwrap_const_vec_duplicate (x);
17379 if (!CONST_DOUBLE_P (x))
17380 return false;
17381
17382 if (GET_MODE (x) == VOIDmode
17383 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17384 return false;
17385
17386 r = *CONST_DOUBLE_REAL_VALUE (x);
17387
17388 /* We cannot represent infinities, NaNs or +/-zero. We won't
17389 know if we have +zero until we analyse the mantissa, but we
17390 can reject the other invalid values. */
17391 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17392 || REAL_VALUE_MINUS_ZERO (r))
17393 return false;
17394
17395 /* Extract exponent. */
17396 r = real_value_abs (&r);
17397 exponent = REAL_EXP (&r);
17398
17399 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17400 highest (sign) bit, with a fixed binary point at bit point_pos.
17401 The low element of W holds the low part of the mantissa, the high one the rest.
17402 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17403 bits for the mantissa, this can fail (low bits will be lost). */
17404 real_ldexp (&m, &r, point_pos - exponent);
17405 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17406
17407 /* If the low part of the mantissa has bits set we cannot represent
17408 the value. */
17409 if (w.ulow () != 0)
17410 return false;
17411 /* We have rejected the lower HOST_WIDE_INT, so update our
17412 understanding of how many bits lie in the mantissa and
17413 look only at the high HOST_WIDE_INT. */
17414 mantissa = w.elt (1);
17415 point_pos -= HOST_BITS_PER_WIDE_INT;
17416
17417 /* We can only represent values with a mantissa of the form 1.xxxx. */
17418 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17419 if ((mantissa & mask) != 0)
17420 return false;
17421
17422 /* Having filtered unrepresentable values, we may now remove all
17423 but the highest 5 bits. */
17424 mantissa >>= point_pos - 5;
17425
17426 /* We cannot represent the value 0.0, so reject it. This is handled
17427 elsewhere. */
17428 if (mantissa == 0)
17429 return false;
17430
17431 /* Then, as bit 4 is always set, we can mask it off, leaving
17432 the mantissa in the range [0, 15]. */
17433 mantissa &= ~(1 << 4);
17434 gcc_assert (mantissa <= 15);
17435
17436 /* GCC internally does not use IEEE754-like encoding (where normalized
17437 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17438 Our mantissa values are shifted 4 places to the left relative to
17439 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17440 by 5 places to correct for GCC's representation. */
17441 exponent = 5 - exponent;
17442
17443 return (exponent >= 0 && exponent <= 7);
17444 }
17445
17446 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17447 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17448 output MOVI/MVNI, ORR or BIC immediate. */
17449 char*
17450 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17451 enum simd_immediate_check which)
17452 {
17453 bool is_valid;
17454 static char templ[40];
17455 const char *mnemonic;
17456 const char *shift_op;
17457 unsigned int lane_count = 0;
17458 char element_char;
17459
17460 struct simd_immediate_info info;
17461
17462 /* This will return true to show const_vector is legal for use as either
17463 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17464 It will also update INFO to show how the immediate should be generated.
17465 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17466 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17467 gcc_assert (is_valid);
17468
17469 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17470 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17471
17472 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17473 {
17474 gcc_assert (info.insn == simd_immediate_info::MOV
17475 && info.u.mov.shift == 0);
17476 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17477 move immediate path. */
17478 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17479 info.u.mov.value = GEN_INT (0);
17480 else
17481 {
17482 const unsigned int buf_size = 20;
17483 char float_buf[buf_size] = {'\0'};
17484 real_to_decimal_for_mode (float_buf,
17485 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17486 buf_size, buf_size, 1, info.elt_mode);
17487
17488 if (lane_count == 1)
17489 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17490 else
17491 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17492 lane_count, element_char, float_buf);
17493 return templ;
17494 }
17495 }
17496
17497 gcc_assert (CONST_INT_P (info.u.mov.value));
17498
17499 if (which == AARCH64_CHECK_MOV)
17500 {
17501 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17502 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17503 ? "msl" : "lsl");
17504 if (lane_count == 1)
17505 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17506 mnemonic, UINTVAL (info.u.mov.value));
17507 else if (info.u.mov.shift)
17508 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17509 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17510 element_char, UINTVAL (info.u.mov.value), shift_op,
17511 info.u.mov.shift);
17512 else
17513 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17514 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17515 element_char, UINTVAL (info.u.mov.value));
17516 }
17517 else
17518 {
17519 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17520 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17521 if (info.u.mov.shift)
17522 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17523 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17524 element_char, UINTVAL (info.u.mov.value), "lsl",
17525 info.u.mov.shift);
17526 else
17527 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17528 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17529 element_char, UINTVAL (info.u.mov.value));
17530 }
17531 return templ;
17532 }
17533
17534 char*
17535 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17536 {
17537
17538 /* If a floating point number was passed and we desire to use it in an
17539 integer mode, do the conversion to integer. */
17540 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17541 {
17542 unsigned HOST_WIDE_INT ival;
17543 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17544 gcc_unreachable ();
17545 immediate = gen_int_mode (ival, mode);
17546 }
17547
17548 machine_mode vmode;
17549 /* Use a 64-bit container mode for everything except DI/DF mode, where we
17550 use a 128-bit vector mode. */
17551 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17552
17553 vmode = aarch64_simd_container_mode (mode, width);
17554 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17555 return aarch64_output_simd_mov_immediate (v_op, width);
17556 }
17557
17558 /* Return the output string to use for moving immediate CONST_VECTOR
17559 into an SVE register. */
17560
17561 char *
17562 aarch64_output_sve_mov_immediate (rtx const_vector)
17563 {
17564 static char templ[40];
17565 struct simd_immediate_info info;
17566 char element_char;
17567
17568 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17569 gcc_assert (is_valid);
17570
17571 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17572
17573 machine_mode vec_mode = GET_MODE (const_vector);
17574 if (aarch64_sve_pred_mode_p (vec_mode))
17575 {
17576 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17577 if (info.insn == simd_immediate_info::MOV)
17578 {
17579 gcc_assert (info.u.mov.value == const0_rtx);
17580 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17581 }
17582 else
17583 {
17584 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17585 unsigned int total_bytes;
17586 if (info.u.pattern == AARCH64_SV_ALL
17587 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17588 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17589 total_bytes / GET_MODE_SIZE (info.elt_mode));
17590 else
17591 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17592 svpattern_token (info.u.pattern));
17593 }
17594 return buf;
17595 }
17596
17597 if (info.insn == simd_immediate_info::INDEX)
17598 {
17599 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17600 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17601 element_char, INTVAL (info.u.index.base),
17602 INTVAL (info.u.index.step));
17603 return templ;
17604 }
17605
17606 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17607 {
17608 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17609 info.u.mov.value = GEN_INT (0);
17610 else
17611 {
17612 const int buf_size = 20;
17613 char float_buf[buf_size] = {};
17614 real_to_decimal_for_mode (float_buf,
17615 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17616 buf_size, buf_size, 1, info.elt_mode);
17617
17618 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17619 element_char, float_buf);
17620 return templ;
17621 }
17622 }
17623
17624 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17625 element_char, INTVAL (info.u.mov.value));
17626 return templ;
17627 }
17628
17629 /* Split operands into moves from op[1] + op[2] into op[0]. */
17630
17631 void
17632 aarch64_split_combinev16qi (rtx operands[3])
17633 {
17634 unsigned int dest = REGNO (operands[0]);
17635 unsigned int src1 = REGNO (operands[1]);
17636 unsigned int src2 = REGNO (operands[2]);
17637 machine_mode halfmode = GET_MODE (operands[1]);
17638 unsigned int halfregs = REG_NREGS (operands[1]);
17639 rtx destlo, desthi;
17640
17641 gcc_assert (halfmode == V16QImode);
17642
17643 if (src1 == dest && src2 == dest + halfregs)
17644 {
17645 /* No-op move. Can't split to nothing; emit something. */
17646 emit_note (NOTE_INSN_DELETED);
17647 return;
17648 }
17649
17650 /* Preserve register attributes for variable tracking. */
17651 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17652 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17653 GET_MODE_SIZE (halfmode));
17654
17655 /* Special case of reversed high/low parts. */
17656 if (reg_overlap_mentioned_p (operands[2], destlo)
17657 && reg_overlap_mentioned_p (operands[1], desthi))
17658 {
17659 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17660 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17661 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17662 }
17663 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17664 {
17665 /* Try to avoid unnecessary moves if part of the result
17666 is in the right place already. */
17667 if (src1 != dest)
17668 emit_move_insn (destlo, operands[1]);
17669 if (src2 != dest + halfregs)
17670 emit_move_insn (desthi, operands[2]);
17671 }
17672 else
17673 {
17674 if (src2 != dest + halfregs)
17675 emit_move_insn (desthi, operands[2]);
17676 if (src1 != dest)
17677 emit_move_insn (destlo, operands[1]);
17678 }
17679 }
17680
17681 /* vec_perm support. */
17682
17683 struct expand_vec_perm_d
17684 {
17685 rtx target, op0, op1;
17686 vec_perm_indices perm;
17687 machine_mode vmode;
17688 unsigned int vec_flags;
17689 bool one_vector_p;
17690 bool testing_p;
17691 };
17692
17693 /* Generate a variable permutation. */
17694
17695 static void
17696 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17697 {
17698 machine_mode vmode = GET_MODE (target);
17699 bool one_vector_p = rtx_equal_p (op0, op1);
17700
17701 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17702 gcc_checking_assert (GET_MODE (op0) == vmode);
17703 gcc_checking_assert (GET_MODE (op1) == vmode);
17704 gcc_checking_assert (GET_MODE (sel) == vmode);
17705 gcc_checking_assert (TARGET_SIMD);
17706
17707 if (one_vector_p)
17708 {
17709 if (vmode == V8QImode)
17710 {
17711 /* Expand the argument to a V16QI mode by duplicating it. */
17712 rtx pair = gen_reg_rtx (V16QImode);
17713 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17714 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17715 }
17716 else
17717 {
17718 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17719 }
17720 }
17721 else
17722 {
17723 rtx pair;
17724
17725 if (vmode == V8QImode)
17726 {
17727 pair = gen_reg_rtx (V16QImode);
17728 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17729 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17730 }
17731 else
17732 {
17733 pair = gen_reg_rtx (OImode);
17734 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17735 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17736 }
17737 }
17738 }
17739
17740 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17741 NELT is the number of elements in the vector. */
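/* As an illustration (not taken from the source), with two V16QI inputs the
   selector is first masked with 2 * 16 - 1 = 31, so an out-of-range index
   such as 35 selects element 35 & 31 = 3 of the concatenated inputs,
   matching the modulo semantics that vec_perm requires.  */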
17742
17743 void
17744 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17745 unsigned int nelt)
17746 {
17747 machine_mode vmode = GET_MODE (target);
17748 bool one_vector_p = rtx_equal_p (op0, op1);
17749 rtx mask;
17750
17751 /* The TBL instruction does not use a modulo index, so we must take care
17752 of that ourselves. */
17753 mask = aarch64_simd_gen_const_vector_dup (vmode,
17754 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17755 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17756
17757 /* For big-endian, we also need to reverse the index within the vector
17758 (but not which vector). */
17759 if (BYTES_BIG_ENDIAN)
17760 {
17761 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17762 if (!one_vector_p)
17763 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17764 sel = expand_simple_binop (vmode, XOR, sel, mask,
17765 NULL, 0, OPTAB_LIB_WIDEN);
17766 }
17767 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17768 }
17769
17770 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17771
17772 static void
17773 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17774 {
17775 emit_insn (gen_rtx_SET (target,
17776 gen_rtx_UNSPEC (GET_MODE (target),
17777 gen_rtvec (2, op0, op1), code)));
17778 }
17779
17780 /* Expand an SVE vec_perm with the given operands. */
17781
17782 void
17783 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17784 {
17785 machine_mode data_mode = GET_MODE (target);
17786 machine_mode sel_mode = GET_MODE (sel);
17787 /* Enforced by the pattern condition. */
17788 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17789
17790 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17791 size of the two value vectors, i.e. the upper bits of the indices
17792 are effectively ignored. SVE TBL instead produces 0 for any
17793 out-of-range indices, so we need to modulo all the vec_perm indices
17794 to ensure they are all in range. */
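/* As an illustration, with two 4-element inputs the vec_perm index 9
   should select element 9 % 8 = 1 of the first input, whereas an
   unadjusted SVE TBL would treat 9 as out of range and write 0.  */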
17795 rtx sel_reg = force_reg (sel_mode, sel);
17796
17797 /* Check if SEL only references the first values vector. */
17798 if (GET_CODE (sel) == CONST_VECTOR
17799 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17800 {
17801 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17802 return;
17803 }
17804
17805 /* Check if the two values vectors are the same. */
17806 if (rtx_equal_p (op0, op1))
17807 {
17808 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17809 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17810 NULL, 0, OPTAB_DIRECT);
17811 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17812 return;
17813 }
17814
17815 /* Run TBL on each values vector and combine the results. */
17816
17817 rtx res0 = gen_reg_rtx (data_mode);
17818 rtx res1 = gen_reg_rtx (data_mode);
17819 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17820 if (GET_CODE (sel) != CONST_VECTOR
17821 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17822 {
17823 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17824 2 * nunits - 1);
17825 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17826 NULL, 0, OPTAB_DIRECT);
17827 }
17828 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17829 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17830 NULL, 0, OPTAB_DIRECT);
17831 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17832 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17833 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17834 else
17835 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17836 }
17837
17838 /* Recognize patterns suitable for the TRN instructions. */
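/* As an illustration, for two 4-element inputs {a0,a1,a2,a3} and
   {b0,b1,b2,b3}, TRN1 corresponds to the permutation {0, 4, 2, 6}
   (giving {a0,b0,a2,b2}) and TRN2 to {1, 5, 3, 7}.  */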
17839 static bool
17840 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17841 {
17842 HOST_WIDE_INT odd;
17843 poly_uint64 nelt = d->perm.length ();
17844 rtx out, in0, in1, x;
17845 machine_mode vmode = d->vmode;
17846
17847 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17848 return false;
17849
17850 /* Note that these are little-endian tests.
17851 We correct for big-endian later. */
17852 if (!d->perm[0].is_constant (&odd)
17853 || (odd != 0 && odd != 1)
17854 || !d->perm.series_p (0, 2, odd, 2)
17855 || !d->perm.series_p (1, 2, nelt + odd, 2))
17856 return false;
17857
17858 /* Success! */
17859 if (d->testing_p)
17860 return true;
17861
17862 in0 = d->op0;
17863 in1 = d->op1;
17864 /* We don't need a big-endian lane correction for SVE; see the comment
17865 at the head of aarch64-sve.md for details. */
17866 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17867 {
17868 x = in0, in0 = in1, in1 = x;
17869 odd = !odd;
17870 }
17871 out = d->target;
17872
17873 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17874 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17875 return true;
17876 }
17877
17878 /* Recognize patterns suitable for the UZP instructions. */
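/* As an illustration, for two 4-element inputs UZP1 corresponds to the
   permutation {0, 2, 4, 6} (the even-indexed elements of the concatenated
   inputs) and UZP2 to {1, 3, 5, 7}.  */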
17879 static bool
17880 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17881 {
17882 HOST_WIDE_INT odd;
17883 rtx out, in0, in1, x;
17884 machine_mode vmode = d->vmode;
17885
17886 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17887 return false;
17888
17889 /* Note that these are little-endian tests.
17890 We correct for big-endian later. */
17891 if (!d->perm[0].is_constant (&odd)
17892 || (odd != 0 && odd != 1)
17893 || !d->perm.series_p (0, 1, odd, 2))
17894 return false;
17895
17896 /* Success! */
17897 if (d->testing_p)
17898 return true;
17899
17900 in0 = d->op0;
17901 in1 = d->op1;
17902 /* We don't need a big-endian lane correction for SVE; see the comment
17903 at the head of aarch64-sve.md for details. */
17904 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17905 {
17906 x = in0, in0 = in1, in1 = x;
17907 odd = !odd;
17908 }
17909 out = d->target;
17910
17911 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17912 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17913 return true;
17914 }
17915
17916 /* Recognize patterns suitable for the ZIP instructions. */
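/* As an illustration, for two 4-element inputs ZIP1 corresponds to the
   permutation {0, 4, 1, 5} (interleaving the low halves) and ZIP2 to
   {2, 6, 3, 7} (interleaving the high halves).  */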
17917 static bool
17918 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17919 {
17920 unsigned int high;
17921 poly_uint64 nelt = d->perm.length ();
17922 rtx out, in0, in1, x;
17923 machine_mode vmode = d->vmode;
17924
17925 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17926 return false;
17927
17928 /* Note that these are little-endian tests.
17929 We correct for big-endian later. */
17930 poly_uint64 first = d->perm[0];
17931 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17932 || !d->perm.series_p (0, 2, first, 1)
17933 || !d->perm.series_p (1, 2, first + nelt, 1))
17934 return false;
17935 high = maybe_ne (first, 0U);
17936
17937 /* Success! */
17938 if (d->testing_p)
17939 return true;
17940
17941 in0 = d->op0;
17942 in1 = d->op1;
17943 /* We don't need a big-endian lane correction for SVE; see the comment
17944 at the head of aarch64-sve.md for details. */
17945 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17946 {
17947 x = in0, in0 = in1, in1 = x;
17948 high = !high;
17949 }
17950 out = d->target;
17951
17952 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17953 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17954 return true;
17955 }
17956
17957 /* Recognize patterns for the EXT insn. */
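/* As an illustration, for 4-element inputs the permutation {1, 2, 3, 4}
   takes the last three elements of the first vector followed by the first
   element of the second, which EXT implements with an element offset
   of 1.  */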
17958
17959 static bool
17960 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17961 {
17962 HOST_WIDE_INT location;
17963 rtx offset;
17964
17965 /* The first element always refers to the first vector.
17966 Check if the extracted indices are increasing by one. */
17967 if (d->vec_flags == VEC_SVE_PRED
17968 || !d->perm[0].is_constant (&location)
17969 || !d->perm.series_p (0, 1, location, 1))
17970 return false;
17971
17972 /* Success! */
17973 if (d->testing_p)
17974 return true;
17975
17976 /* The case where (location == 0) is a no-op for both big- and little-endian,
17977 and is removed by the mid-end at optimization levels -O1 and higher.
17978
17979 We don't need a big-endian lane correction for SVE; see the comment
17980 at the head of aarch64-sve.md for details. */
17981 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17982 {
17983 /* After setup, we want the high elements of the first vector (stored
17984 at the LSB end of the register), and the low elements of the second
17985 vector (stored at the MSB end of the register). So swap. */
17986 std::swap (d->op0, d->op1);
17987 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17988 to_constant () is safe since this is restricted to Advanced SIMD
17989 vectors. */
17990 location = d->perm.length ().to_constant () - location;
17991 }
17992
17993 offset = GEN_INT (location);
17994 emit_set_insn (d->target,
17995 gen_rtx_UNSPEC (d->vmode,
17996 gen_rtvec (3, d->op0, d->op1, offset),
17997 UNSPEC_EXT));
17998 return true;
17999 }
18000
18001 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
18002 within each 64-bit, 32-bit or 16-bit granule. */
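/* As an illustration, a V8HI permutation of {1,0, 3,2, 5,4, 7,6} swaps the
   two 16-bit elements within each 32-bit granule and so maps to REV32.  */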
18003
18004 static bool
18005 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
18006 {
18007 HOST_WIDE_INT diff;
18008 unsigned int i, size, unspec;
18009 machine_mode pred_mode;
18010
18011 if (d->vec_flags == VEC_SVE_PRED
18012 || !d->one_vector_p
18013 || !d->perm[0].is_constant (&diff))
18014 return false;
18015
18016 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
18017 if (size == 8)
18018 {
18019 unspec = UNSPEC_REV64;
18020 pred_mode = VNx2BImode;
18021 }
18022 else if (size == 4)
18023 {
18024 unspec = UNSPEC_REV32;
18025 pred_mode = VNx4BImode;
18026 }
18027 else if (size == 2)
18028 {
18029 unspec = UNSPEC_REV16;
18030 pred_mode = VNx8BImode;
18031 }
18032 else
18033 return false;
18034
18035 unsigned int step = diff + 1;
18036 for (i = 0; i < step; ++i)
18037 if (!d->perm.series_p (i, step, diff - i, step))
18038 return false;
18039
18040 /* Success! */
18041 if (d->testing_p)
18042 return true;
18043
18044 if (d->vec_flags == VEC_SVE_DATA)
18045 {
18046 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
18047 rtx target = gen_reg_rtx (int_mode);
18048 if (BYTES_BIG_ENDIAN)
18049 /* The act of taking a subreg between INT_MODE and d->vmode
18050 is itself a reversing operation on big-endian targets;
18051 see the comment at the head of aarch64-sve.md for details.
18052 First reinterpret OP0 as INT_MODE without using a subreg
18053 and without changing the contents. */
18054 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
18055 else
18056 {
18057 /* For SVE we use REV[BHW] unspecs derived from the element size
18058 of d->vmode and vector modes whose elements have SIZE bytes.
18059 This ensures that the vector modes match the predicate modes. */
18060 int unspec = aarch64_sve_rev_unspec (d->vmode);
18061 rtx pred = aarch64_ptrue_reg (pred_mode);
18062 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
18063 gen_lowpart (int_mode, d->op0)));
18064 }
18065 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18066 return true;
18067 }
18068 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
18069 emit_set_insn (d->target, src);
18070 return true;
18071 }
18072
18073 /* Recognize patterns for the REV insn, which reverses elements within
18074 a full vector. */
18075
18076 static bool
18077 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
18078 {
18079 poly_uint64 nelt = d->perm.length ();
18080
18081 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
18082 return false;
18083
18084 if (!d->perm.series_p (0, 1, nelt - 1, -1))
18085 return false;
18086
18087 /* Success! */
18088 if (d->testing_p)
18089 return true;
18090
18091 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
18092 emit_set_insn (d->target, src);
18093 return true;
18094 }
18095
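/* Recognize permutations that broadcast a single element and so can be
   implemented as a DUP (lane) instruction, e.g. {2, 2, 2, 2} for a
   4-element vector.  */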
18096 static bool
18097 aarch64_evpc_dup (struct expand_vec_perm_d *d)
18098 {
18099 rtx out = d->target;
18100 rtx in0;
18101 HOST_WIDE_INT elt;
18102 machine_mode vmode = d->vmode;
18103 rtx lane;
18104
18105 if (d->vec_flags == VEC_SVE_PRED
18106 || d->perm.encoding ().encoded_nelts () != 1
18107 || !d->perm[0].is_constant (&elt))
18108 return false;
18109
18110 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
18111 return false;
18112
18113 /* Success! */
18114 if (d->testing_p)
18115 return true;
18116
18117 /* The generic preparation in aarch64_expand_vec_perm_const_1
18118 swaps the operand order and the permute indices if it finds
18119 d->perm[0] to be in the second operand. Thus, we can always
18120 use d->op0 and need not do any extra arithmetic to get the
18121 correct lane number. */
18122 in0 = d->op0;
18123 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
18124
18125 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
18126 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
18127 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
18128 return true;
18129 }
18130
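/* Try to implement D using an Advanced SIMD TBL instruction with a
   constant selector.  */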
18131 static bool
18132 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
18133 {
18134 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
18135 machine_mode vmode = d->vmode;
18136
18137 /* Make sure that the indices are constant. */
18138 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
18139 for (unsigned int i = 0; i < encoded_nelts; ++i)
18140 if (!d->perm[i].is_constant ())
18141 return false;
18142
18143 if (d->testing_p)
18144 return true;
18145
18146 /* Generic code will try constant permutation twice: once with the
18147 original mode and again with the elements lowered to QImode.
18148 So wait and don't do the selector expansion ourselves. */
18149 if (vmode != V8QImode && vmode != V16QImode)
18150 return false;
18151
18152 /* to_constant is safe since this routine is specific to Advanced SIMD
18153 vectors. */
18154 unsigned int nelt = d->perm.length ().to_constant ();
18155 for (unsigned int i = 0; i < nelt; ++i)
18156 /* If big-endian and two vectors, we end up with a weird mixed-endian
18157 mode on NEON. Reverse the index within each word but not the word
18158 itself. to_constant is safe because we checked is_constant above. */
18159 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
18160 ? d->perm[i].to_constant () ^ (nelt - 1)
18161 : d->perm[i].to_constant ());
18162
18163 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18164 sel = force_reg (vmode, sel);
18165
18166 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
18167 return true;
18168 }
18169
18170 /* Try to implement D using an SVE TBL instruction. */
18171
18172 static bool
18173 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
18174 {
18175 unsigned HOST_WIDE_INT nelt;
18176
18177 /* Permuting two variable-length vectors could overflow the
18178 index range. */
18179 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
18180 return false;
18181
18182 if (d->testing_p)
18183 return true;
18184
18185 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
18186 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
18187 if (d->one_vector_p)
18188 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
18189 else
18190 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
18191 return true;
18192 }
18193
18194 /* Try to implement D using the SVE SEL instruction. */
18195
18196 static bool
18197 aarch64_evpc_sel (struct expand_vec_perm_d *d)
18198 {
18199 machine_mode vmode = d->vmode;
18200 int unit_size = GET_MODE_UNIT_SIZE (vmode);
18201
18202 if (d->vec_flags != VEC_SVE_DATA
18203 || unit_size > 8)
18204 return false;
18205
18206 int n_patterns = d->perm.encoding ().npatterns ();
18207 poly_int64 vec_len = d->perm.length ();
18208
18209 for (int i = 0; i < n_patterns; ++i)
18210 if (!known_eq (d->perm[i], i)
18211 && !known_eq (d->perm[i], vec_len + i))
18212 return false;
18213
18214 for (int i = n_patterns; i < n_patterns * 2; i++)
18215 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
18216 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
18217 return false;
18218
18219 if (d->testing_p)
18220 return true;
18221
18222 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
18223
18224 rtx_vector_builder builder (pred_mode, n_patterns, 2);
18225 for (int i = 0; i < n_patterns * 2; i++)
18226 {
18227 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
18228 : CONST0_RTX (BImode);
18229 builder.quick_push (elem);
18230 }
18231
18232 rtx const_vec = builder.build ();
18233 rtx pred = force_reg (pred_mode, const_vec);
18234 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
18235 return true;
18236 }
18237
18238 static bool
18239 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18240 {
18241 /* The pattern matching functions above are written to look for a small
18242 number to begin the sequence (0, 1, N/2). If we begin with an index
18243 from the second operand, we can swap the operands. */
18244 poly_int64 nelt = d->perm.length ();
18245 if (known_ge (d->perm[0], nelt))
18246 {
18247 d->perm.rotate_inputs (1);
18248 std::swap (d->op0, d->op1);
18249 }
18250
18251 if ((d->vec_flags == VEC_ADVSIMD
18252 || d->vec_flags == VEC_SVE_DATA
18253 || d->vec_flags == VEC_SVE_PRED)
18254 && known_gt (nelt, 1))
18255 {
18256 if (aarch64_evpc_rev_local (d))
18257 return true;
18258 else if (aarch64_evpc_rev_global (d))
18259 return true;
18260 else if (aarch64_evpc_ext (d))
18261 return true;
18262 else if (aarch64_evpc_dup (d))
18263 return true;
18264 else if (aarch64_evpc_zip (d))
18265 return true;
18266 else if (aarch64_evpc_uzp (d))
18267 return true;
18268 else if (aarch64_evpc_trn (d))
18269 return true;
18270 else if (aarch64_evpc_sel (d))
18271 return true;
18272 if (d->vec_flags == VEC_SVE_DATA)
18273 return aarch64_evpc_sve_tbl (d);
18274 else if (d->vec_flags == VEC_ADVSIMD)
18275 return aarch64_evpc_tbl (d);
18276 }
18277 return false;
18278 }
18279
18280 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18281
18282 static bool
18283 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18284 rtx op1, const vec_perm_indices &sel)
18285 {
18286 struct expand_vec_perm_d d;
18287
18288 /* Check whether the mask can be applied to a single vector. */
18289 if (sel.ninputs () == 1
18290 || (op0 && rtx_equal_p (op0, op1)))
18291 d.one_vector_p = true;
18292 else if (sel.all_from_input_p (0))
18293 {
18294 d.one_vector_p = true;
18295 op1 = op0;
18296 }
18297 else if (sel.all_from_input_p (1))
18298 {
18299 d.one_vector_p = true;
18300 op0 = op1;
18301 }
18302 else
18303 d.one_vector_p = false;
18304
18305 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18306 sel.nelts_per_input ());
18307 d.vmode = vmode;
18308 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18309 d.target = target;
18310 d.op0 = op0;
18311 d.op1 = op1;
18312 d.testing_p = !target;
18313
18314 if (!d.testing_p)
18315 return aarch64_expand_vec_perm_const_1 (&d);
18316
18317 rtx_insn *last = get_last_insn ();
18318 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18319 gcc_assert (last == get_last_insn ());
18320
18321 return ret;
18322 }
18323
18324 /* Generate a byte permute mask for a register of mode MODE,
18325 which has NUNITS units. */
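/* For example (illustrative), for V8HImode the generated byte mask is
   { 1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14 }, i.e. the bytes of each
   16-bit unit are swapped while the units themselves stay in place.  */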
18326
18327 rtx
18328 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18329 {
18330 /* We have to reverse each vector because we don't have
18331 a permuted load that can reverse-load according to ABI rules. */
18332 rtx mask;
18333 rtvec v = rtvec_alloc (16);
18334 unsigned int i, j;
18335 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18336
18337 gcc_assert (BYTES_BIG_ENDIAN);
18338 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18339
18340 for (i = 0; i < nunits; i++)
18341 for (j = 0; j < usize; j++)
18342 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18343 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18344 return force_reg (V16QImode, mask);
18345 }
18346
18347 /* Expand an SVE integer comparison using the SVE equivalent of:
18348
18349 (set TARGET (CODE OP0 OP1)). */
18350
18351 void
18352 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18353 {
18354 machine_mode pred_mode = GET_MODE (target);
18355 machine_mode data_mode = GET_MODE (op0);
18356 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18357 op0, op1);
18358 if (!rtx_equal_p (target, res))
18359 emit_move_insn (target, res);
18360 }
18361
18362 /* Return the UNSPEC_COND_* code for comparison CODE. */
18363
18364 static unsigned int
18365 aarch64_unspec_cond_code (rtx_code code)
18366 {
18367 switch (code)
18368 {
18369 case NE:
18370 return UNSPEC_COND_FCMNE;
18371 case EQ:
18372 return UNSPEC_COND_FCMEQ;
18373 case LT:
18374 return UNSPEC_COND_FCMLT;
18375 case GT:
18376 return UNSPEC_COND_FCMGT;
18377 case LE:
18378 return UNSPEC_COND_FCMLE;
18379 case GE:
18380 return UNSPEC_COND_FCMGE;
18381 case UNORDERED:
18382 return UNSPEC_COND_FCMUO;
18383 default:
18384 gcc_unreachable ();
18385 }
18386 }
18387
18388 /* Emit:
18389
18390 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18391
18392 where <X> is the operation associated with comparison CODE.
18393 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18394
18395 static void
18396 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18397 bool known_ptrue_p, rtx op0, rtx op1)
18398 {
18399 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18400 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18401 gen_rtvec (4, pred, flag, op0, op1),
18402 aarch64_unspec_cond_code (code));
18403 emit_set_insn (target, unspec);
18404 }
18405
18406 /* Emit the SVE equivalent of:
18407
18408 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18409 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18410 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18411
18412 where <Xi> is the operation associated with comparison CODEi.
18413 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18414
18415 static void
18416 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18417 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18418 {
18419 machine_mode pred_mode = GET_MODE (pred);
18420 rtx tmp1 = gen_reg_rtx (pred_mode);
18421 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18422 rtx tmp2 = gen_reg_rtx (pred_mode);
18423 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18424 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18425 }
18426
18427 /* Emit the SVE equivalent of:
18428
18429 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18430 (set TARGET (not TMP))
18431
18432 where <X> is the operation associated with comparison CODE.
18433 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18434
18435 static void
18436 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18437 bool known_ptrue_p, rtx op0, rtx op1)
18438 {
18439 machine_mode pred_mode = GET_MODE (pred);
18440 rtx tmp = gen_reg_rtx (pred_mode);
18441 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18442 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18443 }
18444
18445 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18446
18447 (set TARGET (CODE OP0 OP1))
18448
18449 If CAN_INVERT_P is true, the caller can also handle inverted results;
18450 return true if the result is in fact inverted. */
18451
18452 bool
18453 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18454 rtx op0, rtx op1, bool can_invert_p)
18455 {
18456 machine_mode pred_mode = GET_MODE (target);
18457 machine_mode data_mode = GET_MODE (op0);
18458
18459 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18460 switch (code)
18461 {
18462 case UNORDERED:
18463 /* UNORDERED has no immediate form. */
18464 op1 = force_reg (data_mode, op1);
18465 /* fall through */
18466 case LT:
18467 case LE:
18468 case GT:
18469 case GE:
18470 case EQ:
18471 case NE:
18472 {
18473 /* There is native support for the comparison. */
18474 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18475 return false;
18476 }
18477
18478 case LTGT:
18479 /* This is a trapping operation (LT or GT). */
18480 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18481 return false;
18482
18483 case UNEQ:
18484 if (!flag_trapping_math)
18485 {
18486 /* This would trap for signaling NaNs. */
18487 op1 = force_reg (data_mode, op1);
18488 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18489 ptrue, true, op0, op1);
18490 return false;
18491 }
18492 /* fall through */
18493 case UNLT:
18494 case UNLE:
18495 case UNGT:
18496 case UNGE:
18497 if (flag_trapping_math)
18498 {
18499 /* Work out which elements are ordered. */
18500 rtx ordered = gen_reg_rtx (pred_mode);
18501 op1 = force_reg (data_mode, op1);
18502 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18503 ptrue, true, op0, op1);
18504
18505 /* Test the opposite condition for the ordered elements,
18506 then invert the result. */
18507 if (code == UNEQ)
18508 code = NE;
18509 else
18510 code = reverse_condition_maybe_unordered (code);
18511 if (can_invert_p)
18512 {
18513 aarch64_emit_sve_fp_cond (target, code,
18514 ordered, false, op0, op1);
18515 return true;
18516 }
18517 aarch64_emit_sve_invert_fp_cond (target, code,
18518 ordered, false, op0, op1);
18519 return false;
18520 }
18521 break;
18522
18523 case ORDERED:
18524 /* ORDERED has no immediate form. */
18525 op1 = force_reg (data_mode, op1);
18526 break;
18527
18528 default:
18529 gcc_unreachable ();
18530 }
18531
18532 /* There is native support for the inverse comparison. */
18533 code = reverse_condition_maybe_unordered (code);
18534 if (can_invert_p)
18535 {
18536 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18537 return true;
18538 }
18539 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18540 return false;
18541 }
18542
18543 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18544 of the data being selected and CMP_MODE is the mode of the values being
18545 compared. */
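/* OPS follows the vcond pattern: ops[0] is the destination, ops[1] and
   ops[2] are the values selected for "true" and "false" elements, and
   ops[3] is the comparison applied to ops[4] and ops[5].  */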
18546
18547 void
18548 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18549 rtx *ops)
18550 {
18551 machine_mode pred_mode
18552 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18553 GET_MODE_SIZE (cmp_mode)).require ();
18554 rtx pred = gen_reg_rtx (pred_mode);
18555 if (FLOAT_MODE_P (cmp_mode))
18556 {
18557 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18558 ops[4], ops[5], true))
18559 std::swap (ops[1], ops[2]);
18560 }
18561 else
18562 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18563
18564 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18565 ops[1] = force_reg (data_mode, ops[1]);
18566 /* The "false" value can only be zero if the "true" value is a constant. */
18567 if (register_operand (ops[1], data_mode)
18568 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18569 ops[2] = force_reg (data_mode, ops[2]);
18570
18571 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18572 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18573 }
18574
18575 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18576 true. However, due to issues with register allocation, it is preferable
18577 to avoid tying integer scalar and FP scalar modes. Executing integer
18578 operations in general registers is better than treating them as scalar
18579 vector operations. This reduces latency and avoids redundant int<->FP
18580 moves. So tie modes if they are either the same class, or vector modes
18581 with other vector modes, vector structs or any scalar mode. */
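/* For instance (illustrative), SImode and SFmode are deliberately not tied,
   while V4SImode ties with V2DFmode (both vector data modes) and DImode
   ties with V2DImode (scalar with vector).  */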
18582
18583 static bool
18584 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18585 {
18586 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18587 return true;
18588
18589 /* We specifically want to allow elements of "structure" modes to
18590 be tieable to the structure. This more general condition allows
18591 other rarer situations too. The reason we don't extend this to
18592 predicate modes is that there are no predicate structure modes
18593 nor any specific instructions for extracting part of a predicate
18594 register. */
18595 if (aarch64_vector_data_mode_p (mode1)
18596 && aarch64_vector_data_mode_p (mode2))
18597 return true;
18598
18599 /* Also allow any scalar modes with vectors. */
18600 if (aarch64_vector_mode_supported_p (mode1)
18601 || aarch64_vector_mode_supported_p (mode2))
18602 return true;
18603
18604 return false;
18605 }
18606
18607 /* Return a new RTX holding the result of moving POINTER forward by
18608 AMOUNT bytes. */
18609
18610 static rtx
18611 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18612 {
18613 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18614
18615 return adjust_automodify_address (pointer, GET_MODE (pointer),
18616 next, amount);
18617 }
18618
18619 /* Return a new RTX holding the result of moving POINTER forward by the
18620 size of the mode it points to. */
18621
18622 static rtx
18623 aarch64_progress_pointer (rtx pointer)
18624 {
18625 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18626 }
18627
18628 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST by
18629 the size of MODE. */
18630
18631 static void
18632 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18633 machine_mode mode)
18634 {
18635 rtx reg = gen_reg_rtx (mode);
18636
18637 /* "Cast" the pointers to the correct mode. */
18638 *src = adjust_address (*src, mode, 0);
18639 *dst = adjust_address (*dst, mode, 0);
18640 /* Emit the memcpy. */
18641 emit_move_insn (reg, *src);
18642 emit_move_insn (*dst, reg);
18643 /* Move the pointers forward. */
18644 *src = aarch64_progress_pointer (*src);
18645 *dst = aarch64_progress_pointer (*dst);
18646 }
18647
18648 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18649 we succeed, otherwise return false. */
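/* For example (illustrative), a constant 15-byte copy is expanded as an
   8-byte load/store at offset 0 followed by an overlapping 8-byte
   load/store at offset 7, rather than 8 + 4 + 2 + 1 byte moves.  */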
18650
18651 bool
18652 aarch64_expand_cpymem (rtx *operands)
18653 {
18654 int n, mode_bits;
18655 rtx dst = operands[0];
18656 rtx src = operands[1];
18657 rtx base;
18658 machine_mode cur_mode = BLKmode, next_mode;
18659 bool speed_p = !optimize_function_for_size_p (cfun);
18660
18661 /* When optimizing for size, give a better estimate of the length of a
18662 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18663 will always require an even number of instructions. And since each
18664 operation requires both a load and a store, divide the maximum number by 2. */
18665 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18666
18667 /* We can't do anything smart if the amount to copy is not constant. */
18668 if (!CONST_INT_P (operands[2]))
18669 return false;
18670
18671 n = INTVAL (operands[2]);
18672
18673 /* Try to keep the number of instructions low. For all cases we will do at
18674 most two moves for the residual amount, since we'll always overlap the
18675 remainder. */
18676 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18677 return false;
18678
18679 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18680 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18681
18682 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18683 src = adjust_automodify_address (src, VOIDmode, base, 0);
18684
18685 /* Convert n to bits to make the rest of the code simpler. */
18686 n = n * BITS_PER_UNIT;
18687
18688 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18689 larger than TImode, but we should not use them for loads/stores here. */
18690 const int copy_limit = GET_MODE_BITSIZE (TImode);
18691
18692 while (n > 0)
18693 {
18694 /* Find the largest mode in which to do the copy without over-reading
18695 or over-writing. */
18696 opt_scalar_int_mode mode_iter;
18697 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18698 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18699 cur_mode = mode_iter.require ();
18700
18701 gcc_assert (cur_mode != BLKmode);
18702
18703 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18704 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18705
18706 n -= mode_bits;
18707
18708 /* Do certain trailing copies as overlapping if that is going to be
18709 cheaper, i.e. if it takes fewer instructions. For instance, for a 15-byte
18710 copy it is more efficient to do two overlapping 8-byte copies than
18711 8 + 6 + 1. */
18712 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18713 {
18714 next_mode = smallest_mode_for_size (n, MODE_INT);
18715 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18716 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18717 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18718 n = n_bits;
18719 }
18720 }
18721
18722 return true;
18723 }
18724
18725 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18726 SImode stores. Handle the case when the constant has identical
18727 bottom and top halves. This is beneficial when the two stores can be
18728 merged into an STP and we avoid synthesising potentially expensive
18729 immediates twice. Return true if such a split is possible. */
18730
18731 bool
18732 aarch64_split_dimode_const_store (rtx dst, rtx src)
18733 {
18734 rtx lo = gen_lowpart (SImode, src);
18735 rtx hi = gen_highpart_mode (SImode, DImode, src);
18736
18737 bool size_p = optimize_function_for_size_p (cfun);
18738
18739 if (!rtx_equal_p (lo, hi))
18740 return false;
18741
18742 unsigned int orig_cost
18743 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18744 unsigned int lo_cost
18745 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18746
18747 /* We want to transform:
18748 MOV x1, 49370
18749 MOVK x1, 0x140, lsl 16
18750 MOVK x1, 0xc0da, lsl 32
18751 MOVK x1, 0x140, lsl 48
18752 STR x1, [x0]
18753 into:
18754 MOV w1, 49370
18755 MOVK w1, 0x140, lsl 16
18756 STP w1, w1, [x0]
18757 So we want to perform this only when we save two instructions
18758 or more. When optimizing for size, however, accept any code size
18759 savings we can. */
18760 if (size_p && orig_cost <= lo_cost)
18761 return false;
18762
18763 if (!size_p
18764 && (orig_cost <= lo_cost + 1))
18765 return false;
18766
18767 rtx mem_lo = adjust_address (dst, SImode, 0);
18768 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18769 return false;
18770
18771 rtx tmp_reg = gen_reg_rtx (SImode);
18772 aarch64_expand_mov_immediate (tmp_reg, lo);
18773 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18774 /* Don't emit an explicit store pair as this may not be always profitable.
18775 Let the sched-fusion logic decide whether to merge them. */
18776 emit_move_insn (mem_lo, tmp_reg);
18777 emit_move_insn (mem_hi, tmp_reg);
18778
18779 return true;
18780 }
18781
18782 /* Generate RTL for a conditional branch with rtx comparison CODE in
18783 mode CC_MODE. The destination of the unlikely conditional branch
18784 is LABEL_REF. */
18785
18786 void
18787 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18788 rtx label_ref)
18789 {
18790 rtx x;
18791 x = gen_rtx_fmt_ee (code, VOIDmode,
18792 gen_rtx_REG (cc_mode, CC_REGNUM),
18793 const0_rtx);
18794
18795 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18796 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18797 pc_rtx);
18798 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18799 }
18800
18801 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18802
18803 OP1 represents the TImode source operand 1
18804 OP2 represents the TImode source operand 2
18805 LOW_DEST represents the low half (DImode) of TImode operand 0
18806 LOW_IN1 represents the low half (DImode) of TImode operand 1
18807 LOW_IN2 represents the low half (DImode) of TImode operand 2
18808 HIGH_DEST represents the high half (DImode) of TImode operand 0
18809 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18810 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18811
18812 void
18813 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18814 rtx *low_in1, rtx *low_in2,
18815 rtx *high_dest, rtx *high_in1,
18816 rtx *high_in2)
18817 {
18818 *low_dest = gen_reg_rtx (DImode);
18819 *low_in1 = gen_lowpart (DImode, op1);
18820 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18821 subreg_lowpart_offset (DImode, TImode));
18822 *high_dest = gen_reg_rtx (DImode);
18823 *high_in1 = gen_highpart (DImode, op1);
18824 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18825 subreg_highpart_offset (DImode, TImode));
18826 }
18827
18828 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18829
18830 This function differs from 'aarch64_addti_scratch_regs' in that
18831 OP1 can be an immediate constant (zero). We must call
18832 subreg_highpart_offset with DImode and TImode arguments, otherwise
18833 VOIDmode will be used for the const_int which generates an internal
18834 error from subreg_size_highpart_offset which does not expect a size of zero.
18835
18836 OP1 represents the TImode source operand 1
18837 OP2 represents the TImode source operand 2
18838 LOW_DEST represents the low half (DImode) of TImode operand 0
18839 LOW_IN1 represents the low half (DImode) of TImode operand 1
18840 LOW_IN2 represents the low half (DImode) of TImode operand 2
18841 HIGH_DEST represents the high half (DImode) of TImode operand 0
18842 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18843 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18844
18845
18846 void
18847 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18848 rtx *low_in1, rtx *low_in2,
18849 rtx *high_dest, rtx *high_in1,
18850 rtx *high_in2)
18851 {
18852 *low_dest = gen_reg_rtx (DImode);
18853 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18854 subreg_lowpart_offset (DImode, TImode));
18855
18856 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18857 subreg_lowpart_offset (DImode, TImode));
18858 *high_dest = gen_reg_rtx (DImode);
18859
18860 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18861 subreg_highpart_offset (DImode, TImode));
18862 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18863 subreg_highpart_offset (DImode, TImode));
18864 }
18865
18866 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18867
18868 OP0 represents the TImode destination operand 0
18869 LOW_DEST represents the low half (DImode) of TImode operand 0
18870 LOW_IN1 represents the low half (DImode) of TImode operand 1
18871 LOW_IN2 represents the low half (DImode) of TImode operand 2
18872 HIGH_DEST represents the high half (DImode) of TImode operand 0
18873 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18874 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18875 UNSIGNED_P is true if the operation is being performed on unsigned
18876 values. */
18877 void
18878 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18879 rtx low_in2, rtx high_dest, rtx high_in1,
18880 rtx high_in2, bool unsigned_p)
18881 {
18882 if (low_in2 == const0_rtx)
18883 {
18884 low_dest = low_in1;
18885 high_in2 = force_reg (DImode, high_in2);
18886 if (unsigned_p)
18887 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18888 else
18889 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18890 }
18891 else
18892 {
18893 if (CONST_INT_P (low_in2))
18894 {
18895 high_in2 = force_reg (DImode, high_in2);
18896 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18897 GEN_INT (-INTVAL (low_in2))));
18898 }
18899 else
18900 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18901
18902 if (unsigned_p)
18903 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18904 else
18905 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18906 }
18907
18908 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18909 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18910
18911 }
18912
18913 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
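/* AddressSanitizer computes the shadow address of a byte roughly as
   (address >> 3) + this offset, so the value chosen below must keep the
   shadow region disjoint from the application's address space.  */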
18914
18915 static unsigned HOST_WIDE_INT
18916 aarch64_asan_shadow_offset (void)
18917 {
18918 if (TARGET_ILP32)
18919 return (HOST_WIDE_INT_1 << 29);
18920 else
18921 return (HOST_WIDE_INT_1 << 36);
18922 }
18923
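/* Implement TARGET_GEN_CCMP_FIRST.  Generate the first compare of a
   conditional-compare sequence, e.g. the initial "cmp" in a
   "cmp; ccmp; b.<cond>" chain (illustrative).  */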
18924 static rtx
18925 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18926 int code, tree treeop0, tree treeop1)
18927 {
18928 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18929 rtx op0, op1;
18930 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18931 insn_code icode;
18932 struct expand_operand ops[4];
18933
18934 start_sequence ();
18935 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18936
18937 op_mode = GET_MODE (op0);
18938 if (op_mode == VOIDmode)
18939 op_mode = GET_MODE (op1);
18940
18941 switch (op_mode)
18942 {
18943 case E_QImode:
18944 case E_HImode:
18945 case E_SImode:
18946 cmp_mode = SImode;
18947 icode = CODE_FOR_cmpsi;
18948 break;
18949
18950 case E_DImode:
18951 cmp_mode = DImode;
18952 icode = CODE_FOR_cmpdi;
18953 break;
18954
18955 case E_SFmode:
18956 cmp_mode = SFmode;
18957 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18958 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18959 break;
18960
18961 case E_DFmode:
18962 cmp_mode = DFmode;
18963 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18964 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18965 break;
18966
18967 default:
18968 end_sequence ();
18969 return NULL_RTX;
18970 }
18971
18972 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18973 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18974 if (!op0 || !op1)
18975 {
18976 end_sequence ();
18977 return NULL_RTX;
18978 }
18979 *prep_seq = get_insns ();
18980 end_sequence ();
18981
18982 create_fixed_operand (&ops[0], op0);
18983 create_fixed_operand (&ops[1], op1);
18984
18985 start_sequence ();
18986 if (!maybe_expand_insn (icode, 2, ops))
18987 {
18988 end_sequence ();
18989 return NULL_RTX;
18990 }
18991 *gen_seq = get_insns ();
18992 end_sequence ();
18993
18994 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18995 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18996 }
18997
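/* Implement TARGET_GEN_CCMP_NEXT.  Chain a further comparison onto PREV
   using a conditional compare (CCMP/FCCMP), reversing the previous
   condition when the combining operation is not AND.  */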
18998 static rtx
18999 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
19000 int cmp_code, tree treeop0, tree treeop1, int bit_code)
19001 {
19002 rtx op0, op1, target;
19003 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19004 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19005 insn_code icode;
19006 struct expand_operand ops[6];
19007 int aarch64_cond;
19008
19009 push_to_sequence (*prep_seq);
19010 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19011
19012 op_mode = GET_MODE (op0);
19013 if (op_mode == VOIDmode)
19014 op_mode = GET_MODE (op1);
19015
19016 switch (op_mode)
19017 {
19018 case E_QImode:
19019 case E_HImode:
19020 case E_SImode:
19021 cmp_mode = SImode;
19022 icode = CODE_FOR_ccmpsi;
19023 break;
19024
19025 case E_DImode:
19026 cmp_mode = DImode;
19027 icode = CODE_FOR_ccmpdi;
19028 break;
19029
19030 case E_SFmode:
19031 cmp_mode = SFmode;
19032 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19033 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
19034 break;
19035
19036 case E_DFmode:
19037 cmp_mode = DFmode;
19038 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19039 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
19040 break;
19041
19042 default:
19043 end_sequence ();
19044 return NULL_RTX;
19045 }
19046
19047 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
19048 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
19049 if (!op0 || !op1)
19050 {
19051 end_sequence ();
19052 return NULL_RTX;
19053 }
19054 *prep_seq = get_insns ();
19055 end_sequence ();
19056
19057 target = gen_rtx_REG (cc_mode, CC_REGNUM);
19058 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
19059
19060 if (bit_code != AND)
19061 {
19062 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
19063 GET_MODE (XEXP (prev, 0))),
19064 VOIDmode, XEXP (prev, 0), const0_rtx);
19065 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
19066 }
19067
19068 create_fixed_operand (&ops[0], XEXP (prev, 0));
19069 create_fixed_operand (&ops[1], target);
19070 create_fixed_operand (&ops[2], op0);
19071 create_fixed_operand (&ops[3], op1);
19072 create_fixed_operand (&ops[4], prev);
19073 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
19074
19075 push_to_sequence (*gen_seq);
19076 if (!maybe_expand_insn (icode, 6, ops))
19077 {
19078 end_sequence ();
19079 return NULL_RTX;
19080 }
19081
19082 *gen_seq = get_insns ();
19083 end_sequence ();
19084
19085 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
19086 }
19087
19088 #undef TARGET_GEN_CCMP_FIRST
19089 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19090
19091 #undef TARGET_GEN_CCMP_NEXT
19092 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19093
19094 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19095 instruction fusion of some sort. */
19096
19097 static bool
19098 aarch64_macro_fusion_p (void)
19099 {
19100 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
19101 }
19102
19103
19104 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19105 should be kept together during scheduling. */
19106
19107 static bool
19108 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
19109 {
19110 rtx set_dest;
19111 rtx prev_set = single_set (prev);
19112 rtx curr_set = single_set (curr);
19113 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
19114 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
19115
19116 if (!aarch64_macro_fusion_p ())
19117 return false;
19118
19119 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
19120 {
19121 /* We are trying to match:
19122 prev (mov) == (set (reg r0) (const_int imm16))
19123 curr (movk) == (set (zero_extract (reg r0)
19124 (const_int 16)
19125 (const_int 16))
19126 (const_int imm16_1)) */
19127
19128 set_dest = SET_DEST (curr_set);
19129
19130 if (GET_CODE (set_dest) == ZERO_EXTRACT
19131 && CONST_INT_P (SET_SRC (curr_set))
19132 && CONST_INT_P (SET_SRC (prev_set))
19133 && CONST_INT_P (XEXP (set_dest, 2))
19134 && INTVAL (XEXP (set_dest, 2)) == 16
19135 && REG_P (XEXP (set_dest, 0))
19136 && REG_P (SET_DEST (prev_set))
19137 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
19138 {
19139 return true;
19140 }
19141 }
19142
19143 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
19144 {
19145
19146 /* We're trying to match:
19147 prev (adrp) == (set (reg r1)
19148 (high (symbol_ref ("SYM"))))
19149 curr (add) == (set (reg r0)
19150 (lo_sum (reg r1)
19151 (symbol_ref ("SYM"))))
19152 Note that r0 need not necessarily be the same as r1, especially
19153 during pre-regalloc scheduling. */
19154
19155 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19156 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19157 {
19158 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
19159 && REG_P (XEXP (SET_SRC (curr_set), 0))
19160 && REGNO (XEXP (SET_SRC (curr_set), 0))
19161 == REGNO (SET_DEST (prev_set))
19162 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
19163 XEXP (SET_SRC (curr_set), 1)))
19164 return true;
19165 }
19166 }
19167
19168 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
19169 {
19170
19171 /* We're trying to match:
19172 prev (movk) == (set (zero_extract (reg r0)
19173 (const_int 16)
19174 (const_int 32))
19175 (const_int imm16_1))
19176 curr (movk) == (set (zero_extract (reg r0)
19177 (const_int 16)
19178 (const_int 48))
19179 (const_int imm16_2)) */
19180
19181 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
19182 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
19183 && REG_P (XEXP (SET_DEST (prev_set), 0))
19184 && REG_P (XEXP (SET_DEST (curr_set), 0))
19185 && REGNO (XEXP (SET_DEST (prev_set), 0))
19186 == REGNO (XEXP (SET_DEST (curr_set), 0))
19187 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
19188 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
19189 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
19190 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
19191 && CONST_INT_P (SET_SRC (prev_set))
19192 && CONST_INT_P (SET_SRC (curr_set)))
19193 return true;
19194
19195 }
19196 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
19197 {
19198 /* We're trying to match:
19199 prev (adrp) == (set (reg r0)
19200 (high (symbol_ref ("SYM"))))
19201 curr (ldr) == (set (reg r1)
19202 (mem (lo_sum (reg r0)
19203 (symbol_ref ("SYM")))))
19204 or
19205 curr (ldr) == (set (reg r1)
19206 (zero_extend (mem
19207 (lo_sum (reg r0)
19208 (symbol_ref ("SYM")))))) */
19209 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19210 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19211 {
19212 rtx curr_src = SET_SRC (curr_set);
19213
19214 if (GET_CODE (curr_src) == ZERO_EXTEND)
19215 curr_src = XEXP (curr_src, 0);
19216
19217 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
19218 && REG_P (XEXP (XEXP (curr_src, 0), 0))
19219 && REGNO (XEXP (XEXP (curr_src, 0), 0))
19220 == REGNO (SET_DEST (prev_set))
19221 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
19222 XEXP (SET_SRC (prev_set), 0)))
19223 return true;
19224 }
19225 }
19226
19227 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
19228 && any_condjump_p (curr))
19229 {
19230 unsigned int condreg1, condreg2;
19231 rtx cc_reg_1;
19232 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
19233 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
19234
19235 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
19236 && prev
19237 && modified_in_p (cc_reg_1, prev))
19238 {
19239 enum attr_type prev_type = get_attr_type (prev);
19240
19241 /* FIXME: this misses some instructions that are considered simple
19242 arithmetic on ThunderX; simple shifts are missed here as well. */
19243 if (prev_type == TYPE_ALUS_SREG
19244 || prev_type == TYPE_ALUS_IMM
19245 || prev_type == TYPE_LOGICS_REG
19246 || prev_type == TYPE_LOGICS_IMM)
19247 return true;
19248 }
19249 }
19250
19251 if (prev_set
19252 && curr_set
19253 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
19254 && any_condjump_p (curr))
19255 {
19256 /* We're trying to match:
19257 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
19258 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19259 (const_int 0))
19260 (label_ref ("SYM"))
19261 (pc)) */
19262 if (SET_DEST (curr_set) == (pc_rtx)
19263 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19264 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19265 && REG_P (SET_DEST (prev_set))
19266 && REGNO (SET_DEST (prev_set))
19267 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19268 {
19269 /* Fuse ALU operations followed by a conditional branch instruction. */
19270 switch (get_attr_type (prev))
19271 {
19272 case TYPE_ALU_IMM:
19273 case TYPE_ALU_SREG:
19274 case TYPE_ADC_REG:
19275 case TYPE_ADC_IMM:
19276 case TYPE_ADCS_REG:
19277 case TYPE_ADCS_IMM:
19278 case TYPE_LOGIC_REG:
19279 case TYPE_LOGIC_IMM:
19280 case TYPE_CSEL:
19281 case TYPE_ADR:
19282 case TYPE_MOV_IMM:
19283 case TYPE_SHIFT_REG:
19284 case TYPE_SHIFT_IMM:
19285 case TYPE_BFM:
19286 case TYPE_RBIT:
19287 case TYPE_REV:
19288 case TYPE_EXTEND:
19289 return true;
19290
19291 default:;
19292 }
19293 }
19294 }
19295
19296 return false;
19297 }
19298
19299 /* Return true iff the instruction fusion described by OP is enabled. */
19300
19301 bool
19302 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19303 {
19304 return (aarch64_tune_params.fusible_ops & op) != 0;
19305 }
19306
19307 /* If MEM's address is in the form [base+offset], extract the two parts
19308 into BASE and OFFSET and return true; otherwise clear BASE and OFFSET
19309 and return false. */
19310
19311 bool
19312 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19313 {
19314 rtx addr;
19315
19316 gcc_assert (MEM_P (mem));
19317
19318 addr = XEXP (mem, 0);
19319
19320 if (REG_P (addr))
19321 {
19322 *base = addr;
19323 *offset = const0_rtx;
19324 return true;
19325 }
19326
19327 if (GET_CODE (addr) == PLUS
19328 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19329 {
19330 *base = XEXP (addr, 0);
19331 *offset = XEXP (addr, 1);
19332 return true;
19333 }
19334
19335 *base = NULL_RTX;
19336 *offset = NULL_RTX;
19337
19338 return false;
19339 }
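/* For example, a MEM whose address is (plus (reg x1) (const_int 16))
   yields BASE == x1 and OFFSET == 16, while a MEM whose address is the
   bare (reg x1) yields BASE == x1 and OFFSET == 0.  Any other address
   form clears both outputs and returns false.  */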
19340
19341 /* Types for scheduling fusion. */
19342 enum sched_fusion_type
19343 {
19344 SCHED_FUSION_NONE = 0,
19345 SCHED_FUSION_LD_SIGN_EXTEND,
19346 SCHED_FUSION_LD_ZERO_EXTEND,
19347 SCHED_FUSION_LD,
19348 SCHED_FUSION_ST,
19349 SCHED_FUSION_NUM
19350 };
19351
19352 /* If INSN is a load or store whose address is in the form [base+offset],
19353 extract the two parts into BASE and OFFSET. Return the scheduling
19354 fusion type of this INSN. */
19355
19356 static enum sched_fusion_type
19357 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19358 {
19359 rtx x, dest, src;
19360 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19361
19362 gcc_assert (INSN_P (insn));
19363 x = PATTERN (insn);
19364 if (GET_CODE (x) != SET)
19365 return SCHED_FUSION_NONE;
19366
19367 src = SET_SRC (x);
19368 dest = SET_DEST (x);
19369
19370 machine_mode dest_mode = GET_MODE (dest);
19371
19372 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19373 return SCHED_FUSION_NONE;
19374
19375 if (GET_CODE (src) == SIGN_EXTEND)
19376 {
19377 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19378 src = XEXP (src, 0);
19379 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19380 return SCHED_FUSION_NONE;
19381 }
19382 else if (GET_CODE (src) == ZERO_EXTEND)
19383 {
19384 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19385 src = XEXP (src, 0);
19386 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19387 return SCHED_FUSION_NONE;
19388 }
19389
19390 if (GET_CODE (src) == MEM && REG_P (dest))
19391 extract_base_offset_in_addr (src, base, offset);
19392 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19393 {
19394 fusion = SCHED_FUSION_ST;
19395 extract_base_offset_in_addr (dest, base, offset);
19396 }
19397 else
19398 return SCHED_FUSION_NONE;
19399
19400 if (*base == NULL_RTX || *offset == NULL_RTX)
19401 fusion = SCHED_FUSION_NONE;
19402
19403 return fusion;
19404 }
19405
19406 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19407
19408 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19409 and PRI are only calculated for these instructions. For other instructions,
19410 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19411 types of instruction fusion can be added by returning different priorities.
19412
19413 It's important that irrelevant instructions get the largest FUSION_PRI. */
19414
19415 static void
19416 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19417 int *fusion_pri, int *pri)
19418 {
19419 int tmp, off_val;
19420 rtx base, offset;
19421 enum sched_fusion_type fusion;
19422
19423 gcc_assert (INSN_P (insn));
19424
19425 tmp = max_pri - 1;
19426 fusion = fusion_load_store (insn, &base, &offset);
19427 if (fusion == SCHED_FUSION_NONE)
19428 {
19429 *pri = tmp;
19430 *fusion_pri = tmp;
19431 return;
19432 }
19433
19434 /* Set FUSION_PRI according to fusion type and base register. */
19435 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19436
19437 /* Calculate PRI. */
19438 tmp /= 2;
19439
19440 /* INSN with smaller offset goes first. */
19441 off_val = (int)(INTVAL (offset));
19442 if (off_val >= 0)
19443 tmp -= (off_val & 0xfffff);
19444 else
19445 tmp += ((- off_val) & 0xfffff);
19446
19447 *pri = tmp;
19448 return;
19449 }
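/* A rough example of the priority scheme above: two SImode loads from
   [x1, #8] and [x1, #12] have the same fusion type and base register, so
   they receive the same FUSION_PRI and are treated as fusion candidates.
   Their PRI values differ only through the offsets (8 vs 12), so the load
   with the smaller offset gets the larger PRI and is scheduled first.  */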
19450
19451 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19452 Adjust priority of sha1h instructions so they are scheduled before
19453 other SHA1 instructions. */
19454
19455 static int
19456 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19457 {
19458 rtx x = PATTERN (insn);
19459
19460 if (GET_CODE (x) == SET)
19461 {
19462 x = SET_SRC (x);
19463
19464 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19465 return priority + 10;
19466 }
19467
19468 return priority;
19469 }
19470
19471 /* Given OPERANDS of consecutive load/store, check if we can merge
19472 them into ldp/stp. LOAD is true if they are load instructions.
19473 MODE is the mode of memory operands. */
19474
19475 bool
19476 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19477 machine_mode mode)
19478 {
19479 HOST_WIDE_INT offval_1, offval_2, msize;
19480 enum reg_class rclass_1, rclass_2;
19481 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19482
19483 if (load)
19484 {
19485 mem_1 = operands[1];
19486 mem_2 = operands[3];
19487 reg_1 = operands[0];
19488 reg_2 = operands[2];
19489 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19490 if (REGNO (reg_1) == REGNO (reg_2))
19491 return false;
19492 }
19493 else
19494 {
19495 mem_1 = operands[0];
19496 mem_2 = operands[2];
19497 reg_1 = operands[1];
19498 reg_2 = operands[3];
19499 }
19500
19501 /* The mems cannot be volatile. */
19502 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19503 return false;
19504
19505 /* If we have SImode and slow unaligned ldp,
19506 check that the alignment is at least 8 bytes. */
19507 if (mode == SImode
19508 && (aarch64_tune_params.extra_tuning_flags
19509 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19510 && !optimize_size
19511 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19512 return false;
19513
19514 /* Check if the addresses are in the form of [base+offset]. */
19515 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19516 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19517 return false;
19518 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19519 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19520 return false;
19521
19522 /* Check if the bases are the same. */
19523 if (!rtx_equal_p (base_1, base_2))
19524 return false;
19525
19526 /* The operands must be of the same size. */
19527 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19528 GET_MODE_SIZE (GET_MODE (mem_2))));
19529
19530 offval_1 = INTVAL (offset_1);
19531 offval_2 = INTVAL (offset_2);
19532 /* We should only be trying this for fixed-sized modes. There is no
19533 SVE LDP/STP instruction. */
19534 msize = GET_MODE_SIZE (mode).to_constant ();
19535 /* Check if the offsets are consecutive. */
19536 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19537 return false;
19538
19539 /* Check if the addresses are clobbered by load. */
19540 if (load)
19541 {
19542 if (reg_mentioned_p (reg_1, mem_1))
19543 return false;
19544
19545 /* In increasing order, the last load can clobber the address. */
19546 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19547 return false;
19548 }
19549
19550 /* One of the memory accesses must be a mempair operand.
19551 If it is not the first one, they need to be swapped by the
19552 peephole. */
19553 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19554 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19555 return false;
19556
19557 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19558 rclass_1 = FP_REGS;
19559 else
19560 rclass_1 = GENERAL_REGS;
19561
19562 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19563 rclass_2 = FP_REGS;
19564 else
19565 rclass_2 = GENERAL_REGS;
19566
19567 /* Check if the registers are of the same class. */
19568 if (rclass_1 != rclass_2)
19569 return false;
19570
19571 return true;
19572 }
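/* As an illustrative example (assuming SImode and a common base x2), the
   following pair of loads passes the checks above and can be emitted as a
   single LDP by the peephole patterns:

     ldr     w0, [x2]       ->   ldp     w0, w1, [x2]
     ldr     w1, [x2, #4]

   The analogous transformation applies to stores via STP.  */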
19573
19574 /* Given OPERANDS of consecutive load/store that can be merged,
19575 swap them if they are not in ascending order. */
19576 void
19577 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19578 {
19579 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19580 HOST_WIDE_INT offval_1, offval_2;
19581
19582 if (load)
19583 {
19584 mem_1 = operands[1];
19585 mem_2 = operands[3];
19586 }
19587 else
19588 {
19589 mem_1 = operands[0];
19590 mem_2 = operands[2];
19591 }
19592
19593 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19594 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19595
19596 offval_1 = INTVAL (offset_1);
19597 offval_2 = INTVAL (offset_2);
19598
19599 if (offval_1 > offval_2)
19600 {
19601 /* Irrespective of whether this is a load or a store,
19602 we do the same swap. */
19603 std::swap (operands[0], operands[2]);
19604 std::swap (operands[1], operands[3]);
19605 }
19606 }
19607
19608 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of
19609 comparing the values they point to. */
19610 int
19611 aarch64_host_wide_int_compare (const void *x, const void *y)
19612 {
19613 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19614 * ((const HOST_WIDE_INT *) y));
19615 }
19616
19617 /* Taking X and Y each to be a pair of RTX, one element pointing to a MEM
19618 rtx and the other to a REG rtx, compare the [base+offset] address
19619 offsets of the two MEMs.
19620
19621 Return:
19622
19623 1 iff offset (X) > offset (Y)
19624 0 iff offset (X) == offset (Y)
19625 -1 iff offset (X) < offset (Y) */
19626 int
19627 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19628 {
19629 const rtx * operands_1 = (const rtx *) x;
19630 const rtx * operands_2 = (const rtx *) y;
19631 rtx mem_1, mem_2, base, offset_1, offset_2;
19632
19633 if (MEM_P (operands_1[0]))
19634 mem_1 = operands_1[0];
19635 else
19636 mem_1 = operands_1[1];
19637
19638 if (MEM_P (operands_2[0]))
19639 mem_2 = operands_2[0];
19640 else
19641 mem_2 = operands_2[1];
19642
19643 /* Extract the offsets. */
19644 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19645 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19646
19647 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19648
19649 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19650 }
19651
19652 /* Given OPERANDS of consecutive load/store, check if we can merge
19653 them into ldp/stp by adjusting the offset. LOAD is true if they
19654 are load instructions. MODE is the mode of memory operands.
19655
19656 Given below consecutive stores:
19657
19658 str w1, [xb, 0x100]
19659 str w1, [xb, 0x104]
19660 str w1, [xb, 0x108]
19661 str w1, [xb, 0x10c]
19662
19663 Though the offsets are out of the range supported by stp, we can
19664 still pair them after adjusting the offset, like:
19665
19666 add scratch, xb, 0x100
19667 stp w1, w1, [scratch]
19668 stp w1, w1, [scratch, 0x8]
19669
19670 The peephole patterns detecting this opportunity should guarantee
19671 the scratch register is available. */
19672
19673 bool
19674 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19675 scalar_mode mode)
19676 {
19677 const int num_insns = 4;
19678 enum reg_class rclass;
19679 HOST_WIDE_INT offvals[num_insns], msize;
19680 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19681
19682 if (load)
19683 {
19684 for (int i = 0; i < num_insns; i++)
19685 {
19686 reg[i] = operands[2 * i];
19687 mem[i] = operands[2 * i + 1];
19688
19689 gcc_assert (REG_P (reg[i]));
19690 }
19691
19692 /* Do not attempt to merge the loads if the loads clobber each other. */
19693 for (int i = 0; i < 8; i += 2)
19694 for (int j = i + 2; j < 8; j += 2)
19695 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19696 return false;
19697 }
19698 else
19699 for (int i = 0; i < num_insns; i++)
19700 {
19701 mem[i] = operands[2 * i];
19702 reg[i] = operands[2 * i + 1];
19703 }
19704
19705 /* Skip if memory operand is by itself valid for ldp/stp. */
19706 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19707 return false;
19708
19709 for (int i = 0; i < num_insns; i++)
19710 {
19711 /* The mems cannot be volatile. */
19712 if (MEM_VOLATILE_P (mem[i]))
19713 return false;
19714
19715 /* Check if the addresses are in the form of [base+offset]. */
19716 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19717 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19718 return false;
19719 }
19720
19721 /* Check if the registers are of the same class. */
19722 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19723 ? FP_REGS : GENERAL_REGS;
19724
19725 for (int i = 1; i < num_insns; i++)
19726 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19727 {
19728 if (rclass != FP_REGS)
19729 return false;
19730 }
19731 else
19732 {
19733 if (rclass != GENERAL_REGS)
19734 return false;
19735 }
19736
19737 /* Only the last register in the order in which they occur
19738 may be clobbered by the load. */
19739 if (rclass == GENERAL_REGS && load)
19740 for (int i = 0; i < num_insns - 1; i++)
19741 if (reg_mentioned_p (reg[i], mem[i]))
19742 return false;
19743
19744 /* Check if the bases are the same. */
19745 for (int i = 0; i < num_insns - 1; i++)
19746 if (!rtx_equal_p (base[i], base[i + 1]))
19747 return false;
19748
19749 for (int i = 0; i < num_insns; i++)
19750 offvals[i] = INTVAL (offset[i]);
19751
19752 msize = GET_MODE_SIZE (mode);
19753
19754 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19755 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19756 aarch64_host_wide_int_compare);
19757
19758 if (!(offvals[1] == offvals[0] + msize
19759 && offvals[3] == offvals[2] + msize))
19760 return false;
19761
19762 /* Check that offsets are within range of each other. The ldp/stp
19763 instructions have 7 bit immediate offsets, so use 0x80. */
19764 if (offvals[2] - offvals[0] >= msize * 0x80)
19765 return false;
19766
19767 /* The offsets must be aligned with respect to each other. */
19768 if (offvals[0] % msize != offvals[2] % msize)
19769 return false;
19770
19771 /* If we have SImode and slow unaligned ldp,
19772 check that the alignment is at least 8 bytes. */
19773 if (mode == SImode
19774 && (aarch64_tune_params.extra_tuning_flags
19775 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19776 && !optimize_size
19777 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19778 return false;
19779
19780 return true;
19781 }
19782
19783 /* Given OPERANDS of consecutive load/store, this function pairs them
19784 into LDP/STP after adjusting the offset. It depends on the fact
19785 that the operands can be sorted so the offsets are correct for STP.
19786 MODE is the mode of memory operands. CODE is the rtl operator
19787 which should be applied to all memory operands; it is SIGN_EXTEND,
19788 ZERO_EXTEND or UNKNOWN. */
19789
19790 bool
19791 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19792 scalar_mode mode, RTX_CODE code)
19793 {
19794 rtx base, offset_1, offset_3, t1, t2;
19795 rtx mem_1, mem_2, mem_3, mem_4;
19796 rtx temp_operands[8];
19797 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19798 stp_off_upper_limit, stp_off_lower_limit, msize;
19799
19800 /* We make changes on a copy as we may still bail out. */
19801 for (int i = 0; i < 8; i ++)
19802 temp_operands[i] = operands[i];
19803
19804 /* Sort the operands. */
19805 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19806
19807 /* Copy the memory operands so that if we have to bail for some
19808 reason the original addresses are unchanged. */
19809 if (load)
19810 {
19811 mem_1 = copy_rtx (temp_operands[1]);
19812 mem_2 = copy_rtx (temp_operands[3]);
19813 mem_3 = copy_rtx (temp_operands[5]);
19814 mem_4 = copy_rtx (temp_operands[7]);
19815 }
19816 else
19817 {
19818 mem_1 = copy_rtx (temp_operands[0]);
19819 mem_2 = copy_rtx (temp_operands[2]);
19820 mem_3 = copy_rtx (temp_operands[4]);
19821 mem_4 = copy_rtx (temp_operands[6]);
19822 gcc_assert (code == UNKNOWN);
19823 }
19824
19825 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19826 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19827 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19828 && offset_3 != NULL_RTX);
19829
19830 /* Adjust offset so it can fit in LDP/STP instruction. */
19831 msize = GET_MODE_SIZE (mode);
19832 stp_off_upper_limit = msize * (0x40 - 1);
19833 stp_off_lower_limit = - msize * 0x40;
19834
19835 off_val_1 = INTVAL (offset_1);
19836 off_val_3 = INTVAL (offset_3);
19837
19838 /* The base offset is optimally half way between the two STP/LDP offsets. */
19839 if (msize <= 4)
19840 base_off = (off_val_1 + off_val_3) / 2;
19841 else
19842 /* However, due to issues with negative LDP/STP offset generation for
19843 larger modes (DF, DI and vector modes), we must not use negative
19844 addresses smaller than 9 signed unadjusted bits can store. This
19845 provides the most range in this case. */
19846 base_off = off_val_1;
19847
19848 /* Adjust the base so that it is aligned with the addresses but still
19849 optimal. */
19850 if (base_off % msize != off_val_1 % msize)
19851 /* Fix the offset, bearing in mind we want to make it bigger not
19852 smaller. */
19853 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19854 else if (msize <= 4)
19855 /* The negative range of LDP/STP is one larger than the positive range. */
19856 base_off += msize;
19857
19858 /* Check if base offset is too big or too small. We can attempt to resolve
19859 this issue by setting it to the maximum value and seeing if the offsets
19860 still fit. */
19861 if (base_off >= 0x1000)
19862 {
19863 base_off = 0x1000 - 1;
19864 /* We must still make sure that the base offset is aligned with respect
19865 to the address. But it may not be made any bigger. */
19866 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19867 }
19868
19869 /* Likewise for the case where the base is too small. */
19870 if (base_off <= -0x1000)
19871 {
19872 base_off = -0x1000 + 1;
19873 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19874 }
19875
19876 /* Offset of the first STP/LDP. */
19877 new_off_1 = off_val_1 - base_off;
19878
19879 /* Offset of the second STP/LDP. */
19880 new_off_3 = off_val_3 - base_off;
19881
19882 /* The offsets must be within the range of the LDP/STP instructions. */
19883 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19884 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19885 return false;
19886
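/* Worked example of the arithmetic above (schematic, SImode so MSIZE == 4):
   for the four stores at offsets 0x100, 0x104, 0x108 and 0x10c we get
   OFF_VAL_1 == 0x100 and OFF_VAL_3 == 0x108, so BASE_OFF starts as 0x104,
   is bumped to 0x108 by the msize adjustment, and the resulting pair
   offsets are NEW_OFF_1 == -8 and NEW_OFF_3 == 0, both comfortably inside
   the [-0x100, 0xfc] range allowed for 4-byte LDP/STP.  */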
19887 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19888 new_off_1), true);
19889 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19890 new_off_1 + msize), true);
19891 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19892 new_off_3), true);
19893 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19894 new_off_3 + msize), true);
19895
19896 if (!aarch64_mem_pair_operand (mem_1, mode)
19897 || !aarch64_mem_pair_operand (mem_3, mode))
19898 return false;
19899
19900 if (code == ZERO_EXTEND)
19901 {
19902 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19903 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19904 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19905 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19906 }
19907 else if (code == SIGN_EXTEND)
19908 {
19909 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19910 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19911 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19912 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19913 }
19914
19915 if (load)
19916 {
19917 operands[0] = temp_operands[0];
19918 operands[1] = mem_1;
19919 operands[2] = temp_operands[2];
19920 operands[3] = mem_2;
19921 operands[4] = temp_operands[4];
19922 operands[5] = mem_3;
19923 operands[6] = temp_operands[6];
19924 operands[7] = mem_4;
19925 }
19926 else
19927 {
19928 operands[0] = mem_1;
19929 operands[1] = temp_operands[1];
19930 operands[2] = mem_2;
19931 operands[3] = temp_operands[3];
19932 operands[4] = mem_3;
19933 operands[5] = temp_operands[5];
19934 operands[6] = mem_4;
19935 operands[7] = temp_operands[7];
19936 }
19937
19938 /* Emit adjusting instruction. */
19939 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19940 /* Emit ldp/stp instructions. */
19941 t1 = gen_rtx_SET (operands[0], operands[1]);
19942 t2 = gen_rtx_SET (operands[2], operands[3]);
19943 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19944 t1 = gen_rtx_SET (operands[4], operands[5]);
19945 t2 = gen_rtx_SET (operands[6], operands[7]);
19946 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19947 return true;
19948 }
19949
19950 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19951 it isn't worth branching around empty masked ops (including masked
19952 stores). */
19953
19954 static bool
19955 aarch64_empty_mask_is_expensive (unsigned)
19956 {
19957 return false;
19958 }
19959
19960 /* Return true if a pseudo register should be created and used to hold
19961 the GOT address for PIC code. */
19962
19963 bool
19964 aarch64_use_pseudo_pic_reg (void)
19965 {
19966 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19967 }
19968
19969 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19970
19971 static int
19972 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19973 {
19974 switch (XINT (x, 1))
19975 {
19976 case UNSPEC_GOTSMALLPIC:
19977 case UNSPEC_GOTSMALLPIC28K:
19978 case UNSPEC_GOTTINYPIC:
19979 return 0;
19980 default:
19981 break;
19982 }
19983
19984 return default_unspec_may_trap_p (x, flags);
19985 }
19986
19987
19988 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19989 return the log2 of that value. Otherwise return -1. */
19990
19991 int
19992 aarch64_fpconst_pow_of_2 (rtx x)
19993 {
19994 const REAL_VALUE_TYPE *r;
19995
19996 if (!CONST_DOUBLE_P (x))
19997 return -1;
19998
19999 r = CONST_DOUBLE_REAL_VALUE (x);
20000
20001 if (REAL_VALUE_NEGATIVE (*r)
20002 || REAL_VALUE_ISNAN (*r)
20003 || REAL_VALUE_ISINF (*r)
20004 || !real_isinteger (r, DFmode))
20005 return -1;
20006
20007 return exact_log2 (real_to_integer (r));
20008 }
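/* For example, CONST_DOUBLEs of 1.0, 2.0 and 8.0 return 0, 1 and 3
   respectively, while -4.0, 0.5 and 3.0 all return -1 (negative, not an
   integer, and not a power of 2 respectively).  */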
20009
20010 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
20011 power of 2 (i.e. 1/2^n), return the exponent n. Otherwise (including
20012 when n is outside the range [1, 32]) return -1. */
20013
20014 int
20015 aarch64_fpconst_pow2_recip (rtx x)
20016 {
20017 REAL_VALUE_TYPE r0;
20018
20019 if (!CONST_DOUBLE_P (x))
20020 return -1;
20021
20022 r0 = *CONST_DOUBLE_REAL_VALUE (x);
20023 if (exact_real_inverse (DFmode, &r0)
20024 && !REAL_VALUE_NEGATIVE (r0))
20025 {
20026 int ret = exact_log2 (real_to_integer (&r0));
20027 if (ret >= 1 && ret <= 32)
20028 return ret;
20029 }
20030 return -1;
20031 }
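/* For example, CONST_DOUBLEs of 0.25 and 0.125 return 2 and 3
   respectively, while 1.0 and 2.0 both return -1: the first because its
   exponent 0 is outside the accepted [1, 32] range, the second because
   its reciprocal 0.5 is not a positive power of 2.  */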
20032
20033 /* If X is a vector of equal CONST_DOUBLE values and that value is
20034 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20035
20036 int
20037 aarch64_vec_fpconst_pow_of_2 (rtx x)
20038 {
20039 int nelts;
20040 if (GET_CODE (x) != CONST_VECTOR
20041 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
20042 return -1;
20043
20044 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
20045 return -1;
20046
20047 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
20048 if (firstval <= 0)
20049 return -1;
20050
20051 for (int i = 1; i < nelts; i++)
20052 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
20053 return -1;
20054
20055 return firstval;
20056 }
20057
20058 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20059 to float.
20060
20061 __fp16 always promotes through this hook.
20062 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20063 through the generic excess precision logic rather than here. */
20064
20065 static tree
20066 aarch64_promoted_type (const_tree t)
20067 {
20068 if (SCALAR_FLOAT_TYPE_P (t)
20069 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
20070 return float_type_node;
20071
20072 return NULL_TREE;
20073 }
20074
20075 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20076
20077 static bool
20078 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
20079 optimization_type opt_type)
20080 {
20081 switch (op)
20082 {
20083 case rsqrt_optab:
20084 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
20085
20086 default:
20087 return true;
20088 }
20089 }
20090
20091 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20092
20093 static unsigned int
20094 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
20095 int *offset)
20096 {
20097 /* Polynomial invariant 1 == (VG / 2) - 1. */
20098 gcc_assert (i == 1);
20099 *factor = 2;
20100 *offset = 1;
20101 return AARCH64_DWARF_VG;
20102 }
20103
20104 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
20105 if MODE is HFmode, and punt to the generic implementation otherwise. */
20106
20107 static bool
20108 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
20109 {
20110 return (mode == HFmode
20111 ? true
20112 : default_libgcc_floating_mode_supported_p (mode));
20113 }
20114
20115 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20116 if MODE is HFmode, and punt to the generic implementation otherwise. */
20117
20118 static bool
20119 aarch64_scalar_mode_supported_p (scalar_mode mode)
20120 {
20121 return (mode == HFmode
20122 ? true
20123 : default_scalar_mode_supported_p (mode));
20124 }
20125
20126 /* Set the value of FLT_EVAL_METHOD.
20127 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20128
20129 0: evaluate all operations and constants, whose semantic type has at
20130 most the range and precision of type float, to the range and
20131 precision of float; evaluate all other operations and constants to
20132 the range and precision of the semantic type;
20133
20134 N, where _FloatN is a supported interchange floating type
20135 evaluate all operations and constants, whose semantic type has at
20136 most the range and precision of _FloatN type, to the range and
20137 precision of the _FloatN type; evaluate all other operations and
20138 constants to the range and precision of the semantic type;
20139
20140 If we have the ARMv8.2-A extensions then we support _Float16 in native
20141 precision, so we should set this to 16. Otherwise, we support the type,
20142 but want to evaluate expressions in float precision, so set this to
20143 0. */
20144
20145 static enum flt_eval_method
20146 aarch64_excess_precision (enum excess_precision_type type)
20147 {
20148 switch (type)
20149 {
20150 case EXCESS_PRECISION_TYPE_FAST:
20151 case EXCESS_PRECISION_TYPE_STANDARD:
20152 /* We can calculate either in 16-bit range and precision or
20153 32-bit range and precision. Make that decision based on whether
20154 we have native support for the ARMv8.2-A 16-bit floating-point
20155 instructions or not. */
20156 return (TARGET_FP_F16INST
20157 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20158 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
20159 case EXCESS_PRECISION_TYPE_IMPLICIT:
20160 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
20161 default:
20162 gcc_unreachable ();
20163 }
20164 return FLT_EVAL_METHOD_UNPREDICTABLE;
20165 }
20166
20167 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20168 scheduled for speculative execution. Reject the long-running division
20169 and square-root instructions. */
20170
20171 static bool
20172 aarch64_sched_can_speculate_insn (rtx_insn *insn)
20173 {
20174 switch (get_attr_type (insn))
20175 {
20176 case TYPE_SDIV:
20177 case TYPE_UDIV:
20178 case TYPE_FDIVS:
20179 case TYPE_FDIVD:
20180 case TYPE_FSQRTS:
20181 case TYPE_FSQRTD:
20182 case TYPE_NEON_FP_SQRT_S:
20183 case TYPE_NEON_FP_SQRT_D:
20184 case TYPE_NEON_FP_SQRT_S_Q:
20185 case TYPE_NEON_FP_SQRT_D_Q:
20186 case TYPE_NEON_FP_DIV_S:
20187 case TYPE_NEON_FP_DIV_D:
20188 case TYPE_NEON_FP_DIV_S_Q:
20189 case TYPE_NEON_FP_DIV_D_Q:
20190 return false;
20191 default:
20192 return true;
20193 }
20194 }
20195
20196 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20197
20198 static int
20199 aarch64_compute_pressure_classes (reg_class *classes)
20200 {
20201 int i = 0;
20202 classes[i++] = GENERAL_REGS;
20203 classes[i++] = FP_REGS;
20204 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20205 registers need to go in PR_LO_REGS at some point during their
20206 lifetime. Splitting it into two halves has the effect of making
20207 all predicates count against PR_LO_REGS, so that we try whenever
20208 possible to restrict the number of live predicates to 8. This
20209 greatly reduces the amount of spilling in certain loops. */
20210 classes[i++] = PR_LO_REGS;
20211 classes[i++] = PR_HI_REGS;
20212 return i;
20213 }
20214
20215 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20216
20217 static bool
20218 aarch64_can_change_mode_class (machine_mode from,
20219 machine_mode to, reg_class_t)
20220 {
20221 if (BYTES_BIG_ENDIAN)
20222 {
20223 bool from_sve_p = aarch64_sve_data_mode_p (from);
20224 bool to_sve_p = aarch64_sve_data_mode_p (to);
20225
20226 /* Don't allow changes between SVE data modes and non-SVE modes.
20227 See the comment at the head of aarch64-sve.md for details. */
20228 if (from_sve_p != to_sve_p)
20229 return false;
20230
20231 /* Don't allow changes in element size: lane 0 of the new vector
20232 would not then be lane 0 of the old vector. See the comment
20233 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20234 description.
20235
20236 In the worst case, this forces a register to be spilled in
20237 one mode and reloaded in the other, which handles the
20238 endianness correctly. */
20239 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
20240 return false;
20241 }
20242 return true;
20243 }
20244
20245 /* Implement TARGET_EARLY_REMAT_MODES. */
20246
20247 static void
20248 aarch64_select_early_remat_modes (sbitmap modes)
20249 {
20250 /* SVE values are not normally live across a call, so it should be
20251 worth doing early rematerialization even in VL-specific mode. */
20252 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
20253 if (aarch64_sve_mode_p ((machine_mode) i))
20254 bitmap_set_bit (modes, i);
20255 }
20256
20257 /* Override the default target speculation_safe_value. */
20258 static rtx
20259 aarch64_speculation_safe_value (machine_mode mode,
20260 rtx result, rtx val, rtx failval)
20261 {
20262 /* Maybe we should warn if falling back to hard barriers. They are
20263 likely to be noticeably more expensive than the alternative below. */
20264 if (!aarch64_track_speculation)
20265 return default_speculation_safe_value (mode, result, val, failval);
20266
20267 if (!REG_P (val))
20268 val = copy_to_mode_reg (mode, val);
20269
20270 if (!aarch64_reg_or_zero (failval, mode))
20271 failval = copy_to_mode_reg (mode, failval);
20272
20273 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20274 return result;
20275 }
20276
20277 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20278 Look into the tuning structure for an estimate.
20279 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20280 Advanced SIMD 128 bits. */
20281
20282 static HOST_WIDE_INT
20283 aarch64_estimated_poly_value (poly_int64 val)
20284 {
20285 enum aarch64_sve_vector_bits_enum width_source
20286 = aarch64_tune_params.sve_width;
20287
20288 /* If we still don't have an estimate, use the default. */
20289 if (width_source == SVE_SCALABLE)
20290 return default_estimated_poly_value (val);
20291
20292 HOST_WIDE_INT over_128 = width_source - 128;
20293 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20294 }
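/* A worked example, assuming the tuning structure sets sve_width to 256:
   over_128 is then 128, so a poly_int64 such as (16, 16) -- 16 bytes plus
   16 bytes per extra 128-bit chunk -- is estimated as 16 + 16 * 128 / 128
   == 32 bytes, i.e. one full 256-bit SVE vector.  */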
20295
20296
20297 /* Return true for types that could be supported as SIMD return or
20298 argument types. */
20299
20300 static bool
20301 supported_simd_type (tree t)
20302 {
20303 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20304 {
20305 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20306 return s == 1 || s == 2 || s == 4 || s == 8;
20307 }
20308 return false;
20309 }
20310
20311 /* Return true for types that currently are supported as SIMD return
20312 or argument types. */
20313
20314 static bool
20315 currently_supported_simd_type (tree t, tree b)
20316 {
20317 if (COMPLEX_FLOAT_TYPE_P (t))
20318 return false;
20319
20320 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20321 return false;
20322
20323 return supported_simd_type (t);
20324 }
20325
20326 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20327
20328 static int
20329 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20330 struct cgraph_simd_clone *clonei,
20331 tree base_type, int num)
20332 {
20333 tree t, ret_type, arg_type;
20334 unsigned int elt_bits, vec_bits, count;
20335
20336 if (!TARGET_SIMD)
20337 return 0;
20338
20339 if (clonei->simdlen
20340 && (clonei->simdlen < 2
20341 || clonei->simdlen > 1024
20342 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20343 {
20344 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20345 "unsupported simdlen %d", clonei->simdlen);
20346 return 0;
20347 }
20348
20349 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20350 if (TREE_CODE (ret_type) != VOID_TYPE
20351 && !currently_supported_simd_type (ret_type, base_type))
20352 {
20353 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20354 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20355 "GCC does not currently support mixed size types "
20356 "for %<simd%> functions");
20357 else if (supported_simd_type (ret_type))
20358 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20359 "GCC does not currently support return type %qT "
20360 "for %<simd%> functions", ret_type);
20361 else
20362 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20363 "unsupported return type %qT for %<simd%> functions",
20364 ret_type);
20365 return 0;
20366 }
20367
20368 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20369 {
20370 arg_type = TREE_TYPE (t);
20371
20372 if (!currently_supported_simd_type (arg_type, base_type))
20373 {
20374 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20375 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20376 "GCC does not currently support mixed size types "
20377 "for %<simd%> functions");
20378 else
20379 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20380 "GCC does not currently support argument type %qT "
20381 "for %<simd%> functions", arg_type);
20382 return 0;
20383 }
20384 }
20385
20386 clonei->vecsize_mangle = 'n';
20387 clonei->mask_mode = VOIDmode;
20388 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20389 if (clonei->simdlen == 0)
20390 {
20391 count = 2;
20392 vec_bits = (num == 0 ? 64 : 128);
20393 clonei->simdlen = vec_bits / elt_bits;
20394 }
20395 else
20396 {
20397 count = 1;
20398 vec_bits = clonei->simdlen * elt_bits;
20399 if (vec_bits != 64 && vec_bits != 128)
20400 {
20401 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20402 "GCC does not currently support simdlen %d for type %qT",
20403 clonei->simdlen, base_type);
20404 return 0;
20405 }
20406 }
20407 clonei->vecsize_int = vec_bits;
20408 clonei->vecsize_float = vec_bits;
20409 return count;
20410 }
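/* For example, a "declare simd" function whose base type is float
   (elt_bits == 32) and which does not specify a simdlen gets two clones:
   one using 64-bit vectors with simdlen 2 (for num == 0) and one using
   128-bit vectors with simdlen 4 (for num == 1).  */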
20411
20412 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20413
20414 static void
20415 aarch64_simd_clone_adjust (struct cgraph_node *node)
20416 {
20417 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20418 use the correct ABI. */
20419
20420 tree t = TREE_TYPE (node->decl);
20421 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20422 TYPE_ATTRIBUTES (t));
20423 }
20424
20425 /* Implement TARGET_SIMD_CLONE_USABLE. */
20426
20427 static int
20428 aarch64_simd_clone_usable (struct cgraph_node *node)
20429 {
20430 switch (node->simdclone->vecsize_mangle)
20431 {
20432 case 'n':
20433 if (!TARGET_SIMD)
20434 return -1;
20435 return 0;
20436 default:
20437 gcc_unreachable ();
20438 }
20439 }
20440
20441 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20442
20443 static int
20444 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20445 {
20446 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20447 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20448 return 0;
20449 return 1;
20450 }
20451
20452 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20453
20454 static const char *
20455 aarch64_get_multilib_abi_name (void)
20456 {
20457 if (TARGET_BIG_END)
20458 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20459 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20460 }
20461
20462 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20463 global-variable-based guard, use the default; otherwise
20464 return a null tree. */
20465 static tree
20466 aarch64_stack_protect_guard (void)
20467 {
20468 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20469 return default_stack_protect_guard ();
20470
20471 return NULL_TREE;
20472 }
20473
20474 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20475 section at the end if needed. */
20476 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20477 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20478 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20479 void
20480 aarch64_file_end_indicate_exec_stack ()
20481 {
20482 file_end_indicate_exec_stack ();
20483
20484 unsigned feature_1_and = 0;
20485 if (aarch64_bti_enabled ())
20486 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20487
20488 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20489 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20490
20491 if (feature_1_and)
20492 {
20493 /* Generate .note.gnu.property section. */
20494 switch_to_section (get_section (".note.gnu.property",
20495 SECTION_NOTYPE, NULL));
20496
20497 /* PT_NOTE header: namesz, descsz, type.
20498 namesz = 4 ("GNU\0")
20499 descsz = 16 (Size of the program property array)
20500 [(12 + padding) * Number of array elements]
20501 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20502 assemble_align (POINTER_SIZE);
20503 assemble_integer (GEN_INT (4), 4, 32, 1);
20504 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20505 assemble_integer (GEN_INT (5), 4, 32, 1);
20506
20507 /* PT_NOTE name. */
20508 assemble_string ("GNU", 4);
20509
20510 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20511 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20512 datasz = 4
20513 data = feature_1_and. */
20514 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20515 assemble_integer (GEN_INT (4), 4, 32, 1);
20516 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20517
20518 /* Pad the size of the note to the required alignment. */
20519 assemble_align (POINTER_SIZE);
20520 }
20521 }
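/* Schematically, with both BTI and PAC enabled the note emitted above
   corresponds to assembly along these lines (exact directives may vary):

     .section .note.gnu.property
     .p2align 3
     .word 4            // namesz ("GNU\0")
     .word 16           // descsz (ROUND_UP (12, POINTER_BYTES))
     .word 5            // NT_GNU_PROPERTY_TYPE_0
     .string "GNU"
     .word 0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word 4            // pr_datasz
     .word 3            // BTI | PAC
     .p2align 3  */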
20522 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20523 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20524 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20525
20526 /* Target-specific selftests. */
20527
20528 #if CHECKING_P
20529
20530 namespace selftest {
20531
20532 /* Selftest for the RTL loader.
20533 Verify that the RTL loader copes with a dump from
20534 print_rtx_function. This is essentially just a test that class
20535 function_reader can handle a real dump, but it also verifies
20536 that lookup_reg_by_dump_name correctly handles hard regs.
20537 The presence of hard reg names in the dump means that the test is
20538 target-specific, hence it is in this file. */
20539
20540 static void
20541 aarch64_test_loading_full_dump ()
20542 {
20543 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20544
20545 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20546
20547 rtx_insn *insn_1 = get_insn_by_uid (1);
20548 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20549
20550 rtx_insn *insn_15 = get_insn_by_uid (15);
20551 ASSERT_EQ (INSN, GET_CODE (insn_15));
20552 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20553
20554 /* Verify crtl->return_rtx. */
20555 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20556 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20557 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20558 }
20559
20560 /* Run all target-specific selftests. */
20561
20562 static void
20563 aarch64_run_selftests (void)
20564 {
20565 aarch64_test_loading_full_dump ();
20566 }
20567
20568 } // namespace selftest
20569
20570 #endif /* #if CHECKING_P */
20571
20572 #undef TARGET_STACK_PROTECT_GUARD
20573 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20574
20575 #undef TARGET_ADDRESS_COST
20576 #define TARGET_ADDRESS_COST aarch64_address_cost
20577
20578 /* This hook determines whether unnamed bitfields affect the alignment
20579 of the containing structure. The hook returns true if the structure
20580 should inherit the alignment requirements of an unnamed bitfield's
20581 type. */
20582 #undef TARGET_ALIGN_ANON_BITFIELD
20583 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20584
20585 #undef TARGET_ASM_ALIGNED_DI_OP
20586 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20587
20588 #undef TARGET_ASM_ALIGNED_HI_OP
20589 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20590
20591 #undef TARGET_ASM_ALIGNED_SI_OP
20592 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20593
20594 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20595 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20596 hook_bool_const_tree_hwi_hwi_const_tree_true
20597
20598 #undef TARGET_ASM_FILE_START
20599 #define TARGET_ASM_FILE_START aarch64_start_file
20600
20601 #undef TARGET_ASM_OUTPUT_MI_THUNK
20602 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20603
20604 #undef TARGET_ASM_SELECT_RTX_SECTION
20605 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20606
20607 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20608 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20609
20610 #undef TARGET_BUILD_BUILTIN_VA_LIST
20611 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20612
20613 #undef TARGET_CALLEE_COPIES
20614 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20615
20616 #undef TARGET_CAN_ELIMINATE
20617 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20618
20619 #undef TARGET_CAN_INLINE_P
20620 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20621
20622 #undef TARGET_CANNOT_FORCE_CONST_MEM
20623 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20624
20625 #undef TARGET_CASE_VALUES_THRESHOLD
20626 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20627
20628 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20629 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20630
20631 /* Only the least significant bit is used for initialization guard
20632 variables. */
20633 #undef TARGET_CXX_GUARD_MASK_BIT
20634 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20635
20636 #undef TARGET_C_MODE_FOR_SUFFIX
20637 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20638
20639 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20640 #undef TARGET_DEFAULT_TARGET_FLAGS
20641 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20642 #endif
20643
20644 #undef TARGET_CLASS_MAX_NREGS
20645 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20646
20647 #undef TARGET_BUILTIN_DECL
20648 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20649
20650 #undef TARGET_BUILTIN_RECIPROCAL
20651 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20652
20653 #undef TARGET_C_EXCESS_PRECISION
20654 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20655
20656 #undef TARGET_EXPAND_BUILTIN
20657 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20658
20659 #undef TARGET_EXPAND_BUILTIN_VA_START
20660 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20661
20662 #undef TARGET_FOLD_BUILTIN
20663 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20664
20665 #undef TARGET_FUNCTION_ARG
20666 #define TARGET_FUNCTION_ARG aarch64_function_arg
20667
20668 #undef TARGET_FUNCTION_ARG_ADVANCE
20669 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20670
20671 #undef TARGET_FUNCTION_ARG_BOUNDARY
20672 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20673
20674 #undef TARGET_FUNCTION_ARG_PADDING
20675 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20676
20677 #undef TARGET_GET_RAW_RESULT_MODE
20678 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20679 #undef TARGET_GET_RAW_ARG_MODE
20680 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20681
20682 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20683 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20684
20685 #undef TARGET_FUNCTION_VALUE
20686 #define TARGET_FUNCTION_VALUE aarch64_function_value
20687
20688 #undef TARGET_FUNCTION_VALUE_REGNO_P
20689 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20690
20691 #undef TARGET_GIMPLE_FOLD_BUILTIN
20692 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20693
20694 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20695 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20696
20697 #undef TARGET_INIT_BUILTINS
20698 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20699
20700 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20701 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20702 aarch64_ira_change_pseudo_allocno_class
20703
20704 #undef TARGET_LEGITIMATE_ADDRESS_P
20705 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20706
20707 #undef TARGET_LEGITIMATE_CONSTANT_P
20708 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20709
20710 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20711 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20712 aarch64_legitimize_address_displacement
20713
20714 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20715 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20716
20717 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20718 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20719 aarch64_libgcc_floating_mode_supported_p
20720
20721 #undef TARGET_MANGLE_TYPE
20722 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20723
20724 #undef TARGET_MEMORY_MOVE_COST
20725 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20726
20727 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20728 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20729
20730 #undef TARGET_MUST_PASS_IN_STACK
20731 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20732
20733 /* This target hook should return true if accesses to volatile bitfields
20734 should use the narrowest mode possible. It should return false if these
20735 accesses should use the bitfield container type. */
20736 #undef TARGET_NARROW_VOLATILE_BITFIELD
20737 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20738
20739 #undef TARGET_OPTION_OVERRIDE
20740 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20741
20742 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20743 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20744 aarch64_override_options_after_change
20745
20746 #undef TARGET_OPTION_SAVE
20747 #define TARGET_OPTION_SAVE aarch64_option_save
20748
20749 #undef TARGET_OPTION_RESTORE
20750 #define TARGET_OPTION_RESTORE aarch64_option_restore
20751
20752 #undef TARGET_OPTION_PRINT
20753 #define TARGET_OPTION_PRINT aarch64_option_print
20754
20755 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20756 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20757
20758 #undef TARGET_SET_CURRENT_FUNCTION
20759 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20760
20761 #undef TARGET_PASS_BY_REFERENCE
20762 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20763
20764 #undef TARGET_PREFERRED_RELOAD_CLASS
20765 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20766
20767 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20768 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20769
20770 #undef TARGET_PROMOTED_TYPE
20771 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20772
20773 #undef TARGET_SECONDARY_RELOAD
20774 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20775
20776 #undef TARGET_SHIFT_TRUNCATION_MASK
20777 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20778
20779 #undef TARGET_SETUP_INCOMING_VARARGS
20780 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20781
20782 #undef TARGET_STRUCT_VALUE_RTX
20783 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20784
20785 #undef TARGET_REGISTER_MOVE_COST
20786 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20787
20788 #undef TARGET_RETURN_IN_MEMORY
20789 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20790
20791 #undef TARGET_RETURN_IN_MSB
20792 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20793
20794 #undef TARGET_RTX_COSTS
20795 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20796
20797 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20798 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20799
20800 #undef TARGET_SCHED_ISSUE_RATE
20801 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20802
20803 #undef TARGET_SCHED_VARIABLE_ISSUE
20804 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
20805
20806 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20807 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20808 aarch64_sched_first_cycle_multipass_dfa_lookahead
20809
20810 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20811 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20812 aarch64_first_cycle_multipass_dfa_lookahead_guard
20813
20814 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20815 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20816 aarch64_get_separate_components
20817
20818 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20819 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20820 aarch64_components_for_bb
20821
20822 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20823 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20824 aarch64_disqualify_components
20825
20826 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20827 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20828 aarch64_emit_prologue_components
20829
20830 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20831 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20832 aarch64_emit_epilogue_components
20833
20834 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20835 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20836 aarch64_set_handled_components
20837
20838 #undef TARGET_TRAMPOLINE_INIT
20839 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20840
20841 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20842 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20843
20844 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20845 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20846
20847 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20848 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20849 aarch64_builtin_support_vector_misalignment
20850
20851 #undef TARGET_ARRAY_MODE
20852 #define TARGET_ARRAY_MODE aarch64_array_mode
20853
20854 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20855 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20856
20857 #undef TARGET_VECTORIZE_ADD_STMT_COST
20858 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20859
20860 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20861 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20862 aarch64_builtin_vectorization_cost
20863
20864 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20865 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20866
20867 #undef TARGET_VECTORIZE_BUILTINS
20868 #define TARGET_VECTORIZE_BUILTINS
20869
20870 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20871 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20872 aarch64_builtin_vectorized_function
20873
20874 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20875 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20876 aarch64_autovectorize_vector_sizes
20877
20878 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20879 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20880 aarch64_atomic_assign_expand_fenv
20881
20882 /* Section anchor support. */
20883
20884 #undef TARGET_MIN_ANCHOR_OFFSET
20885 #define TARGET_MIN_ANCHOR_OFFSET -256
20886
20887 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20888 byte offset; we can do much more for larger data types, but have no way
20889 to determine the size of the access. We assume accesses are aligned. */
20890 #undef TARGET_MAX_ANCHOR_OFFSET
20891 #define TARGET_MAX_ANCHOR_OFFSET 4095
20892
20893 #undef TARGET_VECTOR_ALIGNMENT
20894 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20895
20896 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20897 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20898 aarch64_vectorize_preferred_vector_alignment
20899 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20900 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20901 aarch64_simd_vector_alignment_reachable
20902
20903 /* vec_perm support. */
20904
20905 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20906 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20907 aarch64_vectorize_vec_perm_const
20908
20909 #undef TARGET_VECTORIZE_GET_MASK_MODE
20910 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20911 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20912 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20913 aarch64_empty_mask_is_expensive
20914 #undef TARGET_PREFERRED_ELSE_VALUE
20915 #define TARGET_PREFERRED_ELSE_VALUE \
20916 aarch64_preferred_else_value
20917
20918 #undef TARGET_INIT_LIBFUNCS
20919 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20920
20921 #undef TARGET_FIXED_CONDITION_CODE_REGS
20922 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20923
20924 #undef TARGET_FLAGS_REGNUM
20925 #define TARGET_FLAGS_REGNUM CC_REGNUM
20926
20927 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20928 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20929
20930 #undef TARGET_ASAN_SHADOW_OFFSET
20931 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20932
20933 #undef TARGET_LEGITIMIZE_ADDRESS
20934 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20935
20936 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20937 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20938
20939 #undef TARGET_CAN_USE_DOLOOP_P
20940 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20941
20942 #undef TARGET_SCHED_ADJUST_PRIORITY
20943 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20944
20945 #undef TARGET_SCHED_MACRO_FUSION_P
20946 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20947
20948 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20949 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20950
20951 #undef TARGET_SCHED_FUSION_PRIORITY
20952 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20953
20954 #undef TARGET_UNSPEC_MAY_TRAP_P
20955 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20956
20957 #undef TARGET_USE_PSEUDO_PIC_REG
20958 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20959
20960 #undef TARGET_PRINT_OPERAND
20961 #define TARGET_PRINT_OPERAND aarch64_print_operand
20962
20963 #undef TARGET_PRINT_OPERAND_ADDRESS
20964 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20965
20966 #undef TARGET_OPTAB_SUPPORTED_P
20967 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20968
20969 #undef TARGET_OMIT_STRUCT_RETURN_REG
20970 #define TARGET_OMIT_STRUCT_RETURN_REG true
20971
20972 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20973 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20974 aarch64_dwarf_poly_indeterminate_value
20975
20976 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20977 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20978 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20979
20980 #undef TARGET_HARD_REGNO_NREGS
20981 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20982 #undef TARGET_HARD_REGNO_MODE_OK
20983 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20984
20985 #undef TARGET_MODES_TIEABLE_P
20986 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20987
20988 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20989 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20990 aarch64_hard_regno_call_part_clobbered
20991
20992 #undef TARGET_INSN_CALLEE_ABI
20993 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
20994
20995 #undef TARGET_CONSTANT_ALIGNMENT
20996 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20997
20998 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20999 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
21000 aarch64_stack_clash_protection_alloca_probe_range
21001
21002 #undef TARGET_COMPUTE_PRESSURE_CLASSES
21003 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
21004
21005 #undef TARGET_CAN_CHANGE_MODE_CLASS
21006 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
21007
21008 #undef TARGET_SELECT_EARLY_REMAT_MODES
21009 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
21010
21011 #undef TARGET_SPECULATION_SAFE_VALUE
21012 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
21013
21014 #undef TARGET_ESTIMATED_POLY_VALUE
21015 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
21016
21017 #undef TARGET_ATTRIBUTE_TABLE
21018 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
21019
21020 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
21021 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
21022 aarch64_simd_clone_compute_vecsize_and_simdlen
21023
21024 #undef TARGET_SIMD_CLONE_ADJUST
21025 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
21026
21027 #undef TARGET_SIMD_CLONE_USABLE
21028 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
21029
21030 #undef TARGET_COMP_TYPE_ATTRIBUTES
21031 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
21032
21033 #undef TARGET_GET_MULTILIB_ABI_NAME
21034 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
21035
21036 #undef TARGET_FNTYPE_ABI
21037 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
21038
21039 #if CHECKING_P
21040 #undef TARGET_RUN_TARGET_SELFTESTS
21041 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
21042 #endif /* #if CHECKING_P */
21043
21044 #undef TARGET_ASM_POST_CFI_STARTPROC
21045 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21046
21047 struct gcc_target targetm = TARGET_INITIALIZER;
21048
21049 #include "gt-aarch64.h"