aarch64: Add CPU-specific SVE vector costs struct
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
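/* A sketch of how callers might describe two common immediates with the
   constructors above (invented values, not compiler output):

     - the V4HI constant { 0, 2, 4, 6 }, as an INDEX immediate:
	 simd_immediate_info info (HImode, GEN_INT (0), GEN_INT (2));

     - a MOVI of 0x25 shifted left by 8 in each HImode element:
	 simd_immediate_info info (HImode, 0x25, simd_immediate_info::MOV,
				   simd_immediate_info::LSL, 8);  */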
177 namespace {
178
179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
180 class pure_scalable_type_info
181 {
182 public:
183 /* Represents the result of analyzing a type. All values are nonzero,
184 in the possibly forlorn hope that accidental conversions to bool
185 trigger a warning. */
186 enum analysis_result
187 {
188 /* The type does not have an ABI identity; i.e. it doesn't contain
189 at least one object whose type is a Fundamental Data Type. */
190 NO_ABI_IDENTITY = 1,
191
192 /* The type is definitely a Pure Scalable Type. */
193 IS_PST,
194
195 /* The type is definitely not a Pure Scalable Type. */
196 ISNT_PST,
197
198 /* It doesn't matter for PCS purposes whether the type is a Pure
199 Scalable Type or not, since the type will be handled the same
200 way regardless.
201
202 Specifically, this means that if the type is a Pure Scalable Type,
203 there aren't enough argument registers to hold it, and so it will
204 need to be passed or returned in memory. If the type isn't a
205 Pure Scalable Type, it's too big to be passed or returned in core
206 or SIMD&FP registers, and so again will need to go in memory. */
207 DOESNT_MATTER
208 };
209
210 /* Aggregates of 17 bytes or more are normally passed and returned
211 in memory, so aggregates of that size can safely be analyzed as
212 DOESNT_MATTER. We need to be able to collect enough pieces to
213 represent a PST that is smaller than that. Since predicates are
214 2 bytes in size for -msve-vector-bits=128, that means we need to be
215 able to store at least 8 pieces.
216
217 We also need to be able to store enough pieces to represent
218 a single vector in each vector argument register and a single
219 predicate in each predicate argument register. This means that
220 we need at least 12 pieces. */
221 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222 #if __cplusplus >= 201103L
223 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224 #endif
225
226 /* Describes one piece of a PST. Each piece is one of:
227
228 - a single Scalable Vector Type (SVT)
229 - a single Scalable Predicate Type (SPT)
230 - a PST containing 2, 3 or 4 SVTs, with no padding
231
232 It either represents a single built-in type or a PST formed from
233 multiple homogeneous built-in types. */
234 struct piece
235 {
236 rtx get_rtx (unsigned int, unsigned int) const;
237
238 /* The number of vector and predicate registers that the piece
239 occupies. One of the two is always zero. */
240 unsigned int num_zr;
241 unsigned int num_pr;
242
243 /* The mode of the registers described above. */
244 machine_mode mode;
245
246 /* If this piece is formed from multiple homogeneous built-in types,
247 this is the mode of the built-in types, otherwise it is MODE. */
248 machine_mode orig_mode;
249
250 /* The offset in bytes of the piece from the start of the type. */
251 poly_uint64_pod offset;
252 };
253
254 /* Divides types analyzed as IS_PST into individual pieces. The pieces
255 are in memory order. */
256 auto_vec<piece, MAX_PIECES> pieces;
257
258 unsigned int num_zr () const;
259 unsigned int num_pr () const;
260
261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
262
263 analysis_result analyze (const_tree);
264 bool analyze_registers (const_tree);
265
266 private:
267 analysis_result analyze_array (const_tree);
268 analysis_result analyze_record (const_tree);
269 void add_piece (const piece &);
270 };
271 }
272
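/* An invented illustration of the class above (not part of the original
   sources): for a user-defined type such as

     struct pst_example { svfloat32_t v0; svfloat32_t v1; svbool_t p; };

   analyze () records three pieces -- two with num_zr == 1 (mode
   VNx4SFmode) and one with num_pr == 1 (mode VNx16BImode) -- so
   num_zr () == 2, num_pr () == 1 and the type is IS_PST.  With the
   AAPCS64's eight vector and four predicate argument registers,
   MAX_PIECES works out to 12.  */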
273 /* The current code model. */
274 enum aarch64_code_model aarch64_cmodel;
275
276 /* The number of 64-bit elements in an SVE vector. */
277 poly_uint16 aarch64_sve_vg;
278
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
283
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 const_tree,
288 machine_mode *, int *,
289 bool *, bool);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 const_tree type,
297 int misalignment,
298 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
303
304 /* Major revision number of the ARM Architecture implemented by the target. */
305 unsigned aarch64_architecture_version;
306
307 /* The processor for which instructions should be scheduled. */
308 enum aarch64_processor aarch64_tune = cortexa53;
309
310 /* Mask to specify which instruction scheduling options should be used. */
311 uint64_t aarch64_tune_flags = 0;
312
313 /* Global flag for PC relative loads. */
314 bool aarch64_pcrelative_literal_loads;
315
316 /* Global flag for whether frame pointer is enabled. */
317 bool aarch64_use_frame_pointer;
318
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
321
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
324
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
328 {
329 const char* name;
330 unsigned int flag;
331 };
332
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
336 {
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
341 };
342
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
346 {
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
351 };
352
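/* A hedged usage sketch for the tables above (option syntax abbreviated):
   -moverride=fuse=adrp+add is matched name by name against
   aarch64_fusible_pairs, OR-ing AARCH64_FUSE_ADRP_ADD into the active
   tune_params, while -moverride=tune=rename_fma_regs does the same with
   aarch64_tuning_flags and AARCH64_EXTRA_TUNE_RENAME_FMA_REGS.  */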
353 /* Tuning parameters. */
354
355 static const struct cpu_addrcost_table generic_addrcost_table =
356 {
357 {
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
362 },
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* register_offset */
366 0, /* register_sextend */
367 0, /* register_zextend */
368 0 /* imm_offset */
369 };
370
371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
372 {
373 {
374 0, /* hi */
375 0, /* si */
376 0, /* di */
377 2, /* ti */
378 },
379 0, /* pre_modify */
380 0, /* post_modify */
381 1, /* register_offset */
382 1, /* register_sextend */
383 2, /* register_zextend */
384 0, /* imm_offset */
385 };
386
387 static const struct cpu_addrcost_table xgene1_addrcost_table =
388 {
389 {
390 1, /* hi */
391 0, /* si */
392 0, /* di */
393 1, /* ti */
394 },
395 1, /* pre_modify */
396 1, /* post_modify */
397 0, /* register_offset */
398 1, /* register_sextend */
399 1, /* register_zextend */
400 0, /* imm_offset */
401 };
402
403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
404 {
405 {
406 1, /* hi */
407 1, /* si */
408 1, /* di */
409 2, /* ti */
410 },
411 0, /* pre_modify */
412 0, /* post_modify */
413 2, /* register_offset */
414 3, /* register_sextend */
415 3, /* register_zextend */
416 0, /* imm_offset */
417 };
418
419 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
420 {
421 {
422 1, /* hi */
423 1, /* si */
424 1, /* di */
425 2, /* ti */
426 },
427 0, /* pre_modify */
428 0, /* post_modify */
429 2, /* register_offset */
430 3, /* register_sextend */
431 3, /* register_zextend */
432 0, /* imm_offset */
433 };
434
435 static const struct cpu_addrcost_table tsv110_addrcost_table =
436 {
437 {
438 1, /* hi */
439 0, /* si */
440 0, /* di */
441 1, /* ti */
442 },
443 0, /* pre_modify */
444 0, /* post_modify */
445 0, /* register_offset */
446 1, /* register_sextend */
447 1, /* register_zextend */
448 0, /* imm_offset */
449 };
450
451 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
452 {
453 {
454 1, /* hi */
455 1, /* si */
456 1, /* di */
457 2, /* ti */
458 },
459 1, /* pre_modify */
460 1, /* post_modify */
461 3, /* register_offset */
462 3, /* register_sextend */
463 3, /* register_zextend */
464 2, /* imm_offset */
465 };
466
467 static const struct cpu_regmove_cost generic_regmove_cost =
468 {
469 1, /* GP2GP */
470 /* Avoid the use of slow int<->fp moves for spilling by setting
471 their cost higher than memmov_cost. */
472 5, /* GP2FP */
473 5, /* FP2GP */
474 2 /* FP2FP */
475 };
476
477 static const struct cpu_regmove_cost cortexa57_regmove_cost =
478 {
479 1, /* GP2GP */
480 /* Avoid the use of slow int<->fp moves for spilling by setting
481 their cost higher than memmov_cost. */
482 5, /* GP2FP */
483 5, /* FP2GP */
484 2 /* FP2FP */
485 };
486
487 static const struct cpu_regmove_cost cortexa53_regmove_cost =
488 {
489 1, /* GP2GP */
490 /* Avoid the use of slow int<->fp moves for spilling by setting
491 their cost higher than memmov_cost. */
492 5, /* GP2FP */
493 5, /* FP2GP */
494 2 /* FP2FP */
495 };
496
497 static const struct cpu_regmove_cost exynosm1_regmove_cost =
498 {
499 1, /* GP2GP */
500 /* Avoid the use of slow int<->fp moves for spilling by setting
501 their cost higher than memmov_cost (actual costs: 4 and 9). */
502 9, /* GP2FP */
503 9, /* FP2GP */
504 1 /* FP2FP */
505 };
506
507 static const struct cpu_regmove_cost thunderx_regmove_cost =
508 {
509 2, /* GP2GP */
510 2, /* GP2FP */
511 6, /* FP2GP */
512 4 /* FP2FP */
513 };
514
515 static const struct cpu_regmove_cost xgene1_regmove_cost =
516 {
517 1, /* GP2GP */
518 /* Avoid the use of slow int<->fp moves for spilling by setting
519 their cost higher than memmov_cost. */
520 8, /* GP2FP */
521 8, /* FP2GP */
522 2 /* FP2FP */
523 };
524
525 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
526 {
527 2, /* GP2GP */
528 /* Avoid the use of int<->fp moves for spilling. */
529 6, /* GP2FP */
530 6, /* FP2GP */
531 4 /* FP2FP */
532 };
533
534 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
535 {
536 1, /* GP2GP */
537 /* Avoid the use of int<->fp moves for spilling. */
538 5, /* GP2FP */
539 6, /* FP2GP */
540 3, /* FP2FP */
541 };
542
543 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
544 {
545 1, /* GP2GP */
546 /* Avoid the use of int<->fp moves for spilling. */
547 4, /* GP2FP */
548 5, /* FP2GP */
549 4 /* FP2FP */
550 };
551
552 static const struct cpu_regmove_cost tsv110_regmove_cost =
553 {
554 1, /* GP2GP */
555 /* Avoid the use of slow int<->fp moves for spilling by setting
556 their cost higher than memmov_cost. */
557 2, /* GP2FP */
558 3, /* FP2GP */
559 2 /* FP2FP */
560 };
561
562 /* Generic costs for Advanced SIMD vector operations. */
563 static const advsimd_vec_cost generic_advsimd_vector_cost =
564 {
565 1, /* int_stmt_cost */
566 1, /* fp_stmt_cost */
567 2, /* permute_cost */
568 2, /* vec_to_scalar_cost */
569 1, /* scalar_to_vec_cost */
570 1, /* align_load_cost */
571 1, /* unalign_load_cost */
572 1, /* unalign_store_cost */
573 1 /* store_cost */
574 };
575
576 /* Generic costs for SVE vector operations. */
577 static const sve_vec_cost generic_sve_vector_cost =
578 {
579 1, /* int_stmt_cost */
580 1, /* fp_stmt_cost */
581 2, /* permute_cost */
582 2, /* vec_to_scalar_cost */
583 1, /* scalar_to_vec_cost */
584 1, /* align_load_cost */
585 1, /* unalign_load_cost */
586 1, /* unalign_store_cost */
587 1 /* store_cost */
588 };
589
590 /* Generic costs for vector insn classes. */
591 static const struct cpu_vector_cost generic_vector_cost =
592 {
593 1, /* scalar_int_stmt_cost */
594 1, /* scalar_fp_stmt_cost */
595 1, /* scalar_load_cost */
596 1, /* scalar_store_cost */
597 3, /* cond_taken_branch_cost */
598 1, /* cond_not_taken_branch_cost */
599 &generic_advsimd_vector_cost, /* advsimd */
600 &generic_sve_vector_cost /* sve */
601 };
602
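/* A sketch of how a CPU-specific SVE cost table -- the point of this
   patch -- would be wired up ("foo" is a placeholder CPU and the numbers
   are invented):

     static const sve_vec_cost foo_sve_vector_cost =
     {
       2, 2, 3, 3, 2, 1, 1, 1, 1
     };

   with &foo_sve_vector_cost used as the "sve" field of foo's
   cpu_vector_cost.  CPUs that leave the field NULL, as most entries
   below do, have no SVE-specific numbers and their Advanced SIMD costs
   are used for SVE modes as well.  */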
603 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
604 {
605 1, /* int_stmt_cost */
606 3, /* fp_stmt_cost */
607 2, /* permute_cost */
608 1, /* vec_to_scalar_cost */
609 1, /* scalar_to_vec_cost */
610 1, /* align_load_cost */
611 1, /* unalign_load_cost */
612 1, /* unalign_store_cost */
613 1 /* store_cost */
614 };
615
616 /* QDF24XX costs for vector insn classes. */
617 static const struct cpu_vector_cost qdf24xx_vector_cost =
618 {
619 1, /* scalar_int_stmt_cost */
620 1, /* scalar_fp_stmt_cost */
621 1, /* scalar_load_cost */
622 1, /* scalar_store_cost */
623 3, /* cond_taken_branch_cost */
624 1, /* cond_not_taken_branch_cost */
625 &qdf24xx_advsimd_vector_cost, /* advsimd */
626 NULL /* sve */
627 };
628
629
630 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
631 {
632 4, /* int_stmt_cost */
633 1, /* fp_stmt_cost */
634 4, /* permute_cost */
635 2, /* vec_to_scalar_cost */
636 2, /* scalar_to_vec_cost */
637 3, /* align_load_cost */
638 5, /* unalign_load_cost */
639 5, /* unalign_store_cost */
640 1 /* store_cost */
641 };
642
643 /* ThunderX costs for vector insn classes. */
644 static const struct cpu_vector_cost thunderx_vector_cost =
645 {
646 1, /* scalar_int_stmt_cost */
647 1, /* scalar_fp_stmt_cost */
648 3, /* scalar_load_cost */
649 1, /* scalar_store_cost */
650 3, /* cond_taken_branch_cost */
651 3, /* cond_not_taken_branch_cost */
652 &thunderx_advsimd_vector_cost, /* advsimd */
653 NULL /* sve */
654 };
655
656 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
657 {
658 2, /* int_stmt_cost */
659 2, /* fp_stmt_cost */
660 2, /* permute_cost */
661 3, /* vec_to_scalar_cost */
662 2, /* scalar_to_vec_cost */
663 5, /* align_load_cost */
664 5, /* unalign_load_cost */
665 1, /* unalign_store_cost */
666 1 /* store_cost */
667 };
668
669 static const struct cpu_vector_cost tsv110_vector_cost =
670 {
671 1, /* scalar_int_stmt_cost */
672 1, /* scalar_fp_stmt_cost */
673 5, /* scalar_load_cost */
674 1, /* scalar_store_cost */
675 1, /* cond_taken_branch_cost */
676 1, /* cond_not_taken_branch_cost */
677 &tsv110_advsimd_vector_cost, /* advsimd */
678 NULL, /* sve */
679 };
680
681 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
682 {
683 2, /* int_stmt_cost */
684 2, /* fp_stmt_cost */
685 3, /* permute_cost */
686 8, /* vec_to_scalar_cost */
687 8, /* scalar_to_vec_cost */
688 4, /* align_load_cost */
689 4, /* unalign_load_cost */
690 1, /* unalign_store_cost */
691 1 /* store_cost */
692 };
693
694 /* Cortex-A57 costs for vector insn classes. */
695 static const struct cpu_vector_cost cortexa57_vector_cost =
696 {
697 1, /* scalar_int_stmt_cost */
698 1, /* scalar_fp_stmt_cost */
699 4, /* scalar_load_cost */
700 1, /* scalar_store_cost */
701 1, /* cond_taken_branch_cost */
702 1, /* cond_not_taken_branch_cost */
703 &cortexa57_advsimd_vector_cost, /* advsimd */
704 NULL /* sve */
705 };
706
707 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
708 {
709 3, /* int_stmt_cost */
710 3, /* fp_stmt_cost */
711 3, /* permute_cost */
712 3, /* vec_to_scalar_cost */
713 3, /* scalar_to_vec_cost */
714 5, /* align_load_cost */
715 5, /* unalign_load_cost */
716 1, /* unalign_store_cost */
717 1 /* store_cost */
718 };
719
720 static const struct cpu_vector_cost exynosm1_vector_cost =
721 {
722 1, /* scalar_int_stmt_cost */
723 1, /* scalar_fp_stmt_cost */
724 5, /* scalar_load_cost */
725 1, /* scalar_store_cost */
726 1, /* cond_taken_branch_cost */
727 1, /* cond_not_taken_branch_cost */
728 &exynosm1_advsimd_vector_cost, /* advsimd */
729 NULL /* sve */
730 };
731
732 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
733 {
734 2, /* int_stmt_cost */
735 2, /* fp_stmt_cost */
736 2, /* permute_cost */
737 4, /* vec_to_scalar_cost */
738 4, /* scalar_to_vec_cost */
739 10, /* align_load_cost */
740 10, /* unalign_load_cost */
741 2, /* unalign_store_cost */
742 2 /* store_cost */
743 };
744
745 /* X-Gene 1 costs for vector insn classes. */
746 static const struct cpu_vector_cost xgene1_vector_cost =
747 {
748 1, /* scalar_int_stmt_cost */
749 1, /* scalar_fp_stmt_cost */
750 5, /* scalar_load_cost */
751 1, /* scalar_store_cost */
752 2, /* cond_taken_branch_cost */
753 1, /* cond_not_taken_branch_cost */
754 &xgene1_advsimd_vector_cost, /* advsimd */
755 NULL /* sve */
756 };
757
758 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
759 {
760 4, /* int_stmt_cost */
761 5, /* fp_stmt_cost */
762 10, /* permute_cost */
763 6, /* vec_to_scalar_cost */
764 5, /* scalar_to_vec_cost */
765 4, /* align_load_cost */
766 4, /* unalign_load_cost */
767 1, /* unalign_store_cost */
768 1 /* store_cost */
769 };
770
771 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
772 static const struct cpu_vector_cost thunderx2t99_vector_cost =
773 {
774 1, /* scalar_int_stmt_cost */
775 6, /* scalar_fp_stmt_cost */
776 4, /* scalar_load_cost */
777 1, /* scalar_store_cost */
778 2, /* cond_taken_branch_cost */
779 1, /* cond_not_taken_branch_cost */
780 &thunderx2t99_advsimd_vector_cost, /* advsimd */
781 NULL /* sve */
782 };
783
784 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
785 {
786 5, /* int_stmt_cost */
787 5, /* fp_stmt_cost */
788 10, /* permute_cost */
789 5, /* vec_to_scalar_cost */
790 5, /* scalar_to_vec_cost */
791 4, /* align_load_cost */
792 4, /* unalign_load_cost */
793 4, /* unalign_store_cost */
794 4 /* store_cost */
795 };
796
797 static const struct cpu_vector_cost thunderx3t110_vector_cost =
798 {
799 1, /* scalar_int_stmt_cost */
800 5, /* scalar_fp_stmt_cost */
801 4, /* scalar_load_cost */
802 1, /* scalar_store_cost */
803 2, /* cond_taken_branch_cost */
804 1, /* cond_not_taken_branch_cost */
805 &thunderx3t110_advsimd_vector_cost, /* advsimd */
806 NULL /* sve */
807 };
808
809
810 /* Generic costs for branch instructions. */
811 static const struct cpu_branch_cost generic_branch_cost =
812 {
813 1, /* Predictable. */
814 3 /* Unpredictable. */
815 };
816
817 /* Generic approximation modes. */
818 static const cpu_approx_modes generic_approx_modes =
819 {
820 AARCH64_APPROX_NONE, /* division */
821 AARCH64_APPROX_NONE, /* sqrt */
822 AARCH64_APPROX_NONE /* recip_sqrt */
823 };
824
825 /* Approximation modes for Exynos M1. */
826 static const cpu_approx_modes exynosm1_approx_modes =
827 {
828 AARCH64_APPROX_NONE, /* division */
829 AARCH64_APPROX_ALL, /* sqrt */
830 AARCH64_APPROX_ALL /* recip_sqrt */
831 };
832
833 /* Approximation modes for X-Gene 1. */
834 static const cpu_approx_modes xgene1_approx_modes =
835 {
836 AARCH64_APPROX_NONE, /* division */
837 AARCH64_APPROX_NONE, /* sqrt */
838 AARCH64_APPROX_ALL /* recip_sqrt */
839 };
840
841 /* Generic prefetch settings (which disable prefetch). */
842 static const cpu_prefetch_tune generic_prefetch_tune =
843 {
844 0, /* num_slots */
845 -1, /* l1_cache_size */
846 -1, /* l1_cache_line_size */
847 -1, /* l2_cache_size */
848 true, /* prefetch_dynamic_strides */
849 -1, /* minimum_stride */
850 -1 /* default_opt_level */
851 };
852
853 static const cpu_prefetch_tune exynosm1_prefetch_tune =
854 {
855 0, /* num_slots */
856 -1, /* l1_cache_size */
857 64, /* l1_cache_line_size */
858 -1, /* l2_cache_size */
859 true, /* prefetch_dynamic_strides */
860 -1, /* minimum_stride */
861 -1 /* default_opt_level */
862 };
863
864 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
865 {
866 4, /* num_slots */
867 32, /* l1_cache_size */
868 64, /* l1_cache_line_size */
869 512, /* l2_cache_size */
870 false, /* prefetch_dynamic_strides */
871 2048, /* minimum_stride */
872 3 /* default_opt_level */
873 };
874
875 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
876 {
877 8, /* num_slots */
878 32, /* l1_cache_size */
879 128, /* l1_cache_line_size */
880 16*1024, /* l2_cache_size */
881 true, /* prefetch_dynamic_strides */
882 -1, /* minimum_stride */
883 3 /* default_opt_level */
884 };
885
886 static const cpu_prefetch_tune thunderx_prefetch_tune =
887 {
888 8, /* num_slots */
889 32, /* l1_cache_size */
890 128, /* l1_cache_line_size */
891 -1, /* l2_cache_size */
892 true, /* prefetch_dynamic_strides */
893 -1, /* minimum_stride */
894 -1 /* default_opt_level */
895 };
896
897 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
898 {
899 8, /* num_slots */
900 32, /* l1_cache_size */
901 64, /* l1_cache_line_size */
902 256, /* l2_cache_size */
903 true, /* prefetch_dynamic_strides */
904 -1, /* minimum_stride */
905 -1 /* default_opt_level */
906 };
907
908 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
909 {
910 8, /* num_slots */
911 32, /* l1_cache_size */
912 64, /* l1_cache_line_size */
913 256, /* l2_cache_size */
914 true, /* prefetch_dynamic_strides */
915 -1, /* minimum_stride */
916 -1 /* default_opt_level */
917 };
918
919 static const cpu_prefetch_tune tsv110_prefetch_tune =
920 {
921 0, /* num_slots */
922 64, /* l1_cache_size */
923 64, /* l1_cache_line_size */
924 512, /* l2_cache_size */
925 true, /* prefetch_dynamic_strides */
926 -1, /* minimum_stride */
927 -1 /* default_opt_level */
928 };
929
930 static const cpu_prefetch_tune xgene1_prefetch_tune =
931 {
932 8, /* num_slots */
933 32, /* l1_cache_size */
934 64, /* l1_cache_line_size */
935 256, /* l2_cache_size */
936 true, /* prefetch_dynamic_strides */
937 -1, /* minimum_stride */
938 -1 /* default_opt_level */
939 };
940
941 static const cpu_prefetch_tune a64fx_prefetch_tune =
942 {
943 8, /* num_slots */
944 64, /* l1_cache_size */
945 256, /* l1_cache_line_size */
946 32768, /* l2_cache_size */
947 true, /* prefetch_dynamic_strides */
948 -1, /* minimum_stride */
949 -1 /* default_opt_level */
950 };
951
952 static const struct tune_params generic_tunings =
953 {
954 &cortexa57_extra_costs,
955 &generic_addrcost_table,
956 &generic_regmove_cost,
957 &generic_vector_cost,
958 &generic_branch_cost,
959 &generic_approx_modes,
960 SVE_NOT_IMPLEMENTED, /* sve_width */
961 4, /* memmov_cost */
962 2, /* issue_rate */
963 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
964 "16:12", /* function_align. */
965 "4", /* jump_align. */
966 "8", /* loop_align. */
967 2, /* int_reassoc_width. */
968 4, /* fp_reassoc_width. */
969 1, /* vec_reassoc_width. */
970 2, /* min_div_recip_mul_sf. */
971 2, /* min_div_recip_mul_df. */
972 0, /* max_case_values. */
973 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
974 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
975 &generic_prefetch_tune
976 };
977
978 static const struct tune_params cortexa35_tunings =
979 {
980 &cortexa53_extra_costs,
981 &generic_addrcost_table,
982 &cortexa53_regmove_cost,
983 &generic_vector_cost,
984 &generic_branch_cost,
985 &generic_approx_modes,
986 SVE_NOT_IMPLEMENTED, /* sve_width */
987 4, /* memmov_cost */
988 1, /* issue_rate */
989 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
990 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
991 "16", /* function_align. */
992 "4", /* jump_align. */
993 "8", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 0, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1002 &generic_prefetch_tune
1003 };
1004
1005 static const struct tune_params cortexa53_tunings =
1006 {
1007 &cortexa53_extra_costs,
1008 &generic_addrcost_table,
1009 &cortexa53_regmove_cost,
1010 &generic_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 2, /* issue_rate */
1016 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1018 "16", /* function_align. */
1019 "4", /* jump_align. */
1020 "8", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1029 &generic_prefetch_tune
1030 };
1031
1032 static const struct tune_params cortexa57_tunings =
1033 {
1034 &cortexa57_extra_costs,
1035 &generic_addrcost_table,
1036 &cortexa57_regmove_cost,
1037 &cortexa57_vector_cost,
1038 &generic_branch_cost,
1039 &generic_approx_modes,
1040 SVE_NOT_IMPLEMENTED, /* sve_width */
1041 4, /* memmov_cost */
1042 3, /* issue_rate */
1043 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1045 "16", /* function_align. */
1046 "4", /* jump_align. */
1047 "8", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1056 &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params cortexa72_tunings =
1060 {
1061 &cortexa57_extra_costs,
1062 &generic_addrcost_table,
1063 &cortexa57_regmove_cost,
1064 &cortexa57_vector_cost,
1065 &generic_branch_cost,
1066 &generic_approx_modes,
1067 SVE_NOT_IMPLEMENTED, /* sve_width */
1068 4, /* memmov_cost */
1069 3, /* issue_rate */
1070 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1071 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1072 "16", /* function_align. */
1073 "4", /* jump_align. */
1074 "8", /* loop_align. */
1075 2, /* int_reassoc_width. */
1076 4, /* fp_reassoc_width. */
1077 1, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1083 &generic_prefetch_tune
1084 };
1085
1086 static const struct tune_params cortexa73_tunings =
1087 {
1088 &cortexa57_extra_costs,
1089 &generic_addrcost_table,
1090 &cortexa57_regmove_cost,
1091 &cortexa57_vector_cost,
1092 &generic_branch_cost,
1093 &generic_approx_modes,
1094 SVE_NOT_IMPLEMENTED, /* sve_width */
1095 4, /* memmov_cost. */
1096 2, /* issue_rate. */
1097 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1098 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1099 "16", /* function_align. */
1100 "4", /* jump_align. */
1101 "8", /* loop_align. */
1102 2, /* int_reassoc_width. */
1103 4, /* fp_reassoc_width. */
1104 1, /* vec_reassoc_width. */
1105 2, /* min_div_recip_mul_sf. */
1106 2, /* min_div_recip_mul_df. */
1107 0, /* max_case_values. */
1108 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1109 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1110 &generic_prefetch_tune
1111 };
1112
1113
1114
1115 static const struct tune_params exynosm1_tunings =
1116 {
1117 &exynosm1_extra_costs,
1118 &exynosm1_addrcost_table,
1119 &exynosm1_regmove_cost,
1120 &exynosm1_vector_cost,
1121 &generic_branch_cost,
1122 &exynosm1_approx_modes,
1123 SVE_NOT_IMPLEMENTED, /* sve_width */
1124 4, /* memmov_cost */
1125 3, /* issue_rate */
1126 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1127 "4", /* function_align. */
1128 "4", /* jump_align. */
1129 "4", /* loop_align. */
1130 2, /* int_reassoc_width. */
1131 4, /* fp_reassoc_width. */
1132 1, /* vec_reassoc_width. */
1133 2, /* min_div_recip_mul_sf. */
1134 2, /* min_div_recip_mul_df. */
1135 48, /* max_case_values. */
1136 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1137 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1138 &exynosm1_prefetch_tune
1139 };
1140
1141 static const struct tune_params thunderxt88_tunings =
1142 {
1143 &thunderx_extra_costs,
1144 &generic_addrcost_table,
1145 &thunderx_regmove_cost,
1146 &thunderx_vector_cost,
1147 &generic_branch_cost,
1148 &generic_approx_modes,
1149 SVE_NOT_IMPLEMENTED, /* sve_width */
1150 6, /* memmov_cost */
1151 2, /* issue_rate */
1152 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1153 "8", /* function_align. */
1154 "8", /* jump_align. */
1155 "8", /* loop_align. */
1156 2, /* int_reassoc_width. */
1157 4, /* fp_reassoc_width. */
1158 1, /* vec_reassoc_width. */
1159 2, /* min_div_recip_mul_sf. */
1160 2, /* min_div_recip_mul_df. */
1161 0, /* max_case_values. */
1162 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1163 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1164 &thunderxt88_prefetch_tune
1165 };
1166
1167 static const struct tune_params thunderx_tunings =
1168 {
1169 &thunderx_extra_costs,
1170 &generic_addrcost_table,
1171 &thunderx_regmove_cost,
1172 &thunderx_vector_cost,
1173 &generic_branch_cost,
1174 &generic_approx_modes,
1175 SVE_NOT_IMPLEMENTED, /* sve_width */
1176 6, /* memmov_cost */
1177 2, /* issue_rate */
1178 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1179 "8", /* function_align. */
1180 "8", /* jump_align. */
1181 "8", /* loop_align. */
1182 2, /* int_reassoc_width. */
1183 4, /* fp_reassoc_width. */
1184 1, /* vec_reassoc_width. */
1185 2, /* min_div_recip_mul_sf. */
1186 2, /* min_div_recip_mul_df. */
1187 0, /* max_case_values. */
1188 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1189 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1190 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1191 &thunderx_prefetch_tune
1192 };
1193
1194 static const struct tune_params tsv110_tunings =
1195 {
1196 &tsv110_extra_costs,
1197 &tsv110_addrcost_table,
1198 &tsv110_regmove_cost,
1199 &tsv110_vector_cost,
1200 &generic_branch_cost,
1201 &generic_approx_modes,
1202 SVE_NOT_IMPLEMENTED, /* sve_width */
1203 4, /* memmov_cost */
1204 4, /* issue_rate */
1205 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1206 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1207 "16", /* function_align. */
1208 "4", /* jump_align. */
1209 "8", /* loop_align. */
1210 2, /* int_reassoc_width. */
1211 4, /* fp_reassoc_width. */
1212 1, /* vec_reassoc_width. */
1213 2, /* min_div_recip_mul_sf. */
1214 2, /* min_div_recip_mul_df. */
1215 0, /* max_case_values. */
1216 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1217 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1218 &tsv110_prefetch_tune
1219 };
1220
1221 static const struct tune_params xgene1_tunings =
1222 {
1223 &xgene1_extra_costs,
1224 &xgene1_addrcost_table,
1225 &xgene1_regmove_cost,
1226 &xgene1_vector_cost,
1227 &generic_branch_cost,
1228 &xgene1_approx_modes,
1229 SVE_NOT_IMPLEMENTED, /* sve_width */
1230 6, /* memmov_cost */
1231 4, /* issue_rate */
1232 AARCH64_FUSE_NOTHING, /* fusible_ops */
1233 "16", /* function_align. */
1234 "16", /* jump_align. */
1235 "16", /* loop_align. */
1236 2, /* int_reassoc_width. */
1237 4, /* fp_reassoc_width. */
1238 1, /* vec_reassoc_width. */
1239 2, /* min_div_recip_mul_sf. */
1240 2, /* min_div_recip_mul_df. */
1241 17, /* max_case_values. */
1242 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1243 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1244 &xgene1_prefetch_tune
1245 };
1246
1247 static const struct tune_params emag_tunings =
1248 {
1249 &xgene1_extra_costs,
1250 &xgene1_addrcost_table,
1251 &xgene1_regmove_cost,
1252 &xgene1_vector_cost,
1253 &generic_branch_cost,
1254 &xgene1_approx_modes,
1255 SVE_NOT_IMPLEMENTED,
1256 6, /* memmov_cost */
1257 4, /* issue_rate */
1258 AARCH64_FUSE_NOTHING, /* fusible_ops */
1259 "16", /* function_align. */
1260 "16", /* jump_align. */
1261 "16", /* loop_align. */
1262 2, /* int_reassoc_width. */
1263 4, /* fp_reassoc_width. */
1264 1, /* vec_reassoc_width. */
1265 2, /* min_div_recip_mul_sf. */
1266 2, /* min_div_recip_mul_df. */
1267 17, /* max_case_values. */
1268 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1269 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1270 &xgene1_prefetch_tune
1271 };
1272
1273 static const struct tune_params qdf24xx_tunings =
1274 {
1275 &qdf24xx_extra_costs,
1276 &qdf24xx_addrcost_table,
1277 &qdf24xx_regmove_cost,
1278 &qdf24xx_vector_cost,
1279 &generic_branch_cost,
1280 &generic_approx_modes,
1281 SVE_NOT_IMPLEMENTED, /* sve_width */
1282 4, /* memmov_cost */
1283 4, /* issue_rate */
1284 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1285 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1286 "16", /* function_align. */
1287 "8", /* jump_align. */
1288 "16", /* loop_align. */
1289 2, /* int_reassoc_width. */
1290 4, /* fp_reassoc_width. */
1291 1, /* vec_reassoc_width. */
1292 2, /* min_div_recip_mul_sf. */
1293 2, /* min_div_recip_mul_df. */
1294 0, /* max_case_values. */
1295 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1296 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1297 &qdf24xx_prefetch_tune
1298 };
1299
1300 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1301 for now. */
1302 static const struct tune_params saphira_tunings =
1303 {
1304 &generic_extra_costs,
1305 &generic_addrcost_table,
1306 &generic_regmove_cost,
1307 &generic_vector_cost,
1308 &generic_branch_cost,
1309 &generic_approx_modes,
1310 SVE_NOT_IMPLEMENTED, /* sve_width */
1311 4, /* memmov_cost */
1312 4, /* issue_rate */
1313 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1314 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1315 "16", /* function_align. */
1316 "8", /* jump_align. */
1317 "16", /* loop_align. */
1318 2, /* int_reassoc_width. */
1319 4, /* fp_reassoc_width. */
1320 1, /* vec_reassoc_width. */
1321 2, /* min_div_recip_mul_sf. */
1322 2, /* min_div_recip_mul_df. */
1323 0, /* max_case_values. */
1324 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1325 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1326 &generic_prefetch_tune
1327 };
1328
1329 static const struct tune_params thunderx2t99_tunings =
1330 {
1331 &thunderx2t99_extra_costs,
1332 &thunderx2t99_addrcost_table,
1333 &thunderx2t99_regmove_cost,
1334 &thunderx2t99_vector_cost,
1335 &generic_branch_cost,
1336 &generic_approx_modes,
1337 SVE_NOT_IMPLEMENTED, /* sve_width */
1338 4, /* memmov_cost. */
1339 4, /* issue_rate. */
1340 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1341 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1342 "16", /* function_align. */
1343 "8", /* jump_align. */
1344 "16", /* loop_align. */
1345 3, /* int_reassoc_width. */
1346 2, /* fp_reassoc_width. */
1347 2, /* vec_reassoc_width. */
1348 2, /* min_div_recip_mul_sf. */
1349 2, /* min_div_recip_mul_df. */
1350 0, /* max_case_values. */
1351 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1352 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1353 &thunderx2t99_prefetch_tune
1354 };
1355
1356 static const struct tune_params thunderx3t110_tunings =
1357 {
1358 &thunderx3t110_extra_costs,
1359 &thunderx3t110_addrcost_table,
1360 &thunderx3t110_regmove_cost,
1361 &thunderx3t110_vector_cost,
1362 &generic_branch_cost,
1363 &generic_approx_modes,
1364 SVE_NOT_IMPLEMENTED, /* sve_width */
1365 4, /* memmov_cost. */
1366 6, /* issue_rate. */
1367 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1368 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1369 "16", /* function_align. */
1370 "8", /* jump_align. */
1371 "16", /* loop_align. */
1372 3, /* int_reassoc_width. */
1373 2, /* fp_reassoc_width. */
1374 2, /* vec_reassoc_width. */
1375 2, /* min_div_recip_mul_sf. */
1376 2, /* min_div_recip_mul_df. */
1377 0, /* max_case_values. */
1378 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1379 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1380 &thunderx3t110_prefetch_tune
1381 };
1382
1383 static const struct tune_params neoversen1_tunings =
1384 {
1385 &cortexa76_extra_costs,
1386 &generic_addrcost_table,
1387 &generic_regmove_cost,
1388 &cortexa57_vector_cost,
1389 &generic_branch_cost,
1390 &generic_approx_modes,
1391 SVE_NOT_IMPLEMENTED, /* sve_width */
1392 4, /* memmov_cost */
1393 3, /* issue_rate */
1394 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1395 "32:16", /* function_align. */
1396 "4", /* jump_align. */
1397 "32:16", /* loop_align. */
1398 2, /* int_reassoc_width. */
1399 4, /* fp_reassoc_width. */
1400 2, /* vec_reassoc_width. */
1401 2, /* min_div_recip_mul_sf. */
1402 2, /* min_div_recip_mul_df. */
1403 0, /* max_case_values. */
1404 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1405 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1406 &generic_prefetch_tune
1407 };
1408
1409 static const struct tune_params neoversev1_tunings =
1410 {
1411 &cortexa76_extra_costs,
1412 &generic_addrcost_table,
1413 &generic_regmove_cost,
1414 &cortexa57_vector_cost,
1415 &generic_branch_cost,
1416 &generic_approx_modes,
1417 SVE_256, /* sve_width */
1418 4, /* memmov_cost */
1419 3, /* issue_rate */
1420 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1421 "32:16", /* function_align. */
1422 "4", /* jump_align. */
1423 "32:16", /* loop_align. */
1424 2, /* int_reassoc_width. */
1425 4, /* fp_reassoc_width. */
1426 2, /* vec_reassoc_width. */
1427 2, /* min_div_recip_mul_sf. */
1428 2, /* min_div_recip_mul_df. */
1429 0, /* max_case_values. */
1430 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1431 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1432 &generic_prefetch_tune
1433 };
1434
1435 static const struct tune_params neoversen2_tunings =
1436 {
1437 &cortexa76_extra_costs,
1438 &generic_addrcost_table,
1439 &generic_regmove_cost,
1440 &cortexa57_vector_cost,
1441 &generic_branch_cost,
1442 &generic_approx_modes,
1443 SVE_128, /* sve_width */
1444 4, /* memmov_cost */
1445 3, /* issue_rate */
1446 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1447 "32:16", /* function_align. */
1448 "4", /* jump_align. */
1449 "32:16", /* loop_align. */
1450 2, /* int_reassoc_width. */
1451 4, /* fp_reassoc_width. */
1452 2, /* vec_reassoc_width. */
1453 2, /* min_div_recip_mul_sf. */
1454 2, /* min_div_recip_mul_df. */
1455 0, /* max_case_values. */
1456 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1457 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1458 &generic_prefetch_tune
1459 };
1460
1461 static const struct tune_params a64fx_tunings =
1462 {
1463 &generic_extra_costs,
1464 &generic_addrcost_table,
1465 &generic_regmove_cost,
1466 &generic_vector_cost,
1467 &generic_branch_cost,
1468 &generic_approx_modes,
1469 SVE_512, /* sve_width */
1470 4, /* memmov_cost */
1471 7, /* issue_rate */
1472 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1473 "32", /* function_align. */
1474 "16", /* jump_align. */
1475 "32", /* loop_align. */
1476 4, /* int_reassoc_width. */
1477 2, /* fp_reassoc_width. */
1478 2, /* vec_reassoc_width. */
1479 2, /* min_div_recip_mul_sf. */
1480 2, /* min_div_recip_mul_df. */
1481 0, /* max_case_values. */
1482 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1483 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1484 &a64fx_prefetch_tune
1485 };
1486
1487 /* Support for fine-grained override of the tuning structures. */
1488 struct aarch64_tuning_override_function
1489 {
1490 const char* name;
1491 void (*parse_override)(const char*, struct tune_params*);
1492 };
1493
1494 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1495 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1496 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1497
1498 static const struct aarch64_tuning_override_function
1499 aarch64_tuning_override_functions[] =
1500 {
1501 { "fuse", aarch64_parse_fuse_string },
1502 { "tune", aarch64_parse_tune_string },
1503 { "sve_width", aarch64_parse_sve_width_string },
1504 { NULL, NULL }
1505 };
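/* A usage sketch for the table above: -moverride=sve_width=512 matches
   the "sve_width" entry, so aarch64_parse_sve_width_string is called
   with "512" and updates the active tune_params; a name that matches no
   entry leaves the tuning structure untouched.  */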
1506
1507 /* A processor implementing AArch64. */
1508 struct processor
1509 {
1510 const char *const name;
1511 enum aarch64_processor ident;
1512 enum aarch64_processor sched_core;
1513 enum aarch64_arch arch;
1514 unsigned architecture_version;
1515 const uint64_t flags;
1516 const struct tune_params *const tune;
1517 };
1518
1519 /* Architectures implementing AArch64. */
1520 static const struct processor all_architectures[] =
1521 {
1522 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1523 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1524 #include "aarch64-arches.def"
1525 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1526 };
1527
1528 /* Processor cores implementing AArch64. */
1529 static const struct processor all_cores[] =
1530 {
1531 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1532 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1533 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1534 FLAGS, &COSTS##_tunings},
1535 #include "aarch64-cores.def"
1536 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1537 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1538 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1539 };
1540
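/* As an example of the macro expansion above (sketched from a typical
   aarch64-cores.def entry; the trailing implementer/part/variant fields
   are omitted here):

     AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A,
		  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, ...)

   produces an all_cores element whose arch is AARCH64_ARCH_8A and whose
   tune field points at cortexa57_tunings defined earlier in this file.  */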
1541
1542 /* Target specification. These are populated by the -march, -mtune, -mcpu
1543 handling code or by target attributes. */
1544 static const struct processor *selected_arch;
1545 static const struct processor *selected_cpu;
1546 static const struct processor *selected_tune;
1547
1548 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1549
1550 /* The current tuning set. */
1551 struct tune_params aarch64_tune_params = generic_tunings;
1552
1553 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1554
1555 static tree
1556 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1557 int, bool *no_add_attrs)
1558 {
1559 /* Since we set fn_type_req to true, the caller should have checked
1560 this for us. */
1561 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1562 switch ((arm_pcs) fntype_abi (*node).id ())
1563 {
1564 case ARM_PCS_AAPCS64:
1565 case ARM_PCS_SIMD:
1566 return NULL_TREE;
1567
1568 case ARM_PCS_SVE:
1569 error ("the %qE attribute cannot be applied to an SVE function type",
1570 name);
1571 *no_add_attrs = true;
1572 return NULL_TREE;
1573
1574 case ARM_PCS_TLSDESC:
1575 case ARM_PCS_UNKNOWN:
1576 break;
1577 }
1578 gcc_unreachable ();
1579 }
1580
1581 /* Table of machine attributes. */
1582 static const struct attribute_spec aarch64_attribute_table[] =
1583 {
1584 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1585 affects_type_identity, handler, exclude } */
1586 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1587 handle_aarch64_vector_pcs_attribute, NULL },
1588 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1589 aarch64_sve::handle_arm_sve_vector_bits_attribute,
1590 NULL },
1591 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
1592 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1593 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
1594 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1595 };
1596
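/* For example (an illustrative declaration, not taken from this file),
   the handler above accepts

     void f (float *x, float *y) __attribute__ ((aarch64_vector_pcs));

   because the function type's ABI classifies as ARM_PCS_AAPCS64 or
   ARM_PCS_SIMD, whereas a function type with an SVE ABI is diagnosed
   and the attribute is dropped.  */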
1597 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1598
1599 /* An ISA extension in the co-processor and main instruction set space. */
1600 struct aarch64_option_extension
1601 {
1602 const char *const name;
1603 const unsigned long flags_on;
1604 const unsigned long flags_off;
1605 };
1606
1607 typedef enum aarch64_cond_code
1608 {
1609 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1610 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1611 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1612 }
1613 aarch64_cc;
1614
1615 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1616
1617 struct aarch64_branch_protect_type
1618 {
1619 /* The type's name that the user passes to the branch-protection option
1620 string. */
1621 const char* name;
1622 /* Function to handle the protection type and set global variables.
1623 First argument is the string token corresponding to this type and the
1624 second argument is the next token in the option string.
1625 Return values:
1626 * AARCH64_PARSE_OK: Handling was successful.
1627 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1628 caller should print an error.
1629 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1630 prints its own error. */
1631 enum aarch64_parse_opt_result (*handler)(char*, char*);
1632 /* A list of types that can follow this type in the option string. */
1633 const aarch64_branch_protect_type* subtypes;
1634 unsigned int num_subtypes;
1635 };
1636
1637 static enum aarch64_parse_opt_result
1638 aarch64_handle_no_branch_protection (char* str, char* rest)
1639 {
1640 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1641 aarch64_enable_bti = 0;
1642 if (rest)
1643 {
1644 error ("unexpected %<%s%> after %<%s%>", rest, str);
1645 return AARCH64_PARSE_INVALID_FEATURE;
1646 }
1647 return AARCH64_PARSE_OK;
1648 }
1649
1650 static enum aarch64_parse_opt_result
1651 aarch64_handle_standard_branch_protection (char* str, char* rest)
1652 {
1653 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1654 aarch64_ra_sign_key = AARCH64_KEY_A;
1655 aarch64_enable_bti = 1;
1656 if (rest)
1657 {
1658 error ("unexpected %<%s%> after %<%s%>", rest, str);
1659 return AARCH64_PARSE_INVALID_FEATURE;
1660 }
1661 return AARCH64_PARSE_OK;
1662 }
1663
1664 static enum aarch64_parse_opt_result
1665 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1666 char* rest ATTRIBUTE_UNUSED)
1667 {
1668 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1669 aarch64_ra_sign_key = AARCH64_KEY_A;
1670 return AARCH64_PARSE_OK;
1671 }
1672
1673 static enum aarch64_parse_opt_result
1674 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1675 char* rest ATTRIBUTE_UNUSED)
1676 {
1677 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1678 return AARCH64_PARSE_OK;
1679 }
1680
1681 static enum aarch64_parse_opt_result
1682 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1683 char* rest ATTRIBUTE_UNUSED)
1684 {
1685 aarch64_ra_sign_key = AARCH64_KEY_B;
1686 return AARCH64_PARSE_OK;
1687 }
1688
1689 static enum aarch64_parse_opt_result
1690 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1691 char* rest ATTRIBUTE_UNUSED)
1692 {
1693 aarch64_enable_bti = 1;
1694 return AARCH64_PARSE_OK;
1695 }
1696
1697 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1698 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1699 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1700 { NULL, NULL, NULL, 0 }
1701 };
1702
1703 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1704 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1705 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1706 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1707 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1708 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1709 { NULL, NULL, NULL, 0 }
1710 };
1711
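/* An illustrative walk through the tables above:
   -mbranch-protection=pac-ret+leaf first matches "pac-ret", whose
   handler sets aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF, and
   the "leaf" subtype then widens the scope to AARCH64_FUNCTION_ALL;
   "standard" enables both pac-ret (A key) and BTI in one go.  */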
1712 /* The condition codes of the processor, and the inverse function. */
1713 static const char * const aarch64_condition_codes[] =
1714 {
1715 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1716 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1717 };
1718
1719 /* The preferred condition codes for SVE conditions. */
1720 static const char *const aarch64_sve_condition_codes[] =
1721 {
1722 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1723 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1724 };
1725
1726 /* Return the assembly token for svpattern value VALUE. */
1727
1728 static const char *
1729 svpattern_token (enum aarch64_svpattern pattern)
1730 {
1731 switch (pattern)
1732 {
1733 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1734 AARCH64_FOR_SVPATTERN (CASE)
1735 #undef CASE
1736 case AARCH64_NUM_SVPATTERNS:
1737 break;
1738 }
1739 gcc_unreachable ();
1740 }
1741
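/* For instance, svpattern_token (AARCH64_SV_ALL) returns "all" and
   svpattern_token (AARCH64_SV_VL4) returns "vl4", matching the
   assembler spellings of the SVE predicate patterns.  */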
1742 /* Return the location of a piece that is known to be passed or returned
1743 in registers. FIRST_ZR is the first unused vector argument register
1744 and FIRST_PR is the first unused predicate argument register. */
1745
1746 rtx
1747 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1748 unsigned int first_pr) const
1749 {
1750 gcc_assert (VECTOR_MODE_P (mode)
1751 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1752 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1753
1754 if (num_zr > 0 && num_pr == 0)
1755 return gen_rtx_REG (mode, first_zr);
1756
1757 if (num_zr == 0 && num_pr == 1)
1758 return gen_rtx_REG (mode, first_pr);
1759
1760 gcc_unreachable ();
1761 }
1762
1763 /* Return the total number of vector registers required by the PST. */
1764
1765 unsigned int
1766 pure_scalable_type_info::num_zr () const
1767 {
1768 unsigned int res = 0;
1769 for (unsigned int i = 0; i < pieces.length (); ++i)
1770 res += pieces[i].num_zr;
1771 return res;
1772 }
1773
1774 /* Return the total number of predicate registers required by the PST. */
1775
1776 unsigned int
1777 pure_scalable_type_info::num_pr () const
1778 {
1779 unsigned int res = 0;
1780 for (unsigned int i = 0; i < pieces.length (); ++i)
1781 res += pieces[i].num_pr;
1782 return res;
1783 }
1784
1785 /* Return the location of a PST that is known to be passed or returned
1786 in registers. FIRST_ZR is the first unused vector argument register
1787 and FIRST_PR is the first unused predicate argument register. */
1788
1789 rtx
1790 pure_scalable_type_info::get_rtx (machine_mode mode,
1791 unsigned int first_zr,
1792 unsigned int first_pr) const
1793 {
1794 /* Try to return a single REG if possible. This leads to better
1795 code generation; it isn't required for correctness. */
1796 if (mode == pieces[0].mode)
1797 {
1798 gcc_assert (pieces.length () == 1);
1799 return pieces[0].get_rtx (first_zr, first_pr);
1800 }
1801
1802 /* Build up a PARALLEL that contains the individual pieces. */
1803 rtvec rtxes = rtvec_alloc (pieces.length ());
1804 for (unsigned int i = 0; i < pieces.length (); ++i)
1805 {
1806 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1807 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1808 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1809 first_zr += pieces[i].num_zr;
1810 first_pr += pieces[i].num_pr;
1811 }
1812 return gen_rtx_PARALLEL (mode, rtxes);
1813 }
1814
1815 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1816 in the AAPCS64. */
1817
1818 pure_scalable_type_info::analysis_result
1819 pure_scalable_type_info::analyze (const_tree type)
1820 {
1821 /* Prevent accidental reuse. */
1822 gcc_assert (pieces.is_empty ());
1823
1824 /* No code will be generated for erroneous types, so we won't establish
1825 an ABI mapping. */
1826 if (type == error_mark_node)
1827 return NO_ABI_IDENTITY;
1828
1829 /* Zero-sized types disappear in the language->ABI mapping. */
1830 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1831 return NO_ABI_IDENTITY;
1832
1833 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1834 piece p = {};
1835 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1836 {
1837 machine_mode mode = TYPE_MODE_RAW (type);
1838 gcc_assert (VECTOR_MODE_P (mode)
1839 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1840
1841 p.mode = p.orig_mode = mode;
1842 add_piece (p);
1843 return IS_PST;
1844 }
1845
1846 /* Check for user-defined PSTs. */
1847 if (TREE_CODE (type) == ARRAY_TYPE)
1848 return analyze_array (type);
1849 if (TREE_CODE (type) == RECORD_TYPE)
1850 return analyze_record (type);
1851
1852 return ISNT_PST;
1853 }
1854
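/* A rough summary of the analysis above for common cases, using the ACLE
   type names purely as illustrations:

     svfloat32_t (an SVT)		 -> IS_PST, 1 vector register
     svbool_t (an SPT)			 -> IS_PST, 1 predicate register
     svfloat32x3_t (a built-in tuple)	 -> IS_PST, 3 vector registers
     an aggregate whose members all have
       such types			 -> IS_PST
     an aggregate that also contains,
       say, an int64_t member		 -> ISNT_PST
     a zero-sized type			 -> NO_ABI_IDENTITY

   The authoritative rules are the AAPCS64 ones; this table is only a
   sketch.  */
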
1855 /* Analyze a type that is known not to be passed or returned in memory.
1856 Return true if it has an ABI identity and is a Pure Scalable Type. */
1857
1858 bool
1859 pure_scalable_type_info::analyze_registers (const_tree type)
1860 {
1861 analysis_result result = analyze (type);
1862 gcc_assert (result != DOESNT_MATTER);
1863 return result == IS_PST;
1864 }
1865
1866 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1867
1868 pure_scalable_type_info::analysis_result
1869 pure_scalable_type_info::analyze_array (const_tree type)
1870 {
1871 /* Analyze the element type. */
1872 pure_scalable_type_info element_info;
1873 analysis_result result = element_info.analyze (TREE_TYPE (type));
1874 if (result != IS_PST)
1875 return result;
1876
1877 /* An array of unknown, flexible or variable length will be passed and
1878 returned by reference whatever we do. */
1879 tree nelts_minus_one = array_type_nelts (type);
1880 if (!tree_fits_uhwi_p (nelts_minus_one))
1881 return DOESNT_MATTER;
1882
1883 /* Likewise if the array is constant-sized but too big to be interesting.
1884 The double checks against MAX_PIECES are to protect against overflow. */
1885 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1886 if (count > MAX_PIECES)
1887 return DOESNT_MATTER;
1888 count += 1;
1889 if (count * element_info.pieces.length () > MAX_PIECES)
1890 return DOESNT_MATTER;
1891
1892 /* The above checks should have weeded out elements of unknown size. */
1893 poly_uint64 element_bytes;
1894 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1895 gcc_unreachable ();
1896
1897 /* Build up the list of individual vectors and predicates. */
1898 gcc_assert (!element_info.pieces.is_empty ());
1899 for (unsigned int i = 0; i < count; ++i)
1900 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1901 {
1902 piece p = element_info.pieces[j];
1903 p.offset += i * element_bytes;
1904 add_piece (p);
1905 }
1906 return IS_PST;
1907 }
1908
1909 /* Subroutine of analyze for handling RECORD_TYPEs. */
1910
1911 pure_scalable_type_info::analysis_result
1912 pure_scalable_type_info::analyze_record (const_tree type)
1913 {
1914 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1915 {
1916 if (TREE_CODE (field) != FIELD_DECL)
1917 continue;
1918
1919 /* Zero-sized fields disappear in the language->ABI mapping. */
1920 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1921 continue;
1922
1923 /* All fields with an ABI identity must be PSTs for the record as
1924 a whole to be a PST. If any individual field is too big to be
1925 interesting then the record is too. */
1926 pure_scalable_type_info field_info;
1927 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1928 if (subresult == NO_ABI_IDENTITY)
1929 continue;
1930 if (subresult != IS_PST)
1931 return subresult;
1932
1933 /* Since all previous fields are PSTs, we ought to be able to track
1934 the field offset using poly_ints. */
1935 tree bitpos = bit_position (field);
1936 gcc_assert (poly_int_tree_p (bitpos));
1937
1938 /* For the same reason, it shouldn't be possible to create a PST field
1939 whose offset isn't byte-aligned. */
1940 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1941 BITS_PER_UNIT);
1942
1943 /* Punt if the record is too big to be interesting. */
1944 poly_uint64 bytepos;
1945 if (!wide_bytepos.to_uhwi (&bytepos)
1946 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1947 return DOESNT_MATTER;
1948
1949 /* Add the individual vectors and predicates in the field to the
1950 record's list. */
1951 gcc_assert (!field_info.pieces.is_empty ());
1952 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1953 {
1954 piece p = field_info.pieces[i];
1955 p.offset += bytepos;
1956 add_piece (p);
1957 }
1958 }
1959 /* Empty structures disappear in the language->ABI mapping. */
1960 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1961 }
1962
1963 /* Add P to the list of pieces in the type. */
1964
1965 void
1966 pure_scalable_type_info::add_piece (const piece &p)
1967 {
1968 /* Try to fold the new piece into the previous one to form a
1969 single-mode PST. For example, if we see three consecutive vectors
1970 of the same mode, we can represent them using the corresponding
1971 3-tuple mode.
1972
1973 This is purely an optimization. */
1974 if (!pieces.is_empty ())
1975 {
1976 piece &prev = pieces.last ();
1977 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1978 unsigned int nelems1, nelems2;
1979 if (prev.orig_mode == p.orig_mode
1980 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1981 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1982 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1983 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1984 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1985 && targetm.array_mode (p.orig_mode,
1986 nelems1 + nelems2).exists (&prev.mode))
1987 {
1988 prev.num_zr += p.num_zr;
1989 prev.num_pr += p.num_pr;
1990 return;
1991 }
1992 }
1993 pieces.quick_push (p);
1994 }
1995
1996 /* Return true if at least one possible value of type TYPE includes at
1997 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1998
1999 This is a relatively expensive test for some types, so it should
2000 generally be made as late as possible. */
2001
2002 static bool
2003 aarch64_some_values_include_pst_objects_p (const_tree type)
2004 {
2005 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2006 return false;
2007
2008 if (aarch64_sve::builtin_type_p (type))
2009 return true;
2010
2011 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2012 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2013
2014 if (RECORD_OR_UNION_TYPE_P (type))
2015 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2016 if (TREE_CODE (field) == FIELD_DECL
2017 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2018 return true;
2019
2020 return false;
2021 }
2022
2023 /* Return the descriptor of the SIMD ABI. */
2024
2025 static const predefined_function_abi &
2026 aarch64_simd_abi (void)
2027 {
2028 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2029 if (!simd_abi.initialized_p ())
2030 {
2031 HARD_REG_SET full_reg_clobbers
2032 = default_function_abi.full_reg_clobbers ();
2033 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2034 if (FP_SIMD_SAVED_REGNUM_P (regno))
2035 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2036 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2037 }
2038 return simd_abi;
2039 }
2040
2041 /* Return the descriptor of the SVE PCS. */
2042
2043 static const predefined_function_abi &
2044 aarch64_sve_abi (void)
2045 {
2046 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2047 if (!sve_abi.initialized_p ())
2048 {
2049 HARD_REG_SET full_reg_clobbers
2050 = default_function_abi.full_reg_clobbers ();
2051 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2052 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2053 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
2054 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2055 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2056 }
2057 return sve_abi;
2058 }
2059
2060 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2061 wraps, otherwise return X itself. */
2062
2063 static rtx
2064 strip_salt (rtx x)
2065 {
2066 rtx search = x;
2067 if (GET_CODE (search) == CONST)
2068 search = XEXP (search, 0);
2069 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2070 x = XVECEXP (search, 0, 0);
2071 return x;
2072 }
2073
2074 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2075 expression. */
2076
2077 static rtx
2078 strip_offset_and_salt (rtx addr, poly_int64 *offset)
2079 {
2080 return strip_salt (strip_offset (addr, offset));
2081 }
2082
2083 /* Generate code to enable conditional branches in functions larger than 1 MiB. */
2084 const char *
2085 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2086 const char * branch_format)
2087 {
2088 rtx_code_label * tmp_label = gen_label_rtx ();
2089 char label_buf[256];
2090 char buffer[128];
2091 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
2092 CODE_LABEL_NUMBER (tmp_label));
2093 const char *label_ptr = targetm.strip_name_encoding (label_buf);
2094 rtx dest_label = operands[pos_label];
2095 operands[pos_label] = tmp_label;
2096
2097 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
2098 output_asm_insn (buffer, operands);
2099
2100 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
2101 operands[pos_label] = dest_label;
2102 output_asm_insn (buffer, operands);
2103 return "";
2104 }
2105
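/* Roughly speaking, the function above emits a sequence of the form
   (illustrative assembly only):

	<BRANCH_FORMAT>	.Ltmp			// caller-supplied inverted branch
	b	<original destination>		// unconditional, +/-128 MiB range
     .Ltmp:

   i.e. the caller-supplied short conditional branch skips over an
   unconditional branch to the far destination, where .Ltmp stands for the
   internal label built from DEST.  */
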
2106 void
2107 aarch64_err_no_fpadvsimd (machine_mode mode)
2108 {
2109 if (TARGET_GENERAL_REGS_ONLY)
2110 if (FLOAT_MODE_P (mode))
2111 error ("%qs is incompatible with the use of floating-point types",
2112 "-mgeneral-regs-only");
2113 else
2114 error ("%qs is incompatible with the use of vector types",
2115 "-mgeneral-regs-only");
2116 else
2117 if (FLOAT_MODE_P (mode))
2118 error ("%qs feature modifier is incompatible with the use of"
2119 " floating-point types", "+nofp");
2120 else
2121 error ("%qs feature modifier is incompatible with the use of"
2122 " vector types", "+nofp");
2123 }
2124
2125 /* Report when we try to do something that requires SVE when SVE is disabled.
2126 This is an error of last resort and isn't very high-quality. It usually
2127 involves attempts to measure the vector length in some way. */
2128 static void
2129 aarch64_report_sve_required (void)
2130 {
2131 static bool reported_p = false;
2132
2133 /* Avoid reporting a slew of messages for a single oversight. */
2134 if (reported_p)
2135 return;
2136
2137 error ("this operation requires the SVE ISA extension");
2138 inform (input_location, "you can enable SVE using the command-line"
2139 " option %<-march%>, or by using the %<target%>"
2140 " attribute or pragma");
2141 reported_p = true;
2142 }
2143
2144 /* Return true if REGNO is P0-P15 or one of the special FFR-related
2145 registers. */
2146 inline bool
2147 pr_or_ffr_regnum_p (unsigned int regno)
2148 {
2149 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2150 }
2151
2152 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2153 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2154 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2155 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2156 and GENERAL_REGS is lower than the memory cost (in this case the best class
2157 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
2158 cost results in bad allocations with many redundant int<->FP moves, which
2159 are expensive on various cores.
2160 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2161 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
2162 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
2163 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
2164 The result of this is that it is no longer inefficient to have a higher
2165 memory move cost than the register move cost.
2166 */
2167
2168 static reg_class_t
2169 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2170 reg_class_t best_class)
2171 {
2172 machine_mode mode;
2173
2174 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2175 || !reg_class_subset_p (FP_REGS, allocno_class))
2176 return allocno_class;
2177
2178 if (!reg_class_subset_p (GENERAL_REGS, best_class)
2179 || !reg_class_subset_p (FP_REGS, best_class))
2180 return best_class;
2181
2182 mode = PSEUDO_REGNO_MODE (regno);
2183 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2184 }
2185
2186 static unsigned int
2187 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2188 {
2189 if (GET_MODE_UNIT_SIZE (mode) == 4)
2190 return aarch64_tune_params.min_div_recip_mul_sf;
2191 return aarch64_tune_params.min_div_recip_mul_df;
2192 }
2193
2194 /* Return the reassociation width of treeop OPC with mode MODE. */
2195 static int
2196 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2197 {
2198 if (VECTOR_MODE_P (mode))
2199 return aarch64_tune_params.vec_reassoc_width;
2200 if (INTEGRAL_MODE_P (mode))
2201 return aarch64_tune_params.int_reassoc_width;
2202 /* Avoid reassociating floating point addition so we emit more FMAs. */
2203 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2204 return aarch64_tune_params.fp_reassoc_width;
2205 return 1;
2206 }
2207
2208 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
2209 unsigned
2210 aarch64_dbx_register_number (unsigned regno)
2211 {
2212 if (GP_REGNUM_P (regno))
2213 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2214 else if (regno == SP_REGNUM)
2215 return AARCH64_DWARF_SP;
2216 else if (FP_REGNUM_P (regno))
2217 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2218 else if (PR_REGNUM_P (regno))
2219 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2220 else if (regno == VG_REGNUM)
2221 return AARCH64_DWARF_VG;
2222
2223 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2224 equivalent DWARF register. */
2225 return DWARF_FRAME_REGISTERS;
2226 }
2227
2228 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2229 integer, otherwise return X unmodified. */
2230 static rtx
2231 aarch64_bit_representation (rtx x)
2232 {
2233 if (CONST_DOUBLE_P (x))
2234 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2235 return x;
2236 }
2237
2238 /* Return true if MODE is any of the Advanced SIMD structure modes. */
2239 static bool
2240 aarch64_advsimd_struct_mode_p (machine_mode mode)
2241 {
2242 return (TARGET_SIMD
2243 && (mode == OImode || mode == CImode || mode == XImode));
2244 }
2245
2246 /* Return true if MODE is an SVE predicate mode. */
2247 static bool
2248 aarch64_sve_pred_mode_p (machine_mode mode)
2249 {
2250 return (TARGET_SVE
2251 && (mode == VNx16BImode
2252 || mode == VNx8BImode
2253 || mode == VNx4BImode
2254 || mode == VNx2BImode));
2255 }
2256
2257 /* Three mutually-exclusive flags describing a vector or predicate type. */
2258 const unsigned int VEC_ADVSIMD = 1;
2259 const unsigned int VEC_SVE_DATA = 2;
2260 const unsigned int VEC_SVE_PRED = 4;
2261 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2262 a structure of 2, 3 or 4 vectors. */
2263 const unsigned int VEC_STRUCT = 8;
2264 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2265 vector has fewer significant bytes than a full SVE vector. */
2266 const unsigned int VEC_PARTIAL = 16;
2267 /* Useful combinations of the above. */
2268 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2269 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2270
2271 /* Return a set of flags describing the vector properties of mode MODE.
2272 Ignore modes that are not supported by the current target. */
2273 static unsigned int
2274 aarch64_classify_vector_mode (machine_mode mode)
2275 {
2276 if (aarch64_advsimd_struct_mode_p (mode))
2277 return VEC_ADVSIMD | VEC_STRUCT;
2278
2279 if (aarch64_sve_pred_mode_p (mode))
2280 return VEC_SVE_PRED;
2281
2282 /* Make the decision based on the mode's enum value rather than its
2283 properties, so that we keep the correct classification regardless
2284 of -msve-vector-bits. */
2285 switch (mode)
2286 {
2287 /* Partial SVE QI vectors. */
2288 case E_VNx2QImode:
2289 case E_VNx4QImode:
2290 case E_VNx8QImode:
2291 /* Partial SVE HI vectors. */
2292 case E_VNx2HImode:
2293 case E_VNx4HImode:
2294 /* Partial SVE SI vector. */
2295 case E_VNx2SImode:
2296 /* Partial SVE HF vectors. */
2297 case E_VNx2HFmode:
2298 case E_VNx4HFmode:
2299 /* Partial SVE BF vectors. */
2300 case E_VNx2BFmode:
2301 case E_VNx4BFmode:
2302 /* Partial SVE SF vector. */
2303 case E_VNx2SFmode:
2304 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2305
2306 case E_VNx16QImode:
2307 case E_VNx8HImode:
2308 case E_VNx4SImode:
2309 case E_VNx2DImode:
2310 case E_VNx8BFmode:
2311 case E_VNx8HFmode:
2312 case E_VNx4SFmode:
2313 case E_VNx2DFmode:
2314 return TARGET_SVE ? VEC_SVE_DATA : 0;
2315
2316 /* x2 SVE vectors. */
2317 case E_VNx32QImode:
2318 case E_VNx16HImode:
2319 case E_VNx8SImode:
2320 case E_VNx4DImode:
2321 case E_VNx16BFmode:
2322 case E_VNx16HFmode:
2323 case E_VNx8SFmode:
2324 case E_VNx4DFmode:
2325 /* x3 SVE vectors. */
2326 case E_VNx48QImode:
2327 case E_VNx24HImode:
2328 case E_VNx12SImode:
2329 case E_VNx6DImode:
2330 case E_VNx24BFmode:
2331 case E_VNx24HFmode:
2332 case E_VNx12SFmode:
2333 case E_VNx6DFmode:
2334 /* x4 SVE vectors. */
2335 case E_VNx64QImode:
2336 case E_VNx32HImode:
2337 case E_VNx16SImode:
2338 case E_VNx8DImode:
2339 case E_VNx32BFmode:
2340 case E_VNx32HFmode:
2341 case E_VNx16SFmode:
2342 case E_VNx8DFmode:
2343 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2344
2345 /* 64-bit Advanced SIMD vectors. */
2346 case E_V8QImode:
2347 case E_V4HImode:
2348 case E_V2SImode:
2349 /* ...E_V1DImode doesn't exist. */
2350 case E_V4HFmode:
2351 case E_V4BFmode:
2352 case E_V2SFmode:
2353 case E_V1DFmode:
2354 /* 128-bit Advanced SIMD vectors. */
2355 case E_V16QImode:
2356 case E_V8HImode:
2357 case E_V4SImode:
2358 case E_V2DImode:
2359 case E_V8HFmode:
2360 case E_V8BFmode:
2361 case E_V4SFmode:
2362 case E_V2DFmode:
2363 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2364
2365 default:
2366 return 0;
2367 }
2368 }
2369
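/* Some example classifications, assuming the relevant target features
   are enabled:

     V4SImode	 -> VEC_ADVSIMD
     OImode	 -> VEC_ADVSIMD | VEC_STRUCT
     VNx4SImode	 -> VEC_SVE_DATA
     VNx2SImode	 -> VEC_SVE_DATA | VEC_PARTIAL
     VNx8SImode	 -> VEC_SVE_DATA | VEC_STRUCT
     VNx4BImode	 -> VEC_SVE_PRED
     DImode	 -> 0 (not a vector mode)  */
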
2370 /* Return true if MODE is any of the data vector modes, including
2371 structure modes. */
2372 static bool
2373 aarch64_vector_data_mode_p (machine_mode mode)
2374 {
2375 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2376 }
2377
2378 /* Return true if MODE is any form of SVE mode, including predicates,
2379 vectors and structures. */
2380 bool
2381 aarch64_sve_mode_p (machine_mode mode)
2382 {
2383 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2384 }
2385
2386 /* Return true if MODE is an SVE data vector mode; either a single vector
2387 or a structure of vectors. */
2388 static bool
2389 aarch64_sve_data_mode_p (machine_mode mode)
2390 {
2391 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2392 }
2393
2394 /* Return the number of defined bytes in one constituent vector of
2395 SVE mode MODE, which has vector flags VEC_FLAGS. */
2396 static poly_int64
2397 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2398 {
2399 if (vec_flags & VEC_PARTIAL)
2400 /* A single partial vector. */
2401 return GET_MODE_SIZE (mode);
2402
2403 if (vec_flags & VEC_SVE_DATA)
2404 /* A single vector or a tuple. */
2405 return BYTES_PER_SVE_VECTOR;
2406
2407 /* A single predicate. */
2408 gcc_assert (vec_flags & VEC_SVE_PRED);
2409 return BYTES_PER_SVE_PRED;
2410 }
2411
2412 /* Implement target hook TARGET_ARRAY_MODE. */
2413 static opt_machine_mode
2414 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2415 {
2416 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2417 && IN_RANGE (nelems, 2, 4))
2418 return mode_for_vector (GET_MODE_INNER (mode),
2419 GET_MODE_NUNITS (mode) * nelems);
2420
2421 return opt_machine_mode ();
2422 }
2423
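/* For example, with SVE enabled, aarch64_array_mode (VNx4SImode, 3) returns
   VNx12SImode, the mode used for a tuple of three SVE vectors of 32-bit
   elements; for other modes or out-of-range NELEMS the hook declines and
   the default handling is used.  */
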
2424 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2425 static bool
2426 aarch64_array_mode_supported_p (machine_mode mode,
2427 unsigned HOST_WIDE_INT nelems)
2428 {
2429 if (TARGET_SIMD
2430 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2431 || AARCH64_VALID_SIMD_DREG_MODE (mode))
2432 && (nelems >= 2 && nelems <= 4))
2433 return true;
2434
2435 return false;
2436 }
2437
2438 /* MODE is some form of SVE vector mode. For data modes, return the number
2439 of vector register bits that each element of MODE occupies, such as 64
2440 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2441 in a 64-bit container). For predicate modes, return the number of
2442 data bits controlled by each significant predicate bit. */
2443
2444 static unsigned int
2445 aarch64_sve_container_bits (machine_mode mode)
2446 {
2447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2449 ? BITS_PER_SVE_VECTOR
2450 : GET_MODE_BITSIZE (mode));
2451 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2452 }
2453
2454 /* Return the SVE predicate mode to use for elements that have
2455 ELEM_NBYTES bytes, if such a mode exists. */
2456
2457 opt_machine_mode
2458 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2459 {
2460 if (TARGET_SVE)
2461 {
2462 if (elem_nbytes == 1)
2463 return VNx16BImode;
2464 if (elem_nbytes == 2)
2465 return VNx8BImode;
2466 if (elem_nbytes == 4)
2467 return VNx4BImode;
2468 if (elem_nbytes == 8)
2469 return VNx2BImode;
2470 }
2471 return opt_machine_mode ();
2472 }
2473
2474 /* Return the SVE predicate mode that should be used to control
2475 SVE mode MODE. */
2476
2477 machine_mode
2478 aarch64_sve_pred_mode (machine_mode mode)
2479 {
2480 unsigned int bits = aarch64_sve_container_bits (mode);
2481 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2482 }
2483
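/* For example, VNx4SImode (32-bit elements in 32-bit containers) is
   controlled by VNx4BImode, whereas the partial vector mode VNx2SImode
   (32-bit elements stored in 64-bit containers) is controlled by
   VNx2BImode, matching its 64-bit container size.  */
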
2484 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2485
2486 static opt_machine_mode
2487 aarch64_get_mask_mode (machine_mode mode)
2488 {
2489 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2490 if (vec_flags & VEC_SVE_DATA)
2491 return aarch64_sve_pred_mode (mode);
2492
2493 return default_get_mask_mode (mode);
2494 }
2495
2496 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2497
2498 opt_machine_mode
2499 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2500 {
2501 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2502 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2503 machine_mode mode;
2504 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2505 if (inner_mode == GET_MODE_INNER (mode)
2506 && known_eq (nunits, GET_MODE_NUNITS (mode))
2507 && aarch64_sve_data_mode_p (mode))
2508 return mode;
2509 return opt_machine_mode ();
2510 }
2511
2512 /* Return the integer element mode associated with SVE mode MODE. */
2513
2514 static scalar_int_mode
2515 aarch64_sve_element_int_mode (machine_mode mode)
2516 {
2517 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2518 ? BITS_PER_SVE_VECTOR
2519 : GET_MODE_BITSIZE (mode));
2520 unsigned int elt_bits = vector_element_size (vector_bits,
2521 GET_MODE_NUNITS (mode));
2522 return int_mode_for_size (elt_bits, 0).require ();
2523 }
2524
2525 /* Return an integer element mode that contains exactly
2526 aarch64_sve_container_bits (MODE) bits. This is wider than
2527 aarch64_sve_element_int_mode if MODE is a partial vector,
2528 otherwise it's the same. */
2529
2530 static scalar_int_mode
2531 aarch64_sve_container_int_mode (machine_mode mode)
2532 {
2533 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2534 }
2535
2536 /* Return the integer vector mode associated with SVE mode MODE.
2537 Unlike related_int_vector_mode, this can handle the case in which
2538 MODE is a predicate (and thus has a different total size). */
2539
2540 machine_mode
2541 aarch64_sve_int_mode (machine_mode mode)
2542 {
2543 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2544 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2545 }
2546
2547 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2548
2549 static opt_machine_mode
2550 aarch64_vectorize_related_mode (machine_mode vector_mode,
2551 scalar_mode element_mode,
2552 poly_uint64 nunits)
2553 {
2554 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2555
2556 /* If we're operating on SVE vectors, try to return an SVE mode. */
2557 poly_uint64 sve_nunits;
2558 if ((vec_flags & VEC_SVE_DATA)
2559 && multiple_p (BYTES_PER_SVE_VECTOR,
2560 GET_MODE_SIZE (element_mode), &sve_nunits))
2561 {
2562 machine_mode sve_mode;
2563 if (maybe_ne (nunits, 0U))
2564 {
2565 /* Try to find a full or partial SVE mode with exactly
2566 NUNITS units. */
2567 if (multiple_p (sve_nunits, nunits)
2568 && aarch64_sve_data_mode (element_mode,
2569 nunits).exists (&sve_mode))
2570 return sve_mode;
2571 }
2572 else
2573 {
2574 /* Take the preferred number of units from the number of bytes
2575 that fit in VECTOR_MODE. We always start by "autodetecting"
2576 a full vector mode with preferred_simd_mode, so vectors
2577 chosen here will also be full vector modes. Then
2578 autovectorize_vector_modes tries smaller starting modes
2579 and thus smaller preferred numbers of units. */
2580 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2581 if (aarch64_sve_data_mode (element_mode,
2582 sve_nunits).exists (&sve_mode))
2583 return sve_mode;
2584 }
2585 }
2586
2587 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2588 if ((vec_flags & VEC_ADVSIMD)
2589 && known_eq (nunits, 0U)
2590 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2591 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2592 * GET_MODE_NUNITS (vector_mode), 128U))
2593 {
2594 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2595 if (VECTOR_MODE_P (res))
2596 return res;
2597 }
2598
2599 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2600 }
2601
2602 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2603 prefer to use the first arithmetic operand as the else value if
2604 the else value doesn't matter, since that exactly matches the SVE
2605 destructive merging form. For ternary operations we could either
2606 pick the first operand and use FMAD-like instructions or the last
2607 operand and use FMLA-like instructions; the latter seems more
2608 natural. */
2609
2610 static tree
2611 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2612 {
2613 return nops == 3 ? ops[2] : ops[0];
2614 }
2615
2616 /* Implement TARGET_HARD_REGNO_NREGS. */
2617
2618 static unsigned int
2619 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2620 {
2621 /* ??? Logically we should only need to provide a value when
2622 HARD_REGNO_MODE_OK says that the combination is valid,
2623 but at the moment we need to handle all modes. Just ignore
2624 any runtime parts for registers that can't store them. */
2625 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2626 switch (aarch64_regno_regclass (regno))
2627 {
2628 case FP_REGS:
2629 case FP_LO_REGS:
2630 case FP_LO8_REGS:
2631 {
2632 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2633 if (vec_flags & VEC_SVE_DATA)
2634 return exact_div (GET_MODE_SIZE (mode),
2635 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2636 return CEIL (lowest_size, UNITS_PER_VREG);
2637 }
2638 case PR_REGS:
2639 case PR_LO_REGS:
2640 case PR_HI_REGS:
2641 case FFR_REGS:
2642 case PR_AND_FFR_REGS:
2643 return 1;
2644 default:
2645 return CEIL (lowest_size, UNITS_PER_WORD);
2646 }
2647 gcc_unreachable ();
2648 }
2649
2650 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2651
2652 static bool
2653 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2654 {
2655 if (GET_MODE_CLASS (mode) == MODE_CC)
2656 return regno == CC_REGNUM;
2657
2658 if (regno == VG_REGNUM)
2659 /* This must have the same size as _Unwind_Word. */
2660 return mode == DImode;
2661
2662 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2663 if (vec_flags & VEC_SVE_PRED)
2664 return pr_or_ffr_regnum_p (regno);
2665
2666 if (pr_or_ffr_regnum_p (regno))
2667 return false;
2668
2669 if (regno == SP_REGNUM)
2670 /* The purpose of comparing with ptr_mode is to support the
2671 global register variable associated with the stack pointer
2672 register via the syntax of asm ("wsp") in ILP32. */
2673 return mode == Pmode || mode == ptr_mode;
2674
2675 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2676 return mode == Pmode;
2677
2678 if (GP_REGNUM_P (regno))
2679 {
2680 if (vec_flags & VEC_ANY_SVE)
2681 return false;
2682 if (known_le (GET_MODE_SIZE (mode), 8))
2683 return true;
2684 if (known_le (GET_MODE_SIZE (mode), 16))
2685 return (regno & 1) == 0;
2686 }
2687 else if (FP_REGNUM_P (regno))
2688 {
2689 if (vec_flags & VEC_STRUCT)
2690 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2691 else
2692 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2693 }
2694
2695 return false;
2696 }
2697
2698 /* Return true if a function with type FNTYPE returns its value in
2699 SVE vector or predicate registers. */
2700
2701 static bool
2702 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2703 {
2704 tree return_type = TREE_TYPE (fntype);
2705
2706 pure_scalable_type_info pst_info;
2707 switch (pst_info.analyze (return_type))
2708 {
2709 case pure_scalable_type_info::IS_PST:
2710 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2711 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2712
2713 case pure_scalable_type_info::DOESNT_MATTER:
2714 gcc_assert (aarch64_return_in_memory_1 (return_type));
2715 return false;
2716
2717 case pure_scalable_type_info::NO_ABI_IDENTITY:
2718 case pure_scalable_type_info::ISNT_PST:
2719 return false;
2720 }
2721 gcc_unreachable ();
2722 }
2723
2724 /* Return true if a function with type FNTYPE takes arguments in
2725 SVE vector or predicate registers. */
2726
2727 static bool
2728 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2729 {
2730 CUMULATIVE_ARGS args_so_far_v;
2731 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2732 NULL_TREE, 0, true);
2733 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2734
2735 for (tree chain = TYPE_ARG_TYPES (fntype);
2736 chain && chain != void_list_node;
2737 chain = TREE_CHAIN (chain))
2738 {
2739 tree arg_type = TREE_VALUE (chain);
2740 if (arg_type == error_mark_node)
2741 return false;
2742
2743 function_arg_info arg (arg_type, /*named=*/true);
2744 apply_pass_by_reference_rules (&args_so_far_v, arg);
2745 pure_scalable_type_info pst_info;
2746 if (pst_info.analyze_registers (arg.type))
2747 {
2748 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2749 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2750 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2751 return true;
2752 }
2753
2754 targetm.calls.function_arg_advance (args_so_far, arg);
2755 }
2756 return false;
2757 }
2758
2759 /* Implement TARGET_FNTYPE_ABI. */
2760
2761 static const predefined_function_abi &
2762 aarch64_fntype_abi (const_tree fntype)
2763 {
2764 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2765 return aarch64_simd_abi ();
2766
2767 if (aarch64_returns_value_in_sve_regs_p (fntype)
2768 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2769 return aarch64_sve_abi ();
2770
2771 return default_function_abi;
2772 }
2773
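/* For example (illustrative only): a function type that carries the
   "aarch64_vector_pcs" attribute gets the SIMD ABI; one that takes an
   svint32_t argument or returns an svbool_t gets the SVE PCS; anything
   else gets the base AAPCS64 ABI.  */
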
2774 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2775
2776 static bool
2777 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2778 {
2779 return (aarch64_sve::builtin_type_p (type1)
2780 == aarch64_sve::builtin_type_p (type2));
2781 }
2782
2783 /* Return true if we should emit CFI for register REGNO. */
2784
2785 static bool
2786 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2787 {
2788 return (GP_REGNUM_P (regno)
2789 || !default_function_abi.clobbers_full_reg_p (regno));
2790 }
2791
2792 /* Return the mode we should use to save and restore register REGNO. */
2793
2794 static machine_mode
2795 aarch64_reg_save_mode (unsigned int regno)
2796 {
2797 if (GP_REGNUM_P (regno))
2798 return DImode;
2799
2800 if (FP_REGNUM_P (regno))
2801 switch (crtl->abi->id ())
2802 {
2803 case ARM_PCS_AAPCS64:
2804 /* Only the low 64 bits are saved by the base PCS. */
2805 return DFmode;
2806
2807 case ARM_PCS_SIMD:
2808 /* The vector PCS saves the low 128 bits (which is the full
2809 register on non-SVE targets). */
2810 return TFmode;
2811
2812 case ARM_PCS_SVE:
2813 /* Use vectors of DImode for registers that need frame
2814 information, so that the first 64 bits of the save slot
2815 are always the equivalent of what storing D<n> would give. */
2816 if (aarch64_emit_cfi_for_reg_p (regno))
2817 return VNx2DImode;
2818
2819 /* Use vectors of bytes otherwise, so that the layout is
2820 endian-agnostic, and so that we can use LDR and STR for
2821 big-endian targets. */
2822 return VNx16QImode;
2823
2824 case ARM_PCS_TLSDESC:
2825 case ARM_PCS_UNKNOWN:
2826 break;
2827 }
2828
2829 if (PR_REGNUM_P (regno))
2830 /* Save the full predicate register. */
2831 return VNx16BImode;
2832
2833 gcc_unreachable ();
2834 }
2835
2836 /* Implement TARGET_INSN_CALLEE_ABI. */
2837
2838 const predefined_function_abi &
2839 aarch64_insn_callee_abi (const rtx_insn *insn)
2840 {
2841 rtx pat = PATTERN (insn);
2842 gcc_assert (GET_CODE (pat) == PARALLEL);
2843 rtx unspec = XVECEXP (pat, 0, 1);
2844 gcc_assert (GET_CODE (unspec) == UNSPEC
2845 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2846 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2847 }
2848
2849 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2850 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2851 clobbers the top 64 bits when restoring the bottom 64 bits. */
2852
2853 static bool
2854 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2855 unsigned int regno,
2856 machine_mode mode)
2857 {
2858 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2859 {
2860 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2861 unsigned int nregs = hard_regno_nregs (regno, mode);
2862 if (nregs > 1)
2863 per_register_size = exact_div (per_register_size, nregs);
2864 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2865 return maybe_gt (per_register_size, 16);
2866 return maybe_gt (per_register_size, 8);
2867 }
2868 return false;
2869 }
2870
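/* For example, a call to a base-PCS (AAPCS64) function part-clobbers V8
   when V8 holds a TFmode value, since only the low 64 bits of V8-V15 are
   preserved there, whereas a call to an "aarch64_vector_pcs" function does
   not, since that PCS preserves the low 128 bits.  */
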
2871 /* Implement REGMODE_NATURAL_SIZE. */
2872 poly_uint64
2873 aarch64_regmode_natural_size (machine_mode mode)
2874 {
2875 /* The natural size for SVE data modes is one SVE data vector,
2876 and similarly for predicates. We can't independently modify
2877 anything smaller than that. */
2878 /* ??? For now, only do this for variable-width SVE registers.
2879 Doing it for constant-sized registers breaks lower-subreg.c. */
2880 /* ??? And once that's fixed, we should probably have similar
2881 code for Advanced SIMD. */
2882 if (!aarch64_sve_vg.is_constant ())
2883 {
2884 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2885 if (vec_flags & VEC_SVE_PRED)
2886 return BYTES_PER_SVE_PRED;
2887 if (vec_flags & VEC_SVE_DATA)
2888 return BYTES_PER_SVE_VECTOR;
2889 }
2890 return UNITS_PER_WORD;
2891 }
2892
2893 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2894 machine_mode
2895 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2896 machine_mode mode)
2897 {
2898 /* The predicate mode determines which bits are significant and
2899 which are "don't care". Decreasing the number of lanes would
2900 lose data while increasing the number of lanes would make bits
2901 unnecessarily significant. */
2902 if (PR_REGNUM_P (regno))
2903 return mode;
2904 if (known_ge (GET_MODE_SIZE (mode), 4))
2905 return mode;
2906 else
2907 return SImode;
2908 }
2909
2910 /* Return true if I's bits are consecutive ones from the MSB. */
2911 bool
2912 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2913 {
2914 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2915 }
2916
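/* For example:

     0xffffffffffff0000  -> true   (top 48 bits are consecutive ones)
     0xffffffffffffffff  -> true   (all bits set)
     0x0000ffffffff0000  -> false  (the ones do not start at the MSB)
     0x0000000000000000  -> false

   relying on -I being a power of two exactly when I has this form.  */
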
2917 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2918 that strcpy from constants will be faster. */
2919
2920 static HOST_WIDE_INT
2921 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2922 {
2923 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2924 return MAX (align, BITS_PER_WORD);
2925 return align;
2926 }
2927
2928 /* Return true if calls to DECL should be treated as
2929 long-calls (i.e. called via a register). */
2930 static bool
2931 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2932 {
2933 return false;
2934 }
2935
2936 /* Return true if calls to symbol-ref SYM should be treated as
2937 long-calls (i.e. called via a register). */
2938 bool
2939 aarch64_is_long_call_p (rtx sym)
2940 {
2941 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2942 }
2943
2944 /* Return true if calls to symbol-ref SYM should not go through
2945 plt stubs. */
2946
2947 bool
2948 aarch64_is_noplt_call_p (rtx sym)
2949 {
2950 const_tree decl = SYMBOL_REF_DECL (sym);
2951
2952 if (flag_pic
2953 && decl
2954 && (!flag_plt
2955 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2956 && !targetm.binds_local_p (decl))
2957 return true;
2958
2959 return false;
2960 }
2961
2962 /* Emit an insn that's a simple single-set. Both the operands must be
2963 known to be valid. */
2964 inline static rtx_insn *
2965 emit_set_insn (rtx x, rtx y)
2966 {
2967 return emit_insn (gen_rtx_SET (x, y));
2968 }
2969
2970 /* X and Y are two things to compare using CODE. Emit the compare insn and
2971 return the rtx for register 0 in the proper mode. */
2972 rtx
2973 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2974 {
2975 machine_mode cmp_mode = GET_MODE (x);
2976 machine_mode cc_mode;
2977 rtx cc_reg;
2978
2979 if (cmp_mode == TImode)
2980 {
2981 gcc_assert (code == NE);
2982
2983 cc_mode = CCmode;
2984 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2985
2986 rtx x_lo = operand_subword (x, 0, 0, TImode);
2987 rtx y_lo = operand_subword (y, 0, 0, TImode);
2988 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2989
2990 rtx x_hi = operand_subword (x, 1, 0, TImode);
2991 rtx y_hi = operand_subword (y, 1, 0, TImode);
2992 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2993 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2994 GEN_INT (AARCH64_EQ)));
2995 }
2996 else
2997 {
2998 cc_mode = SELECT_CC_MODE (code, x, y);
2999 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3000 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3001 }
3002 return cc_reg;
3003 }
3004
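/* For the TImode case above, the emitted rtl corresponds roughly to an
   assembly sequence of the form (register names and NZCV immediate are
   illustrative only):

	cmp	x0, x2			// compare the low halves
	ccmp	x1, x3, #<nzcv>, eq	// if equal, also compare the high halves

   so that the flags read as "equal" only when all 128 bits of X and Y
   match.  */
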
3005 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
3006
3007 static rtx
3008 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3009 machine_mode y_mode)
3010 {
3011 if (y_mode == E_QImode || y_mode == E_HImode)
3012 {
3013 if (CONST_INT_P (y))
3014 {
3015 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3016 y_mode = SImode;
3017 }
3018 else
3019 {
3020 rtx t, cc_reg;
3021 machine_mode cc_mode;
3022
3023 t = gen_rtx_ZERO_EXTEND (SImode, y);
3024 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3025 cc_mode = CC_SWPmode;
3026 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3027 emit_set_insn (cc_reg, t);
3028 return cc_reg;
3029 }
3030 }
3031
3032 if (!aarch64_plus_operand (y, y_mode))
3033 y = force_reg (y_mode, y);
3034
3035 return aarch64_gen_compare_reg (code, x, y);
3036 }
3037
3038 /* Build the SYMBOL_REF for __tls_get_addr. */
3039
3040 static GTY(()) rtx tls_get_addr_libfunc;
3041
3042 rtx
3043 aarch64_tls_get_addr (void)
3044 {
3045 if (!tls_get_addr_libfunc)
3046 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3047 return tls_get_addr_libfunc;
3048 }
3049
3050 /* Return the TLS model to use for ADDR. */
3051
3052 static enum tls_model
3053 tls_symbolic_operand_type (rtx addr)
3054 {
3055 enum tls_model tls_kind = TLS_MODEL_NONE;
3056 poly_int64 offset;
3057 addr = strip_offset_and_salt (addr, &offset);
3058 if (SYMBOL_REF_P (addr))
3059 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3060
3061 return tls_kind;
3062 }
3063
3064 /* We allow LO_SUMs in our legitimate addresses so that combine
3065 can take care of combining addresses where necessary, but for
3066 generation purposes we generate the address
3067 as:
3068 RTL Absolute
3069 tmp = hi (symbol_ref); adrp x1, foo
3070 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
3071 nop
3072
3073 PIC TLS
3074 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3075 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3076 bl __tls_get_addr
3077 nop
3078
3079 Load TLS symbol, depending on TLS mechanism and TLS access model.
3080
3081 Global Dynamic - Traditional TLS:
3082 adrp tmp, :tlsgd:imm
3083 add dest, tmp, #:tlsgd_lo12:imm
3084 bl __tls_get_addr
3085
3086 Global Dynamic - TLS Descriptors:
3087 adrp dest, :tlsdesc:imm
3088 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3089 add dest, dest, #:tlsdesc_lo12:imm
3090 blr tmp
3091 mrs tp, tpidr_el0
3092 add dest, dest, tp
3093
3094 Initial Exec:
3095 mrs tp, tpidr_el0
3096 adrp tmp, :gottprel:imm
3097 ldr dest, [tmp, #:gottprel_lo12:imm]
3098 add dest, dest, tp
3099
3100 Local Exec:
3101 mrs tp, tpidr_el0
3102 add t0, tp, #:tprel_hi12:imm, lsl #12
3103 add t0, t0, #:tprel_lo12_nc:imm
3104 */
3105
3106 static void
3107 aarch64_load_symref_appropriately (rtx dest, rtx imm,
3108 enum aarch64_symbol_type type)
3109 {
3110 switch (type)
3111 {
3112 case SYMBOL_SMALL_ABSOLUTE:
3113 {
3114 /* In ILP32, the mode of dest can be either SImode or DImode. */
3115 rtx tmp_reg = dest;
3116 machine_mode mode = GET_MODE (dest);
3117
3118 gcc_assert (mode == Pmode || mode == ptr_mode);
3119
3120 if (can_create_pseudo_p ())
3121 tmp_reg = gen_reg_rtx (mode);
3122
3123 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3124 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3125 return;
3126 }
3127
3128 case SYMBOL_TINY_ABSOLUTE:
3129 emit_insn (gen_rtx_SET (dest, imm));
3130 return;
3131
3132 case SYMBOL_SMALL_GOT_28K:
3133 {
3134 machine_mode mode = GET_MODE (dest);
3135 rtx gp_rtx = pic_offset_table_rtx;
3136 rtx insn;
3137 rtx mem;
3138
3139 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3140 here before rtl expansion. Tree IVOPTs will generate rtl patterns
3141 to decide rtx costs, in which case pic_offset_table_rtx is not
3142 initialized. In that case there is no need to generate the first
3143 adrp instruction, as the final cost of a global variable access
3144 is one instruction.
3145 if (gp_rtx != NULL)
3146 {
3147 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
3148 use the page base as the GOT base, the first page may be wasted;
3149 in the worst case only 28K of GOT space is left).
3150
3151 The generated instruction sequence for accessing a global variable
3152 is:
3153
3154 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3155
3156 Only one instruction is needed. But we must initialize
3157 pic_offset_table_rtx properly. We generate an initialization insn
3158 for every global access and rely on CSE to remove the redundant ones.
3159
3160 The final instruction sequence will look like the following
3161 for multiple global variable accesses.
3162
3163 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3164
3165 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3166 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3167 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3168 ... */
3169
3170 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3171 crtl->uses_pic_offset_table = 1;
3172 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3173
3174 if (mode != GET_MODE (gp_rtx))
3175 gp_rtx = gen_lowpart (mode, gp_rtx);
3176
3177 }
3178
3179 if (mode == ptr_mode)
3180 {
3181 if (mode == DImode)
3182 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3183 else
3184 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3185
3186 mem = XVECEXP (SET_SRC (insn), 0, 0);
3187 }
3188 else
3189 {
3190 gcc_assert (mode == Pmode);
3191
3192 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3193 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3194 }
3195
3196 /* The operand is expected to be a MEM. Whenever the related insn
3197 pattern changes, the code above that computes MEM should be
3198 updated. */
3199 gcc_assert (MEM_P (mem));
3200 MEM_READONLY_P (mem) = 1;
3201 MEM_NOTRAP_P (mem) = 1;
3202 emit_insn (insn);
3203 return;
3204 }
3205
3206 case SYMBOL_SMALL_GOT_4G:
3207 {
3208 /* In ILP32, the mode of dest can be either SImode or DImode,
3209 while the got entry is always of SImode size. The mode of
3210 dest depends on how dest is used: if dest is assigned to a
3211 pointer (e.g. in memory), it has SImode; it may have
3212 DImode if dest is dereferenced to access the memory.
3213 This is why we have to handle three different ldr_got_small
3214 patterns here (two patterns for ILP32). */
3215
3216 rtx insn;
3217 rtx mem;
3218 rtx tmp_reg = dest;
3219 machine_mode mode = GET_MODE (dest);
3220
3221 if (can_create_pseudo_p ())
3222 tmp_reg = gen_reg_rtx (mode);
3223
3224 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3225 if (mode == ptr_mode)
3226 {
3227 if (mode == DImode)
3228 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3229 else
3230 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3231
3232 mem = XVECEXP (SET_SRC (insn), 0, 0);
3233 }
3234 else
3235 {
3236 gcc_assert (mode == Pmode);
3237
3238 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3239 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3240 }
3241
3242 gcc_assert (MEM_P (mem));
3243 MEM_READONLY_P (mem) = 1;
3244 MEM_NOTRAP_P (mem) = 1;
3245 emit_insn (insn);
3246 return;
3247 }
3248
3249 case SYMBOL_SMALL_TLSGD:
3250 {
3251 rtx_insn *insns;
3252 /* The return type of __tls_get_addr is the C pointer type
3253 so use ptr_mode. */
3254 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3255 rtx tmp_reg = dest;
3256
3257 if (GET_MODE (dest) != ptr_mode)
3258 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3259
3260 start_sequence ();
3261 if (ptr_mode == SImode)
3262 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3263 else
3264 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3265 insns = get_insns ();
3266 end_sequence ();
3267
3268 RTL_CONST_CALL_P (insns) = 1;
3269 emit_libcall_block (insns, tmp_reg, result, imm);
3270 /* Convert back to the mode of the dest, adding a zero_extend
3271 from SImode (ptr_mode) to DImode (Pmode). */
3272 if (dest != tmp_reg)
3273 convert_move (dest, tmp_reg, true);
3274 return;
3275 }
3276
3277 case SYMBOL_SMALL_TLSDESC:
3278 {
3279 machine_mode mode = GET_MODE (dest);
3280 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3281 rtx tp;
3282
3283 gcc_assert (mode == Pmode || mode == ptr_mode);
3284
3285 /* In ILP32, the got entry is always of SImode size. Unlike
3286 small GOT, the dest is fixed at reg 0. */
3287 if (TARGET_ILP32)
3288 emit_insn (gen_tlsdesc_small_si (imm));
3289 else
3290 emit_insn (gen_tlsdesc_small_di (imm));
3291 tp = aarch64_load_tp (NULL);
3292
3293 if (mode != Pmode)
3294 tp = gen_lowpart (mode, tp);
3295
3296 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3297 if (REG_P (dest))
3298 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3299 return;
3300 }
3301
3302 case SYMBOL_SMALL_TLSIE:
3303 {
3304 /* In ILP32, the mode of dest can be either SImode or DImode,
3305 while the got entry is always of SImode size. The mode of
3306 dest depends on how dest is used: if dest is assigned to a
3307 pointer (e.g. in memory), it has SImode; it may have
3308 DImode if dest is dereferenced to access the memory.
3309 This is why we have to handle three different tlsie_small
3310 patterns here (two patterns for ILP32). */
3311 machine_mode mode = GET_MODE (dest);
3312 rtx tmp_reg = gen_reg_rtx (mode);
3313 rtx tp = aarch64_load_tp (NULL);
3314
3315 if (mode == ptr_mode)
3316 {
3317 if (mode == DImode)
3318 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3319 else
3320 {
3321 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3322 tp = gen_lowpart (mode, tp);
3323 }
3324 }
3325 else
3326 {
3327 gcc_assert (mode == Pmode);
3328 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3329 }
3330
3331 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3332 if (REG_P (dest))
3333 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3334 return;
3335 }
3336
3337 case SYMBOL_TLSLE12:
3338 case SYMBOL_TLSLE24:
3339 case SYMBOL_TLSLE32:
3340 case SYMBOL_TLSLE48:
3341 {
3342 machine_mode mode = GET_MODE (dest);
3343 rtx tp = aarch64_load_tp (NULL);
3344
3345 if (mode != Pmode)
3346 tp = gen_lowpart (mode, tp);
3347
3348 switch (type)
3349 {
3350 case SYMBOL_TLSLE12:
3351 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3352 (dest, tp, imm));
3353 break;
3354 case SYMBOL_TLSLE24:
3355 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3356 (dest, tp, imm));
3357 break;
3358 case SYMBOL_TLSLE32:
3359 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3360 (dest, imm));
3361 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3362 (dest, dest, tp));
3363 break;
3364 case SYMBOL_TLSLE48:
3365 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3366 (dest, imm));
3367 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3368 (dest, dest, tp));
3369 break;
3370 default:
3371 gcc_unreachable ();
3372 }
3373
3374 if (REG_P (dest))
3375 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3376 return;
3377 }
3378
3379 case SYMBOL_TINY_GOT:
3380 {
3381 rtx insn;
3382 machine_mode mode = GET_MODE (dest);
3383
3384 if (mode == ptr_mode)
3385 insn = gen_ldr_got_tiny (mode, dest, imm);
3386 else
3387 {
3388 gcc_assert (mode == Pmode);
3389 insn = gen_ldr_got_tiny_sidi (dest, imm);
3390 }
3391
3392 emit_insn (insn);
3393 return;
3394 }
3395
3396 case SYMBOL_TINY_TLSIE:
3397 {
3398 machine_mode mode = GET_MODE (dest);
3399 rtx tp = aarch64_load_tp (NULL);
3400
3401 if (mode == ptr_mode)
3402 {
3403 if (mode == DImode)
3404 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3405 else
3406 {
3407 tp = gen_lowpart (mode, tp);
3408 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3409 }
3410 }
3411 else
3412 {
3413 gcc_assert (mode == Pmode);
3414 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3415 }
3416
3417 if (REG_P (dest))
3418 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3419 return;
3420 }
3421
3422 default:
3423 gcc_unreachable ();
3424 }
3425 }
3426
3427 /* Emit a move from SRC to DEST. Assume that the move expanders can
3428 handle all moves if !can_create_pseudo_p (). The distinction is
3429 important because, unlike emit_move_insn, the move expanders know
3430 how to force Pmode objects into the constant pool even when the
3431 constant pool address is not itself legitimate. */
3432 static rtx
3433 aarch64_emit_move (rtx dest, rtx src)
3434 {
3435 return (can_create_pseudo_p ()
3436 ? emit_move_insn (dest, src)
3437 : emit_move_insn_1 (dest, src));
3438 }
3439
3440 /* Apply UNOPTAB to OP and store the result in DEST. */
3441
3442 static void
3443 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3444 {
3445 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3446 if (dest != tmp)
3447 emit_move_insn (dest, tmp);
3448 }
3449
3450 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3451
3452 static void
3453 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3454 {
3455 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3456 OPTAB_DIRECT);
3457 if (dest != tmp)
3458 emit_move_insn (dest, tmp);
3459 }
3460
3461 /* Split a 128-bit move operation into two 64-bit move operations,
3462 taking care to handle partial overlap of register to register
3463 copies. Special cases are needed when moving between GP regs and
3464 FP regs. SRC can be a register, constant or memory; DST a register
3465 or memory. If either operand is memory it must not have any side
3466 effects. */
3467 void
3468 aarch64_split_128bit_move (rtx dst, rtx src)
3469 {
3470 rtx dst_lo, dst_hi;
3471 rtx src_lo, src_hi;
3472
3473 machine_mode mode = GET_MODE (dst);
3474
3475 gcc_assert (mode == TImode || mode == TFmode);
3476 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3477 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3478
3479 if (REG_P (dst) && REG_P (src))
3480 {
3481 int src_regno = REGNO (src);
3482 int dst_regno = REGNO (dst);
3483
3484 /* Handle FP <-> GP regs. */
3485 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3486 {
3487 src_lo = gen_lowpart (word_mode, src);
3488 src_hi = gen_highpart (word_mode, src);
3489
3490 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3491 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3492 return;
3493 }
3494 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3495 {
3496 dst_lo = gen_lowpart (word_mode, dst);
3497 dst_hi = gen_highpart (word_mode, dst);
3498
3499 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3500 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3501 return;
3502 }
3503 }
3504
3505 dst_lo = gen_lowpart (word_mode, dst);
3506 dst_hi = gen_highpart (word_mode, dst);
3507 src_lo = gen_lowpart (word_mode, src);
3508 src_hi = gen_highpart_mode (word_mode, mode, src);
3509
3510 /* At most one pairing may overlap. */
3511 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3512 {
3513 aarch64_emit_move (dst_hi, src_hi);
3514 aarch64_emit_move (dst_lo, src_lo);
3515 }
3516 else
3517 {
3518 aarch64_emit_move (dst_lo, src_lo);
3519 aarch64_emit_move (dst_hi, src_hi);
3520 }
3521 }
3522
3523 /* Return true if we should split a move from 128-bit value SRC
3524 to 128-bit register DST. */
3525
3526 bool
3527 aarch64_split_128bit_move_p (rtx dst, rtx src)
3528 {
3529 if (FP_REGNUM_P (REGNO (dst)))
3530 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3531 /* All moves to GPRs need to be split. */
3532 return true;
3533 }
3534
3535 /* Split a complex SIMD combine. */
3536
3537 void
3538 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3539 {
3540 machine_mode src_mode = GET_MODE (src1);
3541 machine_mode dst_mode = GET_MODE (dst);
3542
3543 gcc_assert (VECTOR_MODE_P (dst_mode));
3544 gcc_assert (register_operand (dst, dst_mode)
3545 && register_operand (src1, src_mode)
3546 && register_operand (src2, src_mode));
3547
3548 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3549 return;
3550 }
3551
3552 /* Split a complex SIMD move. */
3553
3554 void
3555 aarch64_split_simd_move (rtx dst, rtx src)
3556 {
3557 machine_mode src_mode = GET_MODE (src);
3558 machine_mode dst_mode = GET_MODE (dst);
3559
3560 gcc_assert (VECTOR_MODE_P (dst_mode));
3561
3562 if (REG_P (dst) && REG_P (src))
3563 {
3564 gcc_assert (VECTOR_MODE_P (src_mode));
3565 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3566 }
3567 }
3568
3569 bool
3570 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3571 machine_mode ymode, rtx y)
3572 {
3573 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3574 gcc_assert (r != NULL);
3575 return rtx_equal_p (x, r);
3576 }
3577
3578 /* Return TARGET if it is nonnull and a register of mode MODE.
3579 Otherwise, return a fresh register of mode MODE if we can,
3580 or TARGET reinterpreted as MODE if we can't. */
3581
3582 static rtx
3583 aarch64_target_reg (rtx target, machine_mode mode)
3584 {
3585 if (target && REG_P (target) && GET_MODE (target) == mode)
3586 return target;
3587 if (!can_create_pseudo_p ())
3588 {
3589 gcc_assert (target);
3590 return gen_lowpart (mode, target);
3591 }
3592 return gen_reg_rtx (mode);
3593 }
3594
3595 /* Return a register that contains the constant in BUILDER, given that
3596 the constant is a legitimate move operand. Use TARGET as the register
3597 if it is nonnull and convenient. */
3598
3599 static rtx
3600 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3601 {
3602 rtx src = builder.build ();
3603 target = aarch64_target_reg (target, GET_MODE (src));
3604 emit_insn (gen_rtx_SET (target, src));
3605 return target;
3606 }
3607
3608 static rtx
3609 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3610 {
3611 if (can_create_pseudo_p ())
3612 return force_reg (mode, value);
3613 else
3614 {
3615 gcc_assert (x);
3616 aarch64_emit_move (x, value);
3617 return x;
3618 }
3619 }
3620
3621 /* Return true if predicate value X is a constant in which every element
3622 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3623 value, i.e. as a predicate in which all bits are significant. */
3624
3625 static bool
3626 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3627 {
3628 if (GET_CODE (x) != CONST_VECTOR)
3629 return false;
3630
3631 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3632 GET_MODE_NUNITS (GET_MODE (x)));
3633 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3634 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3635 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3636
3637 unsigned int nelts = const_vector_encoded_nelts (x);
3638 for (unsigned int i = 0; i < nelts; ++i)
3639 {
3640 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3641 if (!CONST_INT_P (elt))
3642 return false;
3643
3644 builder.quick_push (elt);
3645 for (unsigned int j = 1; j < factor; ++j)
3646 builder.quick_push (const0_rtx);
3647 }
3648 builder.finalize ();
3649 return true;
3650 }
3651
3652 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3653 widest predicate element size it can have (that is, the largest size
3654 for which each element would still be 0 or 1). */
3655
3656 unsigned int
3657 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3658 {
3659 /* Start with the most optimistic assumption: that we only need
3660 one bit per pattern. This is what we will use if only the first
3661 bit in each pattern is ever set. */
3662 unsigned int mask = GET_MODE_SIZE (DImode);
3663 mask |= builder.npatterns ();
3664
3665 /* Look for set bits. */
3666 unsigned int nelts = builder.encoded_nelts ();
3667 for (unsigned int i = 1; i < nelts; ++i)
3668 if (INTVAL (builder.elt (i)) != 0)
3669 {
3670 if (i & 1)
3671 return 1;
3672 mask |= i;
3673 }
3674 return mask & -mask;
3675 }
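/* For example, with npatterns () == 4 and set bits only at indices
   0, 4, 8, ..., MASK ends up as 0b1100, so the function returns 4:
   one significant bit per .S element.  A set bit at any odd index
   immediately forces a result of 1 (.B).  */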
3676
3677 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3678 return that predicate mode, otherwise return opt_machine_mode (). */
3679
3680 opt_machine_mode
3681 aarch64_ptrue_all_mode (rtx x)
3682 {
3683 gcc_assert (GET_MODE (x) == VNx16BImode);
3684 if (GET_CODE (x) != CONST_VECTOR
3685 || !CONST_VECTOR_DUPLICATE_P (x)
3686 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3687 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3688 return opt_machine_mode ();
3689
3690 unsigned int nelts = const_vector_encoded_nelts (x);
3691 for (unsigned int i = 1; i < nelts; ++i)
3692 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3693 return opt_machine_mode ();
3694
3695 return aarch64_sve_pred_mode (nelts);
3696 }
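/* For example, the duplicated VNx16BI constant { 1, 0, 0, 0, 1, 0, 0, 0, ... }
   is the canonical PTRUE for 32-bit elements and so yields VNx4BI;
   any other form yields no mode.  */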
3697
3698 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3699 that the constant would have with predicate element size ELT_SIZE
3700 (ignoring the upper bits in each element) and return:
3701
3702 * -1 if all bits are set
3703 * N if the predicate has N leading set bits followed by all clear bits
3704 * 0 if the predicate does not have any of these forms. */
3705
3706 int
3707 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3708 unsigned int elt_size)
3709 {
3710 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3711 followed by set bits. */
3712 if (builder.nelts_per_pattern () == 3)
3713 return 0;
3714
3715 /* Skip over leading set bits. */
3716 unsigned int nelts = builder.encoded_nelts ();
3717 unsigned int i = 0;
3718 for (; i < nelts; i += elt_size)
3719 if (INTVAL (builder.elt (i)) == 0)
3720 break;
3721 unsigned int vl = i / elt_size;
3722
3723 /* Check for the all-true case. */
3724 if (i == nelts)
3725 return -1;
3726
3727 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3728 repeating pattern of set bits followed by clear bits. */
3729 if (builder.nelts_per_pattern () != 2)
3730 return 0;
3731
3732 /* We have a "foreground" value and a duplicated "background" value.
3733 If the background might repeat and the last set bit belongs to it,
3734 we might have set bits followed by clear bits followed by set bits. */
3735 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3736 return 0;
3737
3738 /* Make sure that the rest are all clear. */
3739 for (; i < nelts; i += elt_size)
3740 if (INTVAL (builder.elt (i)) != 0)
3741 return 0;
3742
3743 return vl;
3744 }
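/* For example, with ELT_SIZE == 2, a constant whose VNx16BI image is
   { 1, 0, 1, 0, 1, 0, 0, 0, ... } (three .H elements set, then all clear)
   gives 3, an all-set constant gives -1, and a constant with a set bit
   after the first clear element gives 0.  */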
3745
3746 /* See if there is an svpattern that encodes an SVE predicate of mode
3747 PRED_MODE in which the first VL bits are set and the rest are clear.
3748 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3749 A VL of -1 indicates an all-true vector. */
3750
3751 aarch64_svpattern
3752 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3753 {
3754 if (vl < 0)
3755 return AARCH64_SV_ALL;
3756
3757 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3758 return AARCH64_NUM_SVPATTERNS;
3759
3760 if (vl >= 1 && vl <= 8)
3761 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3762
3763 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3764 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3765
3766 int max_vl;
3767 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3768 {
3769 if (vl == (max_vl / 3) * 3)
3770 return AARCH64_SV_MUL3;
3771 /* These would only trigger for non-power-of-2 lengths. */
3772 if (vl == (max_vl & -4))
3773 return AARCH64_SV_MUL4;
3774 if (vl == (1 << floor_log2 (max_vl)))
3775 return AARCH64_SV_POW2;
3776 if (vl == max_vl)
3777 return AARCH64_SV_ALL;
3778 }
3779 return AARCH64_NUM_SVPATTERNS;
3780 }
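/* For example, with a VNx16BI predicate, VL == 3 gives AARCH64_SV_VL3 and
   VL == 16 gives AARCH64_SV_VL16, while VL == -1 gives AARCH64_SV_ALL.
   VL == 9 has no dedicated pattern, so for variable-length vectors it
   gives AARCH64_NUM_SVPATTERNS and callers fall back to e.g. a WHILE.  */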
3781
3782 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3783 bits has the lowest bit set and the upper bits clear. This is the
3784 VNx16BImode equivalent of a PTRUE for controlling elements of
3785 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3786 all bits are significant, even the upper zeros. */
3787
3788 rtx
3789 aarch64_ptrue_all (unsigned int elt_size)
3790 {
3791 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3792 builder.quick_push (const1_rtx);
3793 for (unsigned int i = 1; i < elt_size; ++i)
3794 builder.quick_push (const0_rtx);
3795 return builder.build ();
3796 }
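/* For example, aarch64_ptrue_all (4) builds { 1, 0, 0, 0, 1, 0, 0, 0, ... },
   i.e. the VNx16BImode image of a PTRUE for .S elements, with the three
   padding bits of each 32-bit element explicitly zero.  */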
3797
3798 /* Return an all-true predicate register of mode MODE. */
3799
3800 rtx
3801 aarch64_ptrue_reg (machine_mode mode)
3802 {
3803 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3804 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3805 return gen_lowpart (mode, reg);
3806 }
3807
3808 /* Return an all-false predicate register of mode MODE. */
3809
3810 rtx
3811 aarch64_pfalse_reg (machine_mode mode)
3812 {
3813 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3814 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3815 return gen_lowpart (mode, reg);
3816 }
3817
3818 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3819 for it. PRED2[0] is the predicate for the instruction whose result
3820 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3821 for it. Return true if we can prove that the two predicates are
3822 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3823 with PRED1[0] without changing behavior. */
3824
3825 bool
3826 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3827 {
3828 machine_mode mode = GET_MODE (pred1[0]);
3829 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3830 && mode == GET_MODE (pred2[0])
3831 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3832 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3833
3834 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3835 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3836 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3837 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3838 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3839 }
3840
3841 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3842 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3843 Use TARGET as the target register if nonnull and convenient. */
3844
3845 static rtx
3846 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3847 machine_mode data_mode, rtx op1, rtx op2)
3848 {
3849 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3850 expand_operand ops[5];
3851 create_output_operand (&ops[0], target, pred_mode);
3852 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3853 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3854 create_input_operand (&ops[3], op1, data_mode);
3855 create_input_operand (&ops[4], op2, data_mode);
3856 expand_insn (icode, 5, ops);
3857 return ops[0].value;
3858 }
3859
3860 /* Use a comparison to convert integer vector SRC into MODE, which is
3861 the corresponding SVE predicate mode. Use TARGET for the result
3862 if it's nonnull and convenient. */
3863
3864 rtx
3865 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3866 {
3867 machine_mode src_mode = GET_MODE (src);
3868 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3869 src, CONST0_RTX (src_mode));
3870 }
3871
3872 /* Return the assembly token for svprfop value PRFOP. */
3873
3874 static const char *
3875 svprfop_token (enum aarch64_svprfop prfop)
3876 {
3877 switch (prfop)
3878 {
3879 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3880 AARCH64_FOR_SVPRFOP (CASE)
3881 #undef CASE
3882 case AARCH64_NUM_SVPRFOPS:
3883 break;
3884 }
3885 gcc_unreachable ();
3886 }
3887
3888 /* Return the assembly string for an SVE prefetch operation with
3889 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3890 and that SUFFIX is the format for the remaining operands. */
3891
3892 char *
3893 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3894 const char *suffix)
3895 {
3896 static char buffer[128];
3897 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3898 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3899 mnemonic, svprfop_token (prfop), suffix);
3900 gcc_assert (written < sizeof (buffer));
3901 return buffer;
3902 }
3903
3904 /* Check whether we can calculate the number of elements in PATTERN
3905 at compile time, given that there are NELTS_PER_VQ elements per
3906 128-bit block. Return the value if so, otherwise return -1. */
3907
3908 HOST_WIDE_INT
3909 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3910 {
3911 unsigned int vl, const_vg;
3912 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3913 vl = 1 + (pattern - AARCH64_SV_VL1);
3914 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3915 vl = 16 << (pattern - AARCH64_SV_VL16);
3916 else if (aarch64_sve_vg.is_constant (&const_vg))
3917 {
3918 /* There are two vector granules per quadword. */
3919 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3920 switch (pattern)
3921 {
3922 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3923 case AARCH64_SV_MUL4: return nelts & -4;
3924 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3925 case AARCH64_SV_ALL: return nelts;
3926 default: gcc_unreachable ();
3927 }
3928 }
3929 else
3930 return -1;
3931
3932 /* There are two vector granules per quadword. */
3933 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3934 if (known_le (vl, nelts_all))
3935 return vl;
3936
3937 /* Requesting more elements than are available results in a PFALSE. */
3938 if (known_gt (vl, nelts_all))
3939 return 0;
3940
3941 return -1;
3942 }
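/* For example, with .S elements (NELTS_PER_VQ == 4):

   - AARCH64_SV_VL3 folds to 3 for any vector length.
   - With -msve-vector-bits=512 (aarch64_sve_vg == 8), AARCH64_SV_ALL and
     AARCH64_SV_POW2 fold to 16 and AARCH64_SV_MUL3 folds to 15.
   - AARCH64_SV_VL32 folds to 0 for 512-bit vectors (more elements than
     are available) but to -1 when the vector length is unknown.  */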
3943
3944 /* Return true if we can move VALUE into a register using a single
3945 CNT[BHWD] instruction. */
3946
3947 static bool
3948 aarch64_sve_cnt_immediate_p (poly_int64 value)
3949 {
3950 HOST_WIDE_INT factor = value.coeffs[0];
3951 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3952 return (value.coeffs[1] == factor
3953 && IN_RANGE (factor, 2, 16 * 16)
3954 && (factor & 1) == 0
3955 && factor <= 16 * (factor & -factor));
3956 }
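/* For example, the poly_int64s (2, 2), (16, 16) and (48, 48) are all
   valid (CNTD, CNTB and "CNTB ..., ALL, MUL #3" respectively), whereas
   (3, 3) is rejected for being odd and (34, 34) is rejected because it
   would need a multiplier of 17.  */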
3957
3958 /* Likewise for rtx X. */
3959
3960 bool
3961 aarch64_sve_cnt_immediate_p (rtx x)
3962 {
3963 poly_int64 value;
3964 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3965 }
3966
3967 /* Return the asm string for an instruction with a CNT-like vector size
3968 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3969 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3970 first part of the operands template (the part that comes before the
3971 vector size itself). PATTERN is the pattern to use. FACTOR is the
3972 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3973 in each quadword. If it is zero, we can use any element size. */
3974
3975 static char *
3976 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3977 aarch64_svpattern pattern,
3978 unsigned int factor,
3979 unsigned int nelts_per_vq)
3980 {
3981 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3982
3983 if (nelts_per_vq == 0)
3984 /* There is some overlap in the ranges of the four CNT instructions.
3985 Here we always use the smallest possible element size, so that the
3986 multiplier is 1 wherever possible. */
3987 nelts_per_vq = factor & -factor;
3988 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3989 gcc_assert (IN_RANGE (shift, 1, 4));
3990 char suffix = "dwhb"[shift - 1];
3991
3992 factor >>= shift;
3993 unsigned int written;
3994 if (pattern == AARCH64_SV_ALL && factor == 1)
3995 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3996 prefix, suffix, operands);
3997 else if (factor == 1)
3998 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3999 prefix, suffix, operands, svpattern_token (pattern));
4000 else
4001 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4002 prefix, suffix, operands, svpattern_token (pattern),
4003 factor);
4004 gcc_assert (written < sizeof (buffer));
4005 return buffer;
4006 }
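/* Illustrative examples of the helper above:

     ("cnt", "%x0", AARCH64_SV_ALL, 16, 16)  -> "cntb\t%x0"
     ("cnt", "%x0", AARCH64_SV_VL32, 16, 8)  -> "cnth\t%x0, vl32, mul #2"
     ("inc", "%x0", AARCH64_SV_ALL, 8, 0)    -> "inch\t%x0"  */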
4007
4008 /* Return the asm string for an instruction with a CNT-like vector size
4009 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4010 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4011 first part of the operands template (the part that comes before the
4012 vector size itself). X is the value of the vector size operand,
4013 as a polynomial integer rtx; we need to convert this into an "all"
4014 pattern with a multiplier. */
4015
4016 char *
4017 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4018 rtx x)
4019 {
4020 poly_int64 value = rtx_to_poly_int64 (x);
4021 gcc_assert (aarch64_sve_cnt_immediate_p (value));
4022 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4023 value.coeffs[1], 0);
4024 }
4025
4026 /* Return the asm string for an instruction with a CNT-like vector size
4027 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4028 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4029 first part of the operands template (the part that comes before the
4030 vector size itself). CNT_PAT[0..2] are the operands of the
4031 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4032
4033 char *
4034 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4035 const char *operands, rtx *cnt_pat)
4036 {
4037 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4038 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4039 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4040 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4041 factor, nelts_per_vq);
4042 }
4043
4044 /* Return true if we can add X using a single SVE INC or DEC instruction. */
4045
4046 bool
4047 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4048 {
4049 poly_int64 value;
4050 return (poly_int_rtx_p (x, &value)
4051 && (aarch64_sve_cnt_immediate_p (value)
4052 || aarch64_sve_cnt_immediate_p (-value)));
4053 }
4054
4055 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4056 operand 0. */
4057
4058 char *
4059 aarch64_output_sve_scalar_inc_dec (rtx offset)
4060 {
4061 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4062 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4063 if (offset_value.coeffs[1] > 0)
4064 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4065 offset_value.coeffs[1], 0);
4066 else
4067 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4068 -offset_value.coeffs[1], 0);
4069 }
4070
4071 /* Return true if we can add VALUE to a register using a single ADDVL
4072 or ADDPL instruction. */
4073
4074 static bool
4075 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4076 {
4077 HOST_WIDE_INT factor = value.coeffs[0];
4078 if (factor == 0 || value.coeffs[1] != factor)
4079 return false;
4080 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4081 and a value of 16 is one vector width. */
4082 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4083 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
4084 }
4085
4086 /* Likewise for rtx X. */
4087
4088 bool
4089 aarch64_sve_addvl_addpl_immediate_p (rtx x)
4090 {
4091 poly_int64 value;
4092 return (poly_int_rtx_p (x, &value)
4093 && aarch64_sve_addvl_addpl_immediate_p (value));
4094 }
4095
4096 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4097 to operand 1 and storing the result in operand 0. */
4098
4099 char *
4100 aarch64_output_sve_addvl_addpl (rtx offset)
4101 {
4102 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4103 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4104 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4105
4106 int factor = offset_value.coeffs[1];
4107 if ((factor & 15) == 0)
4108 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4109 else
4110 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4111 return buffer;
4112 }
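/* For example, an offset of (32, 32) prints as "addvl\t%x0, %x1, #2"
   (two full vectors) and (10, 10) prints as "addpl\t%x0, %x1, #5"
   (five predicate widths); odd factors such as (17, 17) are rejected
   by aarch64_sve_addvl_addpl_immediate_p.  */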
4113
4114 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4115 instruction. If it is, store the number of elements in each vector
4116 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4117 factor in *FACTOR_OUT (if nonnull). */
4118
4119 bool
4120 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4121 unsigned int *nelts_per_vq_out)
4122 {
4123 rtx elt;
4124 poly_int64 value;
4125
4126 if (!const_vec_duplicate_p (x, &elt)
4127 || !poly_int_rtx_p (elt, &value))
4128 return false;
4129
4130 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4131 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4132 /* There's no vector INCB. */
4133 return false;
4134
4135 HOST_WIDE_INT factor = value.coeffs[0];
4136 if (value.coeffs[1] != factor)
4137 return false;
4138
4139 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4140 if ((factor % nelts_per_vq) != 0
4141 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4142 return false;
4143
4144 if (factor_out)
4145 *factor_out = factor;
4146 if (nelts_per_vq_out)
4147 *nelts_per_vq_out = nelts_per_vq;
4148 return true;
4149 }
4150
4151 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4152 instruction. */
4153
4154 bool
4155 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4156 {
4157 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4158 }
4159
4160 /* Return the asm template for an SVE vector INC or DEC instruction.
4161 OPERANDS gives the operands before the vector count and X is the
4162 value of the vector count operand itself. */
4163
4164 char *
4165 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4166 {
4167 int factor;
4168 unsigned int nelts_per_vq;
4169 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4170 gcc_unreachable ();
4171 if (factor < 0)
4172 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4173 -factor, nelts_per_vq);
4174 else
4175 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4176 factor, nelts_per_vq);
4177 }
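/* For example, a VNx4SI count that duplicates (8, 8) prints as
   "incw\tOPERANDS, all, mul #2" (four .S elements per quadword, so the
   multiplier is 2), while a duplicated (-4, -4) prints as
   "decw\tOPERANDS".  */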
4178
4179 static int
4180 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4181 scalar_int_mode mode)
4182 {
4183 int i;
4184 unsigned HOST_WIDE_INT val, val2, mask;
4185 int one_match, zero_match;
4186 int num_insns;
4187
4188 val = INTVAL (imm);
4189
4190 if (aarch64_move_imm (val, mode))
4191 {
4192 if (generate)
4193 emit_insn (gen_rtx_SET (dest, imm));
4194 return 1;
4195 }
4196
4197 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4198 (with XXXX non-zero). In that case check to see if the move can be done in
4199 a smaller mode. */
4200 val2 = val & 0xffffffff;
4201 if (mode == DImode
4202 && aarch64_move_imm (val2, SImode)
4203 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4204 {
4205 if (generate)
4206 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4207
4208 /* Check if we have to emit a second instruction by checking to see
4209 if any of the upper 32 bits of the original DI mode value is set. */
4210 if (val == val2)
4211 return 1;
4212
4213 i = (val >> 48) ? 48 : 32;
4214
4215 if (generate)
4216 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4217 GEN_INT ((val >> i) & 0xffff)));
4218
4219 return 2;
4220 }
4221
4222 if ((val >> 32) == 0 || mode == SImode)
4223 {
4224 if (generate)
4225 {
4226 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4227 if (mode == SImode)
4228 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4229 GEN_INT ((val >> 16) & 0xffff)));
4230 else
4231 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4232 GEN_INT ((val >> 16) & 0xffff)));
4233 }
4234 return 2;
4235 }
4236
4237 /* Remaining cases are all for DImode. */
4238
4239 mask = 0xffff;
4240 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4241 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4242 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4243 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4244
4245 if (zero_match != 2 && one_match != 2)
4246 {
4247 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4248 For a 64-bit bitmask try whether changing 16 bits to all ones or
4249 zeroes creates a valid bitmask. To check any repeated bitmask,
4250 try using 16 bits from the other 32-bit half of val. */
4251
4252 for (i = 0; i < 64; i += 16, mask <<= 16)
4253 {
4254 val2 = val & ~mask;
4255 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4256 break;
4257 val2 = val | mask;
4258 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4259 break;
4260 val2 = val2 & ~mask;
4261 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4262 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4263 break;
4264 }
4265 if (i != 64)
4266 {
4267 if (generate)
4268 {
4269 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4270 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4271 GEN_INT ((val >> i) & 0xffff)));
4272 }
4273 return 2;
4274 }
4275 }
4276
4277 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4278 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4279 otherwise skip zero bits. */
4280
4281 num_insns = 1;
4282 mask = 0xffff;
4283 val2 = one_match > zero_match ? ~val : val;
4284 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4285
4286 if (generate)
4287 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4288 ? (val | ~(mask << i))
4289 : (val & (mask << i)))));
4290 for (i += 16; i < 64; i += 16)
4291 {
4292 if ((val2 & (mask << i)) == 0)
4293 continue;
4294 if (generate)
4295 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4296 GEN_INT ((val >> i) & 0xffff)));
4297 num_insns ++;
4298 }
4299
4300 return num_insns;
4301 }
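/* For example, 0x0000123400005678 in DImode is handled by the 32-bit
   sub-case above: a MOV of 0x5678 followed by a MOVK of 0x1234 into bits
   [47:32], i.e. 2 instructions.  A constant with no all-zero or all-one
   16-bit chunk and no usable bitmask falls through to the final loop,
   which costs at most 4 instructions (one MOVZ/MOVN plus up to three
   MOVKs).  */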
4302
4303 /* Return whether imm is a 128-bit immediate which is simple enough to
4304 expand inline. */
4305 bool
4306 aarch64_mov128_immediate (rtx imm)
4307 {
4308 if (CONST_INT_P (imm))
4309 return true;
4310
4311 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4312
4313 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4314 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4315
4316 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4317 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4318 }
4319
4320
4321 /* Return the number of temporary registers that aarch64_add_offset_1
4322 would need to add OFFSET to a register. */
4323
4324 static unsigned int
4325 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4326 {
4327 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4328 }
4329
4330 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4331 a non-polynomial OFFSET. MODE is the mode of the addition.
4332 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4333 be set and CFA adjustments added to the generated instructions.
4334
4335 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4336 temporary if register allocation is already complete. This temporary
4337 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4338 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4339 the immediate again.
4340
4341 Since this function may be used to adjust the stack pointer, we must
4342 ensure that it cannot cause transient stack deallocation (for example
4343 by first incrementing SP and then decrementing when adjusting by a
4344 large immediate). */
4345
4346 static void
4347 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4348 rtx src, HOST_WIDE_INT offset, rtx temp1,
4349 bool frame_related_p, bool emit_move_imm)
4350 {
4351 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4352 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4353
4354 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4355 rtx_insn *insn;
4356
4357 if (!moffset)
4358 {
4359 if (!rtx_equal_p (dest, src))
4360 {
4361 insn = emit_insn (gen_rtx_SET (dest, src));
4362 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4363 }
4364 return;
4365 }
4366
4367 /* Single instruction adjustment. */
4368 if (aarch64_uimm12_shift (moffset))
4369 {
4370 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4371 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4372 return;
4373 }
4374
4375 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4376 and either:
4377
4378 a) the offset cannot be loaded by a 16-bit move or
4379 b) there is no spare register into which we can move it. */
4380 if (moffset < 0x1000000
4381 && ((!temp1 && !can_create_pseudo_p ())
4382 || !aarch64_move_imm (moffset, mode)))
4383 {
4384 HOST_WIDE_INT low_off = moffset & 0xfff;
4385
4386 low_off = offset < 0 ? -low_off : low_off;
4387 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4388 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4389 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4390 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4391 return;
4392 }
4393
4394 /* Emit a move immediate if required and an addition/subtraction. */
4395 if (emit_move_imm)
4396 {
4397 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4398 temp1 = aarch64_force_temporary (mode, temp1,
4399 gen_int_mode (moffset, mode));
4400 }
4401 insn = emit_insn (offset < 0
4402 ? gen_sub3_insn (dest, src, temp1)
4403 : gen_add3_insn (dest, src, temp1));
4404 if (frame_related_p)
4405 {
4406 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4407 rtx adj = plus_constant (mode, src, offset);
4408 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4409 }
4410 }
4411
4412 /* Return the number of temporary registers that aarch64_add_offset
4413 would need to move OFFSET into a register or add OFFSET to a register;
4414 ADD_P is true if we want the latter rather than the former. */
4415
4416 static unsigned int
4417 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4418 {
4419 /* This follows the same structure as aarch64_add_offset. */
4420 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4421 return 0;
4422
4423 unsigned int count = 0;
4424 HOST_WIDE_INT factor = offset.coeffs[1];
4425 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4426 poly_int64 poly_offset (factor, factor);
4427 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4428 /* Need one register for the ADDVL/ADDPL result. */
4429 count += 1;
4430 else if (factor != 0)
4431 {
4432 factor = abs (factor);
4433 if (factor > 16 * (factor & -factor))
4434 /* Need one register for the CNT result and one for the multiplication
4435 factor. If necessary, the second temporary can be reused for the
4436 constant part of the offset. */
4437 return 2;
4438 /* Need one register for the CNT result (which might then
4439 be shifted). */
4440 count += 1;
4441 }
4442 return count + aarch64_add_offset_1_temporaries (constant);
4443 }
4444
4445 /* If X can be represented as a poly_int64, return the number
4446 of temporaries that are required to add it to a register.
4447 Return -1 otherwise. */
4448
4449 int
4450 aarch64_add_offset_temporaries (rtx x)
4451 {
4452 poly_int64 offset;
4453 if (!poly_int_rtx_p (x, &offset))
4454 return -1;
4455 return aarch64_offset_temporaries (true, offset);
4456 }
4457
4458 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4459 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4460 be set and CFA adjustments added to the generated instructions.
4461
4462 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4463 temporary if register allocation is already complete. This temporary
4464 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4465 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4466 false to avoid emitting the immediate again.
4467
4468 TEMP2, if nonnull, is a second temporary register that doesn't
4469 overlap either DEST or SRC.
4470
4471 Since this function may be used to adjust the stack pointer, we must
4472 ensure that it cannot cause transient stack deallocation (for example
4473 by first incrementing SP and then decrementing when adjusting by a
4474 large immediate). */
4475
4476 static void
4477 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4478 poly_int64 offset, rtx temp1, rtx temp2,
4479 bool frame_related_p, bool emit_move_imm = true)
4480 {
4481 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4482 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4483 gcc_assert (temp1 == NULL_RTX
4484 || !frame_related_p
4485 || !reg_overlap_mentioned_p (temp1, dest));
4486 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4487
4488 /* Try using ADDVL or ADDPL to add the whole value. */
4489 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4490 {
4491 rtx offset_rtx = gen_int_mode (offset, mode);
4492 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4493 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4494 return;
4495 }
4496
4497 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4498 SVE vector register, over and above the minimum size of 128 bits.
4499 This is equivalent to half the value returned by CNTD with a
4500 vector shape of ALL. */
4501 HOST_WIDE_INT factor = offset.coeffs[1];
4502 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4503
4504 /* Try using ADDVL or ADDPL to add the VG-based part. */
4505 poly_int64 poly_offset (factor, factor);
4506 if (src != const0_rtx
4507 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4508 {
4509 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4510 if (frame_related_p)
4511 {
4512 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4513 RTX_FRAME_RELATED_P (insn) = true;
4514 src = dest;
4515 }
4516 else
4517 {
4518 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4519 src = aarch64_force_temporary (mode, temp1, addr);
4520 temp1 = temp2;
4521 temp2 = NULL_RTX;
4522 }
4523 }
4524 /* Otherwise use a CNT-based sequence. */
4525 else if (factor != 0)
4526 {
4527 /* Use a subtraction if we have a negative factor. */
4528 rtx_code code = PLUS;
4529 if (factor < 0)
4530 {
4531 factor = -factor;
4532 code = MINUS;
4533 }
4534
4535 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4536 into the multiplication. */
4537 rtx val;
4538 int shift = 0;
4539 if (factor & 1)
4540 /* Use a right shift by 1. */
4541 shift = -1;
4542 else
4543 factor /= 2;
4544 HOST_WIDE_INT low_bit = factor & -factor;
4545 if (factor <= 16 * low_bit)
4546 {
4547 if (factor > 16 * 8)
4548 {
4549 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4550 the value with the minimum multiplier and shift it into
4551 position. */
4552 int extra_shift = exact_log2 (low_bit);
4553 shift += extra_shift;
4554 factor >>= extra_shift;
4555 }
4556 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4557 }
4558 else
4559 {
4560 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4561 directly, since that should increase the chances of being
4562 able to use a shift and add sequence. If LOW_BIT itself
4563 is out of range, just use CNTD. */
4564 if (low_bit <= 16 * 8)
4565 factor /= low_bit;
4566 else
4567 low_bit = 1;
4568
4569 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4570 val = aarch64_force_temporary (mode, temp1, val);
4571
4572 if (can_create_pseudo_p ())
4573 {
4574 rtx coeff1 = gen_int_mode (factor, mode);
4575 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4576 }
4577 else
4578 {
4579 /* Go back to using a negative multiplication factor if we have
4580 no register from which to subtract. */
4581 if (code == MINUS && src == const0_rtx)
4582 {
4583 factor = -factor;
4584 code = PLUS;
4585 }
4586 rtx coeff1 = gen_int_mode (factor, mode);
4587 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4588 val = gen_rtx_MULT (mode, val, coeff1);
4589 }
4590 }
4591
4592 if (shift > 0)
4593 {
4594 /* Multiply by 1 << SHIFT. */
4595 val = aarch64_force_temporary (mode, temp1, val);
4596 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4597 }
4598 else if (shift == -1)
4599 {
4600 /* Divide by 2. */
4601 val = aarch64_force_temporary (mode, temp1, val);
4602 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4603 }
4604
4605 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4606 if (src != const0_rtx)
4607 {
4608 val = aarch64_force_temporary (mode, temp1, val);
4609 val = gen_rtx_fmt_ee (code, mode, src, val);
4610 }
4611 else if (code == MINUS)
4612 {
4613 val = aarch64_force_temporary (mode, temp1, val);
4614 val = gen_rtx_NEG (mode, val);
4615 }
4616
4617 if (constant == 0 || frame_related_p)
4618 {
4619 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4620 if (frame_related_p)
4621 {
4622 RTX_FRAME_RELATED_P (insn) = true;
4623 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4624 gen_rtx_SET (dest, plus_constant (Pmode, src,
4625 poly_offset)));
4626 }
4627 src = dest;
4628 if (constant == 0)
4629 return;
4630 }
4631 else
4632 {
4633 src = aarch64_force_temporary (mode, temp1, val);
4634 temp1 = temp2;
4635 temp2 = NULL_RTX;
4636 }
4637
4638 emit_move_imm = true;
4639 }
4640
4641 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4642 frame_related_p, emit_move_imm);
4643 }
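/* For example, adding the poly_int64 (100, 2) to a register splits into
   an ADDPL #1 for the VG-based part (2, 2) followed by an ADD of the
   remaining constant 98; adding (48, 48) is a single ADDVL #3; an odd
   factor cannot use ADDVL/ADDPL and goes through the CNT-based sequence
   above instead.  */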
4644
4645 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4646 than a poly_int64. */
4647
4648 void
4649 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4650 rtx offset_rtx, rtx temp1, rtx temp2)
4651 {
4652 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4653 temp1, temp2, false);
4654 }
4655
4656 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4657 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4658 if TEMP1 already contains abs (DELTA). */
4659
4660 static inline void
4661 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4662 {
4663 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4664 temp1, temp2, true, emit_move_imm);
4665 }
4666
4667 /* Subtract DELTA from the stack pointer, marking the instructions
4668 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4669 if nonnull. */
4670
4671 static inline void
4672 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4673 bool emit_move_imm = true)
4674 {
4675 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4676 temp1, temp2, frame_related_p, emit_move_imm);
4677 }
4678
4679 /* Set DEST to (vec_series BASE STEP). */
4680
4681 static void
4682 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4683 {
4684 machine_mode mode = GET_MODE (dest);
4685 scalar_mode inner = GET_MODE_INNER (mode);
4686
4687 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4688 if (!aarch64_sve_index_immediate_p (base))
4689 base = force_reg (inner, base);
4690 if (!aarch64_sve_index_immediate_p (step))
4691 step = force_reg (inner, step);
4692
4693 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4694 }
4695
4696 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4697 register of mode MODE. Use TARGET for the result if it's nonnull
4698 and convenient.
4699
4700 The two vector modes must have the same element mode. The behavior
4701 is to duplicate architectural lane N of SRC into architectural lanes
4702 N + I * STEP of the result. On big-endian targets, architectural
4703 lane 0 of an Advanced SIMD vector is the last element of the vector
4704 in memory layout, so for big-endian targets this operation has the
4705 effect of reversing SRC before duplicating it. Callers need to
4706 account for this. */
4707
4708 rtx
4709 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4710 {
4711 machine_mode src_mode = GET_MODE (src);
4712 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4713 insn_code icode = (BYTES_BIG_ENDIAN
4714 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4715 : code_for_aarch64_vec_duplicate_vq_le (mode));
4716
4717 unsigned int i = 0;
4718 expand_operand ops[3];
4719 create_output_operand (&ops[i++], target, mode);
4720 create_output_operand (&ops[i++], src, src_mode);
4721 if (BYTES_BIG_ENDIAN)
4722 {
4723 /* Create a PARALLEL describing the reversal of SRC. */
4724 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4725 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4726 nelts_per_vq - 1, -1);
4727 create_fixed_operand (&ops[i++], sel);
4728 }
4729 expand_insn (icode, i, ops);
4730 return ops[0].value;
4731 }
4732
4733 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4734 the memory image into DEST. Return true on success. */
4735
4736 static bool
4737 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4738 {
4739 src = force_const_mem (GET_MODE (src), src);
4740 if (!src)
4741 return false;
4742
4743 /* Make sure that the address is legitimate. */
4744 if (!aarch64_sve_ld1rq_operand_p (src))
4745 {
4746 rtx addr = force_reg (Pmode, XEXP (src, 0));
4747 src = replace_equiv_address (src, addr);
4748 }
4749
4750 machine_mode mode = GET_MODE (dest);
4751 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4752 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4753 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4754 return true;
4755 }
4756
4757 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4758 SVE data mode and isn't a legitimate constant. Use TARGET for the
4759 result if convenient.
4760
4761 The returned register can have whatever mode seems most natural
4762 given the contents of SRC. */
4763
4764 static rtx
4765 aarch64_expand_sve_const_vector (rtx target, rtx src)
4766 {
4767 machine_mode mode = GET_MODE (src);
4768 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4769 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4770 scalar_mode elt_mode = GET_MODE_INNER (mode);
4771 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4772 unsigned int container_bits = aarch64_sve_container_bits (mode);
4773 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4774
4775 if (nelts_per_pattern == 1
4776 && encoded_bits <= 128
4777 && container_bits != elt_bits)
4778 {
4779 /* We have a partial vector mode and a constant whose full-vector
4780 equivalent would occupy a repeating 128-bit sequence. Build that
4781 full-vector equivalent instead, so that we have the option of
4782 using LD1RQ and Advanced SIMD operations. */
4783 unsigned int repeat = container_bits / elt_bits;
4784 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4785 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4786 for (unsigned int i = 0; i < npatterns; ++i)
4787 for (unsigned int j = 0; j < repeat; ++j)
4788 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4789 target = aarch64_target_reg (target, full_mode);
4790 return aarch64_expand_sve_const_vector (target, builder.build ());
4791 }
4792
4793 if (nelts_per_pattern == 1 && encoded_bits == 128)
4794 {
4795 /* The constant is a duplicated quadword but can't be narrowed
4796 beyond a quadword. Get the memory image of the first quadword
4797 as a 128-bit vector and try using LD1RQ to load it from memory.
4798
4799 The effect for both endiannesses is to load memory lane N into
4800 architectural lanes N + I * STEP of the result. On big-endian
4801 targets, the layout of the 128-bit vector in an Advanced SIMD
4802 register would be different from its layout in an SVE register,
4803 but this 128-bit vector is a memory value only. */
4804 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4805 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4806 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4807 return target;
4808 }
4809
4810 if (nelts_per_pattern == 1 && encoded_bits < 128)
4811 {
4812 /* The vector is a repeating sequence of 64 bits or fewer.
4813 See if we can load them using an Advanced SIMD move and then
4814 duplicate it to fill a vector. This is better than using a GPR
4815 move because it keeps everything in the same register file. */
4816 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4817 rtx_vector_builder builder (vq_mode, npatterns, 1);
4818 for (unsigned int i = 0; i < npatterns; ++i)
4819 {
4820 /* We want memory lane N to go into architectural lane N,
4821 so reverse for big-endian targets. The DUP .Q pattern
4822 has a compensating reverse built-in. */
4823 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4824 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4825 }
4826 rtx vq_src = builder.build ();
4827 if (aarch64_simd_valid_immediate (vq_src, NULL))
4828 {
4829 vq_src = force_reg (vq_mode, vq_src);
4830 return aarch64_expand_sve_dupq (target, mode, vq_src);
4831 }
4832
4833 /* Get an integer representation of the repeating part of Advanced
4834 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4835 which for big-endian targets is lane-swapped wrt a normal
4836 Advanced SIMD vector. This means that for both endiannesses,
4837 memory lane N of SVE vector SRC corresponds to architectural
4838 lane N of a register holding VQ_SRC. This in turn means that
4839 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4840 as a single 128-bit value) and thus that memory lane 0 of SRC is
4841 in the lsb of the integer. Duplicating the integer therefore
4842 ensures that memory lane N of SRC goes into architectural lane
4843 N + I * INDEX of the SVE register. */
4844 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4845 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4846 if (elt_value)
4847 {
4848 /* Pretend that we had a vector of INT_MODE to start with. */
4849 elt_mode = int_mode;
4850 mode = aarch64_full_sve_mode (int_mode).require ();
4851
4852 /* If the integer can be moved into a general register by a
4853 single instruction, do that and duplicate the result. */
4854 if (CONST_INT_P (elt_value)
4855 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4856 {
4857 elt_value = force_reg (elt_mode, elt_value);
4858 return expand_vector_broadcast (mode, elt_value);
4859 }
4860 }
4861 else if (npatterns == 1)
4862 /* We're duplicating a single value, but can't do better than
4863 force it to memory and load from there. This handles things
4864 like symbolic constants. */
4865 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4866
4867 if (elt_value)
4868 {
4869 /* Load the element from memory if we can, otherwise move it into
4870 a register and use a DUP. */
4871 rtx op = force_const_mem (elt_mode, elt_value);
4872 if (!op)
4873 op = force_reg (elt_mode, elt_value);
4874 return expand_vector_broadcast (mode, op);
4875 }
4876 }
4877
4878 /* Try using INDEX. */
4879 rtx base, step;
4880 if (const_vec_series_p (src, &base, &step))
4881 {
4882 aarch64_expand_vec_series (target, base, step);
4883 return target;
4884 }
4885
4886 /* From here on, it's better to force the whole constant to memory
4887 if we can. */
4888 if (GET_MODE_NUNITS (mode).is_constant ())
4889 return NULL_RTX;
4890
4891 /* Expand each pattern individually. */
4892 gcc_assert (npatterns > 1);
4893 rtx_vector_builder builder;
4894 auto_vec<rtx, 16> vectors (npatterns);
4895 for (unsigned int i = 0; i < npatterns; ++i)
4896 {
4897 builder.new_vector (mode, 1, nelts_per_pattern);
4898 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4899 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4900 vectors.quick_push (force_reg (mode, builder.build ()));
4901 }
4902
4903 /* Use permutes to interleave the separate vectors. */
4904 while (npatterns > 1)
4905 {
4906 npatterns /= 2;
4907 for (unsigned int i = 0; i < npatterns; ++i)
4908 {
4909 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4910 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4911 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4912 vectors[i] = tmp;
4913 }
4914 }
4915 gcc_assert (vectors[0] == target);
4916 return target;
4917 }
4918
4919 /* Use WHILE to set a predicate register of mode MODE in which the first
4920 VL bits are set and the rest are clear. Use TARGET for the register
4921 if it's nonnull and convenient. */
4922
4923 static rtx
4924 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4925 unsigned int vl)
4926 {
4927 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4928 target = aarch64_target_reg (target, mode);
4929 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4930 target, const0_rtx, limit));
4931 return target;
4932 }
4933
4934 static rtx
4935 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4936
4937 /* BUILDER is a constant predicate in which the index of every set bit
4938 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4939 by inverting every element at a multiple of ELT_SIZE and EORing the
4940 result with an ELT_SIZE PTRUE.
4941
4942 Return a register that contains the constant on success, otherwise
4943 return null. Use TARGET as the register if it is nonnull and
4944 convenient. */
4945
4946 static rtx
4947 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4948 unsigned int elt_size)
4949 {
4950 /* Invert every element at a multiple of ELT_SIZE, keeping the
4951 other bits zero. */
4952 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4953 builder.nelts_per_pattern ());
4954 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4955 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4956 inv_builder.quick_push (const1_rtx);
4957 else
4958 inv_builder.quick_push (const0_rtx);
4959 inv_builder.finalize ();
4960
4961 /* See if we can load the constant cheaply. */
4962 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4963 if (!inv)
4964 return NULL_RTX;
4965
4966 /* EOR the result with an ELT_SIZE PTRUE. */
4967 rtx mask = aarch64_ptrue_all (elt_size);
4968 mask = force_reg (VNx16BImode, mask);
4969 inv = gen_lowpart (VNx16BImode, inv);
4970 target = aarch64_target_reg (target, VNx16BImode);
4971 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4972 return target;
4973 }
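/* For example, a .B predicate that is all-true except for element 0 has
   an inverse in which only element 0 is set; that inverse is just a VL1
   PTRUE, and EORing it with a .B PTRUE ALL recreates the original
   constant.  */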
4974
4975 /* BUILDER is a constant predicate in which the index of every set bit
4976 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4977 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4978 register on success, otherwise return null. Use TARGET as the register
4979 if nonnull and convenient. */
4980
4981 static rtx
4982 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4983 unsigned int elt_size,
4984 unsigned int permute_size)
4985 {
4986 /* We're going to split the constant into two new constants A and B,
4987 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4988 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4989
4990 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4991 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4992
4993 where _ indicates elements that will be discarded by the permute.
4994
4995 First calculate the ELT_SIZEs for A and B. */
4996 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4997 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4998 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4999 if (INTVAL (builder.elt (i)) != 0)
5000 {
5001 if (i & permute_size)
5002 b_elt_size |= i - permute_size;
5003 else
5004 a_elt_size |= i;
5005 }
5006 a_elt_size &= -a_elt_size;
5007 b_elt_size &= -b_elt_size;
5008
5009 /* Now construct the vectors themselves. */
5010 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5011 builder.nelts_per_pattern ());
5012 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5013 builder.nelts_per_pattern ());
5014 unsigned int nelts = builder.encoded_nelts ();
5015 for (unsigned int i = 0; i < nelts; ++i)
5016 if (i & (elt_size - 1))
5017 {
5018 a_builder.quick_push (const0_rtx);
5019 b_builder.quick_push (const0_rtx);
5020 }
5021 else if ((i & permute_size) == 0)
5022 {
5023 /* The A and B elements are significant. */
5024 a_builder.quick_push (builder.elt (i));
5025 b_builder.quick_push (builder.elt (i + permute_size));
5026 }
5027 else
5028 {
5029 /* The A and B elements are going to be discarded, so pick whatever
5030 is likely to give a nice constant. We are targeting element
5031 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5032 with the aim of each being a sequence of ones followed by
5033 a sequence of zeros. So:
5034
5035 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5036 duplicate the last X_ELT_SIZE element, to extend the
5037 current sequence of ones or zeros.
5038
5039 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5040 zero, so that the constant really does have X_ELT_SIZE and
5041 not a smaller size. */
5042 if (a_elt_size > permute_size)
5043 a_builder.quick_push (const0_rtx);
5044 else
5045 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5046 if (b_elt_size > permute_size)
5047 b_builder.quick_push (const0_rtx);
5048 else
5049 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5050 }
5051 a_builder.finalize ();
5052 b_builder.finalize ();
5053
5054 /* Try loading A into a register. */
5055 rtx_insn *last = get_last_insn ();
5056 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5057 if (!a)
5058 return NULL_RTX;
5059
5060 /* Try loading B into a register. */
5061 rtx b = a;
5062 if (a_builder != b_builder)
5063 {
5064 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5065 if (!b)
5066 {
5067 delete_insns_since (last);
5068 return NULL_RTX;
5069 }
5070 }
5071
5072 /* Emit the TRN1 itself. */
5073 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5074 target = aarch64_target_reg (target, mode);
5075 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
5076 gen_lowpart (mode, a),
5077 gen_lowpart (mode, b)));
5078 return target;
5079 }
5080
5081 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5082 constant in BUILDER into an SVE predicate register. Return the register
5083 on success, otherwise return null. Use TARGET for the register if
5084 nonnull and convenient.
5085
5086 ALLOW_RECURSE_P is true if we can use methods that would call this
5087 function recursively. */
5088
5089 static rtx
5090 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5091 bool allow_recurse_p)
5092 {
5093 if (builder.encoded_nelts () == 1)
5094 /* A PFALSE or a PTRUE .B ALL. */
5095 return aarch64_emit_set_immediate (target, builder);
5096
5097 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5098 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5099 {
5100 /* If we can load the constant using PTRUE, use it as-is. */
5101 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5102 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5103 return aarch64_emit_set_immediate (target, builder);
5104
5105 /* Otherwise use WHILE to set the first VL bits. */
5106 return aarch64_sve_move_pred_via_while (target, mode, vl);
5107 }
5108
5109 if (!allow_recurse_p)
5110 return NULL_RTX;
5111
5112 /* Try inverting the vector in element size ELT_SIZE and then EORing
5113 the result with an ELT_SIZE PTRUE. */
5114 if (INTVAL (builder.elt (0)) == 0)
5115 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5116 elt_size))
5117 return res;
5118
5119 /* Try using TRN1 to permute two simpler constants. */
5120 for (unsigned int i = elt_size; i <= 8; i *= 2)
5121 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5122 elt_size, i))
5123 return res;
5124
5125 return NULL_RTX;
5126 }
5127
5128 /* Return an SVE predicate register that contains the VNx16BImode
5129 constant in BUILDER, without going through the move expanders.
5130
5131 The returned register can have whatever mode seems most natural
5132 given the contents of BUILDER. Use TARGET for the result if
5133 convenient. */
5134
5135 static rtx
5136 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5137 {
5138 /* Try loading the constant using pure predicate operations. */
5139 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5140 return res;
5141
5142 /* Try forcing the constant to memory. */
5143 if (builder.full_nelts ().is_constant ())
5144 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5145 {
5146 target = aarch64_target_reg (target, VNx16BImode);
5147 emit_move_insn (target, mem);
5148 return target;
5149 }
5150
5151 /* The last resort is to load the constant as an integer and then
5152 compare it against zero. Use -1 for set bits in order to increase
5153 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5154 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5155 builder.nelts_per_pattern ());
5156 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5157 int_builder.quick_push (INTVAL (builder.elt (i))
5158 ? constm1_rtx : const0_rtx);
5159 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5160 int_builder.build ());
5161 }
5162
5163 /* Set DEST to immediate IMM. */
5164
5165 void
5166 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5167 {
5168 machine_mode mode = GET_MODE (dest);
5169
5170 /* Check on what type of symbol it is. */
5171 scalar_int_mode int_mode;
5172 if ((SYMBOL_REF_P (imm)
5173 || LABEL_REF_P (imm)
5174 || GET_CODE (imm) == CONST
5175 || GET_CODE (imm) == CONST_POLY_INT)
5176 && is_a <scalar_int_mode> (mode, &int_mode))
5177 {
5178 rtx mem;
5179 poly_int64 offset;
5180 HOST_WIDE_INT const_offset;
5181 enum aarch64_symbol_type sty;
5182
5183 /* If we have (const (plus symbol offset)), separate out the offset
5184 before we start classifying the symbol. */
5185 rtx base = strip_offset (imm, &offset);
5186
5187 /* We must always add an offset involving VL separately, rather than
5188 folding it into the relocation. */
5189 if (!offset.is_constant (&const_offset))
5190 {
5191 if (!TARGET_SVE)
5192 {
5193 aarch64_report_sve_required ();
5194 return;
5195 }
5196 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5197 emit_insn (gen_rtx_SET (dest, imm));
5198 else
5199 {
5200 /* Do arithmetic on 32-bit values if the result is smaller
5201 than that. */
5202 if (partial_subreg_p (int_mode, SImode))
5203 {
5204 /* It is invalid to do symbol calculations in modes
5205 narrower than SImode. */
5206 gcc_assert (base == const0_rtx);
5207 dest = gen_lowpart (SImode, dest);
5208 int_mode = SImode;
5209 }
5210 if (base != const0_rtx)
5211 {
5212 base = aarch64_force_temporary (int_mode, dest, base);
5213 aarch64_add_offset (int_mode, dest, base, offset,
5214 NULL_RTX, NULL_RTX, false);
5215 }
5216 else
5217 aarch64_add_offset (int_mode, dest, base, offset,
5218 dest, NULL_RTX, false);
5219 }
5220 return;
5221 }
5222
5223 sty = aarch64_classify_symbol (base, const_offset);
5224 switch (sty)
5225 {
5226 case SYMBOL_FORCE_TO_MEM:
5227 if (const_offset != 0
5228 && targetm.cannot_force_const_mem (int_mode, imm))
5229 {
5230 gcc_assert (can_create_pseudo_p ());
5231 base = aarch64_force_temporary (int_mode, dest, base);
5232 aarch64_add_offset (int_mode, dest, base, const_offset,
5233 NULL_RTX, NULL_RTX, false);
5234 return;
5235 }
5236
5237 mem = force_const_mem (ptr_mode, imm);
5238 gcc_assert (mem);
5239
5240 /* If we aren't generating PC relative literals, then
5241 we need to expand the literal pool access carefully.
5242 This is something that needs to be done in a number
5243 of places, so could well live as a separate function. */
5244 if (!aarch64_pcrelative_literal_loads)
5245 {
5246 gcc_assert (can_create_pseudo_p ());
5247 base = gen_reg_rtx (ptr_mode);
5248 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5249 if (ptr_mode != Pmode)
5250 base = convert_memory_address (Pmode, base);
5251 mem = gen_rtx_MEM (ptr_mode, base);
5252 }
5253
5254 if (int_mode != ptr_mode)
5255 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5256
5257 emit_insn (gen_rtx_SET (dest, mem));
5258
5259 return;
5260
5261 case SYMBOL_SMALL_TLSGD:
5262 case SYMBOL_SMALL_TLSDESC:
5263 case SYMBOL_SMALL_TLSIE:
5264 case SYMBOL_SMALL_GOT_28K:
5265 case SYMBOL_SMALL_GOT_4G:
5266 case SYMBOL_TINY_GOT:
5267 case SYMBOL_TINY_TLSIE:
5268 if (const_offset != 0)
5269 {
5270 	    gcc_assert (can_create_pseudo_p ());
5271 base = aarch64_force_temporary (int_mode, dest, base);
5272 aarch64_add_offset (int_mode, dest, base, const_offset,
5273 NULL_RTX, NULL_RTX, false);
5274 return;
5275 }
5276 /* FALLTHRU */
5277
5278 case SYMBOL_SMALL_ABSOLUTE:
5279 case SYMBOL_TINY_ABSOLUTE:
5280 case SYMBOL_TLSLE12:
5281 case SYMBOL_TLSLE24:
5282 case SYMBOL_TLSLE32:
5283 case SYMBOL_TLSLE48:
5284 aarch64_load_symref_appropriately (dest, imm, sty);
5285 return;
5286
5287 default:
5288 gcc_unreachable ();
5289 }
5290 }
5291
5292 if (!CONST_INT_P (imm))
5293 {
5294 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5295 {
5296 /* Only the low bit of each .H, .S and .D element is defined,
5297 so we can set the upper bits to whatever we like. If the
5298 predicate is all-true in MODE, prefer to set all the undefined
5299 bits as well, so that we can share a single .B predicate for
5300 all modes. */
5301 if (imm == CONSTM1_RTX (mode))
5302 imm = CONSTM1_RTX (VNx16BImode);
5303
5304 /* All methods for constructing predicate modes wider than VNx16BI
5305 will set the upper bits of each element to zero. Expose this
5306 by moving such constants as a VNx16BI, so that all bits are
5307 significant and so that constants for different modes can be
5308 shared. The wider constant will still be available as a
5309 REG_EQUAL note. */
5310 rtx_vector_builder builder;
5311 if (aarch64_get_sve_pred_bits (builder, imm))
5312 {
5313 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5314 if (dest != res)
5315 emit_move_insn (dest, gen_lowpart (mode, res));
5316 return;
5317 }
5318 }
5319
5320 if (GET_CODE (imm) == HIGH
5321 || aarch64_simd_valid_immediate (imm, NULL))
5322 {
5323 emit_insn (gen_rtx_SET (dest, imm));
5324 return;
5325 }
5326
5327 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5328 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5329 {
5330 if (dest != res)
5331 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5332 return;
5333 }
5334
5335 rtx mem = force_const_mem (mode, imm);
5336 gcc_assert (mem);
5337 emit_move_insn (dest, mem);
5338 return;
5339 }
5340
5341 aarch64_internal_mov_immediate (dest, imm, true,
5342 as_a <scalar_int_mode> (mode));
5343 }
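/* As an illustration of the SYMBOL_SMALL_ABSOLUTE case above: moving the
   address of a symbol "foo" into a DImode register under the small code
   model is typically emitted as an ADRP of the 4k page followed by an ADD
   of the low 12 bits, roughly:

	adrp	x0, foo
	add	x0, x0, :lo12:foo

   (the register choice and relocation syntax here are illustrative only;
   the exact sequence comes from aarch64_load_symref_appropriately).  */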
5344
5345 /* Return the MEM rtx that provides the canary value that should be used
5346 for stack-smashing protection. MODE is the mode of the memory.
5347 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5348 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
5349 indicates whether the caller is performing a SET or a TEST operation. */
5350
5351 rtx
5352 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
5353 aarch64_salt_type salt_type)
5354 {
5355 rtx addr;
5356 if (aarch64_stack_protector_guard == SSP_GLOBAL)
5357 {
5358 gcc_assert (MEM_P (decl_rtl));
5359 addr = XEXP (decl_rtl, 0);
5360 poly_int64 offset;
5361 rtx base = strip_offset_and_salt (addr, &offset);
5362 if (!SYMBOL_REF_P (base))
5363 return decl_rtl;
5364
5365 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
5366 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
5367 addr = gen_rtx_CONST (Pmode, addr);
5368 addr = plus_constant (Pmode, addr, offset);
5369 }
5370 else
5371 {
5372 /* Calculate the address from the system register. */
5373 rtx salt = GEN_INT (salt_type);
5374 addr = gen_reg_rtx (mode);
5375 if (mode == DImode)
5376 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
5377 else
5378 {
5379 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
5380 addr = convert_memory_address (Pmode, addr);
5381 }
5382 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
5383 }
5384 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
5385 }
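/* For example, with the system-register form of stack protection
   (the -mstack-protector-guard=sysreg family of options, together with
   -mstack-protector-guard-reg= and -mstack-protector-guard-offset=),
   the canary above is read from the given offset of the address held in
   the named system register (e.g. SP_EL0 for a per-task canary), rather
   than from the global __stack_chk_guard; this description is a sketch
   based on the code above rather than a statement of the option set.  */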
5386
5387 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5388 that is known to contain PTRUE. */
5389
5390 void
5391 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5392 {
5393 expand_operand ops[3];
5394 machine_mode mode = GET_MODE (dest);
5395 create_output_operand (&ops[0], dest, mode);
5396   create_input_operand (&ops[1], pred, GET_MODE (pred));
5397 create_input_operand (&ops[2], src, mode);
5398 temporary_volatile_ok v (true);
5399 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5400 }
5401
5402 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5403 operand is in memory. In this case we need to use the predicated LD1
5404 and ST1 instead of LDR and STR, both for correctness on big-endian
5405 targets and because LD1 and ST1 support a wider range of addressing modes.
5406 PRED_MODE is the mode of the predicate.
5407
5408 See the comment at the head of aarch64-sve.md for details about the
5409 big-endian handling. */
5410
5411 void
5412 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5413 {
5414 machine_mode mode = GET_MODE (dest);
5415 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5416 if (!register_operand (src, mode)
5417 && !register_operand (dest, mode))
5418 {
5419 rtx tmp = gen_reg_rtx (mode);
5420 if (MEM_P (src))
5421 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5422 else
5423 emit_move_insn (tmp, src);
5424 src = tmp;
5425 }
5426 aarch64_emit_sve_pred_move (dest, ptrue, src);
5427 }
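/* A sketch of the expansion above for a VNx16QI copy whose source and
   destination are both in memory (register numbers are illustrative only):

	ptrue	p0.b, all
	ld1b	z0.b, p0/z, [x0]
	st1b	z0.b, p0, [x1]

   i.e. the load goes through a temporary vector register and both memory
   accesses are predicated on an all-true predicate.  */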
5428
5429 /* Called only on big-endian targets. See whether an SVE vector move
5430 from SRC to DEST is effectively a REV[BHW] instruction, because at
5431 least one operand is a subreg of an SVE vector that has wider or
5432 narrower elements. Return true and emit the instruction if so.
5433
5434 For example:
5435
5436 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5437
5438 represents a VIEW_CONVERT between the following vectors, viewed
5439 in memory order:
5440
5441 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5442 R1: { [0], [1], [2], [3], ... }
5443
5444 The high part of lane X in R2 should therefore correspond to lane X*2
5445 of R1, but the register representations are:
5446
5447 msb lsb
5448 R2: ...... [1].high [1].low [0].high [0].low
5449 R1: ...... [3] [2] [1] [0]
5450
5451 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5452 We therefore need a reverse operation to swap the high and low values
5453 around.
5454
5455 This is purely an optimization. Without it we would spill the
5456 subreg operand to the stack in one mode and reload it in the
5457 other mode, which has the same effect as the REV. */
5458
5459 bool
5460 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5461 {
5462 gcc_assert (BYTES_BIG_ENDIAN);
5463
5464 /* Do not try to optimize subregs that LRA has created for matched
5465 reloads. These subregs only exist as a temporary measure to make
5466 the RTL well-formed, but they are exempt from the usual
5467 TARGET_CAN_CHANGE_MODE_CLASS rules.
5468
5469 For example, if we have:
5470
5471 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
5472
5473 and the constraints require R1 and R2 to be in the same register,
5474 LRA may need to create RTL such as:
5475
5476 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
5477 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
5478 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
5479
5480 which forces both the input and output of the original instruction
5481 to use the same hard register. But for this to work, the normal
5482 rules have to be suppressed on the subreg input, otherwise LRA
5483 would need to reload that input too, meaning that the process
5484 would never terminate. To compensate for this, the normal rules
5485 are also suppressed for the subreg output of the first move.
5486 Ignoring the special case and handling the first move normally
5487 would therefore generate wrong code: we would reverse the elements
5488 for the first subreg but not reverse them back for the second subreg. */
5489 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
5490 dest = SUBREG_REG (dest);
5491 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
5492 src = SUBREG_REG (src);
5493
5494 /* The optimization handles two single SVE REGs with different element
5495 sizes. */
5496 if (!REG_P (dest)
5497 || !REG_P (src)
5498 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5499 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5500 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5501 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5502 return false;
5503
5504 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
5505 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5506 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5507 UNSPEC_REV_SUBREG);
5508 emit_insn (gen_rtx_SET (dest, unspec));
5509 return true;
5510 }
5511
5512 /* Return a copy of X with mode MODE, without changing its other
5513 attributes. Unlike gen_lowpart, this doesn't care whether the
5514 mode change is valid. */
5515
5516 rtx
5517 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5518 {
5519 if (GET_MODE (x) == mode)
5520 return x;
5521
5522 x = shallow_copy_rtx (x);
5523 set_mode_and_regno (x, mode, REGNO (x));
5524 return x;
5525 }
5526
5527 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5528 stored in wider integer containers. */
5529
5530 static unsigned int
5531 aarch64_sve_rev_unspec (machine_mode mode)
5532 {
5533 switch (GET_MODE_UNIT_SIZE (mode))
5534 {
5535 case 1: return UNSPEC_REVB;
5536 case 2: return UNSPEC_REVH;
5537 case 4: return UNSPEC_REVW;
5538 }
5539 gcc_unreachable ();
5540 }
5541
5542 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5543 operands. */
5544
5545 void
5546 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5547 {
5548 /* Decide which REV operation we need. The mode with wider elements
5549 determines the mode of the operands and the mode with the narrower
5550 elements determines the reverse width. */
5551 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5552 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5553 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5554 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5555 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5556
5557 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5558 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5559
5560 /* Get the operands in the appropriate modes and emit the instruction. */
5561 ptrue = gen_lowpart (pred_mode, ptrue);
5562 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5563 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5564 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5565 dest, ptrue, src));
5566 }
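/* For example, on big-endian targets a move such as

	(set (reg:VNx8HI X) (subreg:VNx8HI (reg:VNx16QI Y) 0))

   is rewritten by the caller into the UNSPEC_REV_SUBREG form and then
   split here into a byte reversal within each halfword element, roughly:

	revb	z0.h, p0/m, z1.h

   where the governing predicate is the all-true PTRUE operand of the
   pattern (register numbers are illustrative only).  */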
5567
5568 static bool
5569 aarch64_function_ok_for_sibcall (tree, tree exp)
5570 {
5571 if (crtl->abi->id () != expr_callee_abi (exp).id ())
5572 return false;
5573
5574 return true;
5575 }
5576
5577 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5578 passed in SVE registers. */
5579
5580 static bool
5581 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5582 const function_arg_info &arg)
5583 {
5584 HOST_WIDE_INT size;
5585 machine_mode dummymode;
5586 int nregs;
5587
5588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
5589 if (arg.mode == BLKmode && arg.type)
5590 size = int_size_in_bytes (arg.type);
5591 else
5592 /* No frontends can create types with variable-sized modes, so we
5593 shouldn't be asked to pass or return them. */
5594 size = GET_MODE_SIZE (arg.mode).to_constant ();
5595
5596 /* Aggregates are passed by reference based on their size. */
5597 if (arg.aggregate_type_p ())
5598 size = int_size_in_bytes (arg.type);
5599
5600   /* Variable-sized arguments are always passed by reference. */
5601 if (size < 0)
5602 return true;
5603
5604 /* Can this be a candidate to be passed in fp/simd register(s)? */
5605 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5606 &dummymode, &nregs, NULL,
5607 !pcum || pcum->silent_p))
5608 return false;
5609
5610   /* Arguments which are variable-sized or larger than 2 registers are
5611      passed by reference unless they are a homogeneous floating-point
5612      aggregate. */
5613 return size > 2 * UNITS_PER_WORD;
5614 }
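/* Under the size rule above, a plain 16-byte structure of two 64-bit
   integers is still passed by value (in a register pair when one is
   available), whereas a 24-byte structure is passed by reference, i.e.
   as a pointer to a caller-allocated copy.  */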
5615
5616 /* Implement TARGET_PASS_BY_REFERENCE. */
5617
5618 static bool
5619 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5620 const function_arg_info &arg)
5621 {
5622 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5623
5624 if (!arg.type)
5625 return aarch64_pass_by_reference_1 (pcum, arg);
5626
5627 pure_scalable_type_info pst_info;
5628 switch (pst_info.analyze (arg.type))
5629 {
5630 case pure_scalable_type_info::IS_PST:
5631 if (pcum && !pcum->silent_p && !TARGET_SVE)
5632 /* We can't gracefully recover at this point, so make this a
5633 fatal error. */
5634 fatal_error (input_location, "arguments of type %qT require"
5635 " the SVE ISA extension", arg.type);
5636
5637 /* Variadic SVE types are passed by reference. Normal non-variadic
5638 arguments are too if we've run out of registers. */
5639 return (!arg.named
5640 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5641 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5642
5643 case pure_scalable_type_info::DOESNT_MATTER:
5644 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
5645 return true;
5646
5647 case pure_scalable_type_info::NO_ABI_IDENTITY:
5648 case pure_scalable_type_info::ISNT_PST:
5649 return aarch64_pass_by_reference_1 (pcum, arg);
5650 }
5651 gcc_unreachable ();
5652 }
5653
5654 /* Return TRUE if VALTYPE is padded to its least significant bits. */
5655 static bool
5656 aarch64_return_in_msb (const_tree valtype)
5657 {
5658 machine_mode dummy_mode;
5659 int dummy_int;
5660
5661 /* Never happens in little-endian mode. */
5662 if (!BYTES_BIG_ENDIAN)
5663 return false;
5664
5665 /* Only composite types smaller than or equal to 16 bytes can
5666 be potentially returned in registers. */
5667 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5668 || int_size_in_bytes (valtype) <= 0
5669 || int_size_in_bytes (valtype) > 16)
5670 return false;
5671
5672 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5673 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5674 is always passed/returned in the least significant bits of fp/simd
5675 register(s). */
5676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5677 &dummy_mode, &dummy_int, NULL,
5678 false))
5679 return false;
5680
5681 /* Likewise pure scalable types for SVE vector and predicate registers. */
5682 pure_scalable_type_info pst_info;
5683 if (pst_info.analyze_registers (valtype))
5684 return false;
5685
5686 return true;
5687 }
5688
5689 /* Implement TARGET_FUNCTION_VALUE.
5690 Define how to find the value returned by a function. */
5691
5692 static rtx
5693 aarch64_function_value (const_tree type, const_tree func,
5694 bool outgoing ATTRIBUTE_UNUSED)
5695 {
5696 machine_mode mode;
5697 int unsignedp;
5698
5699 mode = TYPE_MODE (type);
5700 if (INTEGRAL_TYPE_P (type))
5701 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5702
5703 pure_scalable_type_info pst_info;
5704 if (type && pst_info.analyze_registers (type))
5705 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5706
5707 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5708 are returned in memory, not by value. */
5709 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5710 bool sve_p = (vec_flags & VEC_ANY_SVE);
5711
5712 if (aarch64_return_in_msb (type))
5713 {
5714 HOST_WIDE_INT size = int_size_in_bytes (type);
5715
5716 if (size % UNITS_PER_WORD != 0)
5717 {
5718 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5719 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5720 }
5721 }
5722
5723 int count;
5724 machine_mode ag_mode;
5725 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5726 NULL, false))
5727 {
5728 gcc_assert (!sve_p);
5729 if (!aarch64_composite_type_p (type, mode))
5730 {
5731 gcc_assert (count == 1 && mode == ag_mode);
5732 return gen_rtx_REG (mode, V0_REGNUM);
5733 }
5734 else
5735 {
5736 int i;
5737 rtx par;
5738
5739 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5740 for (i = 0; i < count; i++)
5741 {
5742 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5743 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5744 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5745 XVECEXP (par, 0, i) = tmp;
5746 }
5747 return par;
5748 }
5749 }
5750 else
5751 {
5752 if (sve_p)
5753 {
5754 /* Vector types can acquire a partial SVE mode using things like
5755 __attribute__((vector_size(N))), and this is potentially useful.
5756 However, the choice of mode doesn't affect the type's ABI
5757 identity, so we should treat the types as though they had
5758 the associated integer mode, just like they did before SVE
5759 was introduced.
5760
5761 We know that the vector must be 128 bits or smaller,
5762 otherwise we'd have returned it in memory instead. */
5763 gcc_assert (type
5764 && (aarch64_some_values_include_pst_objects_p (type)
5765 || (vec_flags & VEC_PARTIAL)));
5766
5767 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5768 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5769 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5770 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5771 }
5772 return gen_rtx_REG (mode, R0_REGNUM);
5773 }
5774 }
5775
5776 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
5777    Return true if REGNO is the number of a hard register in which the value
5778    of a called function may come back. */
5779
5780 static bool
5781 aarch64_function_value_regno_p (const unsigned int regno)
5782 {
5783   /* A maximum of 16 bytes can be returned in the general registers. Examples
5784 of 16-byte return values are: 128-bit integers and 16-byte small
5785 structures (excluding homogeneous floating-point aggregates). */
5786 if (regno == R0_REGNUM || regno == R1_REGNUM)
5787 return true;
5788
5789 /* Up to four fp/simd registers can return a function value, e.g. a
5790 homogeneous floating-point aggregate having four members. */
5791 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5792 return TARGET_FLOAT;
5793
5794 return false;
5795 }
5796
5797 /* Subroutine for aarch64_return_in_memory for types that are not returned
5798 in SVE registers. */
5799
5800 static bool
5801 aarch64_return_in_memory_1 (const_tree type)
5802 {
5803 HOST_WIDE_INT size;
5804 machine_mode ag_mode;
5805 int count;
5806
5807 if (!AGGREGATE_TYPE_P (type)
5808 && TREE_CODE (type) != COMPLEX_TYPE
5809 && TREE_CODE (type) != VECTOR_TYPE)
5810     /* Simple scalar types are always returned in registers. */
5811 return false;
5812
5813 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5814 &ag_mode, &count, NULL, false))
5815 return false;
5816
5817   /* Types larger than 2 registers are returned in memory. */
5818 size = int_size_in_bytes (type);
5819 return (size < 0 || size > 2 * UNITS_PER_WORD);
5820 }
5821
5822 /* Implement TARGET_RETURN_IN_MEMORY.
5823
5824 If the type T of the result of a function is such that
5825 void func (T arg)
5826 would require that arg be passed as a value in a register (or set of
5827 registers) according to the parameter passing rules, then the result
5828 is returned in the same registers as would be used for such an
5829 argument. */
5830
5831 static bool
5832 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5833 {
5834 pure_scalable_type_info pst_info;
5835 switch (pst_info.analyze (type))
5836 {
5837 case pure_scalable_type_info::IS_PST:
5838 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5839 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5840
5841 case pure_scalable_type_info::DOESNT_MATTER:
5842 gcc_assert (aarch64_return_in_memory_1 (type));
5843 return true;
5844
5845 case pure_scalable_type_info::NO_ABI_IDENTITY:
5846 case pure_scalable_type_info::ISNT_PST:
5847 return aarch64_return_in_memory_1 (type);
5848 }
5849 gcc_unreachable ();
5850 }
5851
5852 static bool
5853 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5854 const_tree type, int *nregs)
5855 {
5856 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5857 return aarch64_vfp_is_call_or_return_candidate (mode, type,
5858 &pcum->aapcs_vfp_rmode,
5859 nregs, NULL, pcum->silent_p);
5860 }
5861
5862 /* Given MODE and TYPE of a function argument, return the alignment in
5863 bits. The idea is to suppress any stronger alignment requested by
5864 the user and opt for the natural alignment (specified in AAPCS64 \S
5865 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5866 calculated in versions of GCC prior to GCC-9. This is a helper
5867 function for local use only. */
5868
5869 static unsigned int
5870 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5871 bool *abi_break)
5872 {
5873 *abi_break = false;
5874 if (!type)
5875 return GET_MODE_ALIGNMENT (mode);
5876
5877 if (integer_zerop (TYPE_SIZE (type)))
5878 return 0;
5879
5880 gcc_assert (TYPE_MODE (type) == mode);
5881
5882 if (!AGGREGATE_TYPE_P (type))
5883 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5884
5885 if (TREE_CODE (type) == ARRAY_TYPE)
5886 return TYPE_ALIGN (TREE_TYPE (type));
5887
5888 unsigned int alignment = 0;
5889 unsigned int bitfield_alignment = 0;
5890 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5891 if (TREE_CODE (field) == FIELD_DECL)
5892 {
5893 /* Note that we explicitly consider zero-sized fields here,
5894 even though they don't map to AAPCS64 machine types.
5895 For example, in:
5896
5897 struct __attribute__((aligned(8))) empty {};
5898
5899 struct s {
5900 [[no_unique_address]] empty e;
5901 int x;
5902 };
5903
5904 "s" contains only one Fundamental Data Type (the int field)
5905 but gains 8-byte alignment and size thanks to "e". */
5906 alignment = std::max (alignment, DECL_ALIGN (field));
5907 if (DECL_BIT_FIELD_TYPE (field))
5908 bitfield_alignment
5909 = std::max (bitfield_alignment,
5910 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5911 }
5912
5913 if (bitfield_alignment > alignment)
5914 {
5915 *abi_break = true;
5916 return bitfield_alignment;
5917 }
5918
5919 return alignment;
5920 }
5921
5922 /* Layout a function argument according to the AAPCS64 rules. The rule
5923 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5924 mode that was originally given to us by the target hook, whereas the
5925 mode in ARG might be the result of replacing partial SVE modes with
5926 the equivalent integer mode. */
5927
5928 static void
5929 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5930 {
5931 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5932 tree type = arg.type;
5933 machine_mode mode = arg.mode;
5934 int ncrn, nvrn, nregs;
5935 bool allocate_ncrn, allocate_nvrn;
5936 HOST_WIDE_INT size;
5937 bool abi_break;
5938
5939 /* We need to do this once per argument. */
5940 if (pcum->aapcs_arg_processed)
5941 return;
5942
5943 pcum->aapcs_arg_processed = true;
5944
5945 pure_scalable_type_info pst_info;
5946 if (type && pst_info.analyze_registers (type))
5947 {
5948 /* The PCS says that it is invalid to pass an SVE value to an
5949 unprototyped function. There is no ABI-defined location we
5950 can return in this case, so we have no real choice but to raise
5951 an error immediately, even though this is only a query function. */
5952 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5953 {
5954 gcc_assert (!pcum->silent_p);
5955 error ("SVE type %qT cannot be passed to an unprototyped function",
5956 arg.type);
5957 /* Avoid repeating the message, and avoid tripping the assert
5958 below. */
5959 pcum->pcs_variant = ARM_PCS_SVE;
5960 }
5961
5962 /* We would have converted the argument into pass-by-reference
5963 form if it didn't fit in registers. */
5964 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5965 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5966 gcc_assert (arg.named
5967 && pcum->pcs_variant == ARM_PCS_SVE
5968 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5969 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5970 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5971 P0_REGNUM + pcum->aapcs_nprn);
5972 return;
5973 }
5974
5975 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5976 are passed by reference, not by value. */
5977 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5978 bool sve_p = (vec_flags & VEC_ANY_SVE);
5979 if (sve_p)
5980 /* Vector types can acquire a partial SVE mode using things like
5981 __attribute__((vector_size(N))), and this is potentially useful.
5982 However, the choice of mode doesn't affect the type's ABI
5983 identity, so we should treat the types as though they had
5984 the associated integer mode, just like they did before SVE
5985 was introduced.
5986
5987 We know that the vector must be 128 bits or smaller,
5988 otherwise we'd have passed it in memory instead. */
5989 gcc_assert (type
5990 && (aarch64_some_values_include_pst_objects_p (type)
5991 || (vec_flags & VEC_PARTIAL)));
5992
5993   /* Size in bytes, rounded up to a multiple of 8 bytes. */
5994 if (type)
5995 size = int_size_in_bytes (type);
5996 else
5997 /* No frontends can create types with variable-sized modes, so we
5998 shouldn't be asked to pass or return them. */
5999 size = GET_MODE_SIZE (mode).to_constant ();
6000 size = ROUND_UP (size, UNITS_PER_WORD);
6001
6002 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6003 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6004 mode,
6005 type,
6006 &nregs);
6007 gcc_assert (!sve_p || !allocate_nvrn);
6008
6009   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6010 The following code thus handles passing by SIMD/FP registers first. */
6011
6012 nvrn = pcum->aapcs_nvrn;
6013
6014   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6015      and homogeneous short-vector aggregates (HVA). */
6016 if (allocate_nvrn)
6017 {
6018 if (!pcum->silent_p && !TARGET_FLOAT)
6019 aarch64_err_no_fpadvsimd (mode);
6020
6021 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6022 {
6023 pcum->aapcs_nextnvrn = nvrn + nregs;
6024 if (!aarch64_composite_type_p (type, mode))
6025 {
6026 gcc_assert (nregs == 1);
6027 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6028 }
6029 else
6030 {
6031 rtx par;
6032 int i;
6033 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6034 for (i = 0; i < nregs; i++)
6035 {
6036 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6037 V0_REGNUM + nvrn + i);
6038 rtx offset = gen_int_mode
6039 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6040 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6041 XVECEXP (par, 0, i) = tmp;
6042 }
6043 pcum->aapcs_reg = par;
6044 }
6045 return;
6046 }
6047 else
6048 {
6049 /* C.3 NSRN is set to 8. */
6050 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6051 goto on_stack;
6052 }
6053 }
6054
6055 ncrn = pcum->aapcs_ncrn;
6056 nregs = size / UNITS_PER_WORD;
6057
6058   /* C6 - C9, though the sign and zero extension semantics are
6059      handled elsewhere. This is the case where the argument fits
6060      entirely in general registers. */
6061 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6062 {
6063 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6064
6065 /* C.8 if the argument has an alignment of 16 then the NGRN is
6066 rounded up to the next even number. */
6067 if (nregs == 2
6068 && ncrn % 2
6069 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6070 comparison is there because for > 16 * BITS_PER_UNIT
6071 alignment nregs should be > 2 and therefore it should be
6072 passed by reference rather than value. */
6073 && (aarch64_function_arg_alignment (mode, type, &abi_break)
6074 == 16 * BITS_PER_UNIT))
6075 {
6076 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6077 inform (input_location, "parameter passing for argument of type "
6078 "%qT changed in GCC 9.1", type);
6079 ++ncrn;
6080 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
6081 }
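      /* For example, for a function taking (int, __int128), the int is
	 passed in w0 and the __int128, which has 16-byte alignment and
	 needs two registers, is passed in x2 and x3, leaving x1 unused.  */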
6082
6083 /* If an argument with an SVE mode needs to be shifted up to the
6084 high part of the register, treat it as though it had an integer mode.
6085 Using the normal (parallel [...]) would suppress the shifting. */
6086 if (sve_p
6087 && BYTES_BIG_ENDIAN
6088 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6089 && aarch64_pad_reg_upward (mode, type, false))
6090 {
6091 mode = int_mode_for_mode (mode).require ();
6092 sve_p = false;
6093 }
6094
6095 /* NREGS can be 0 when e.g. an empty structure is to be passed.
6096 A reg is still generated for it, but the caller should be smart
6097 enough not to use it. */
6098 if (nregs == 0
6099 || (nregs == 1 && !sve_p)
6100 || GET_MODE_CLASS (mode) == MODE_INT)
6101 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6102 else
6103 {
6104 rtx par;
6105 int i;
6106
6107 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6108 for (i = 0; i < nregs; i++)
6109 {
6110 scalar_int_mode reg_mode = word_mode;
6111 if (nregs == 1)
6112 reg_mode = int_mode_for_mode (mode).require ();
6113 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6114 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6115 GEN_INT (i * UNITS_PER_WORD));
6116 XVECEXP (par, 0, i) = tmp;
6117 }
6118 pcum->aapcs_reg = par;
6119 }
6120
6121 pcum->aapcs_nextncrn = ncrn + nregs;
6122 return;
6123 }
6124
6125 /* C.11 */
6126 pcum->aapcs_nextncrn = NUM_ARG_REGS;
6127
6128   /* The argument is passed on the stack; record the number of words needed
6129      for this argument and align the total size if necessary. */
6130 on_stack:
6131 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
6132
6133 if (aarch64_function_arg_alignment (mode, type, &abi_break)
6134 == 16 * BITS_PER_UNIT)
6135 {
6136 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6137 if (pcum->aapcs_stack_size != new_size)
6138 {
6139 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6140 inform (input_location, "parameter passing for argument of type "
6141 "%qT changed in GCC 9.1", type);
6142 pcum->aapcs_stack_size = new_size;
6143 }
6144 }
6145 return;
6146 }
6147
6148 /* Implement TARGET_FUNCTION_ARG. */
6149
6150 static rtx
6151 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6152 {
6153 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6154 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
6155 || pcum->pcs_variant == ARM_PCS_SIMD
6156 || pcum->pcs_variant == ARM_PCS_SVE);
6157
6158 if (arg.end_marker_p ())
6159 return gen_int_mode (pcum->pcs_variant, DImode);
6160
6161 aarch64_layout_arg (pcum_v, arg);
6162 return pcum->aapcs_reg;
6163 }
6164
6165 void
6166 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
6167 const_tree fntype,
6168 rtx libname ATTRIBUTE_UNUSED,
6169 const_tree fndecl ATTRIBUTE_UNUSED,
6170 unsigned n_named ATTRIBUTE_UNUSED,
6171 bool silent_p)
6172 {
6173 pcum->aapcs_ncrn = 0;
6174 pcum->aapcs_nvrn = 0;
6175 pcum->aapcs_nprn = 0;
6176 pcum->aapcs_nextncrn = 0;
6177 pcum->aapcs_nextnvrn = 0;
6178 pcum->aapcs_nextnprn = 0;
6179 if (fntype)
6180 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6181 else
6182 pcum->pcs_variant = ARM_PCS_AAPCS64;
6183 pcum->aapcs_reg = NULL_RTX;
6184 pcum->aapcs_arg_processed = false;
6185 pcum->aapcs_stack_words = 0;
6186 pcum->aapcs_stack_size = 0;
6187 pcum->silent_p = silent_p;
6188
6189 if (!silent_p
6190 && !TARGET_FLOAT
6191 && fntype && fntype != error_mark_node)
6192 {
6193 const_tree type = TREE_TYPE (fntype);
6194 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
6195 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
6196 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6197 &mode, &nregs, NULL, false))
6198 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
6199 }
6200
6201 if (!silent_p
6202 && !TARGET_SVE
6203 && pcum->pcs_variant == ARM_PCS_SVE)
6204 {
6205 /* We can't gracefully recover at this point, so make this a
6206 fatal error. */
6207 if (fndecl)
6208 fatal_error (input_location, "%qE requires the SVE ISA extension",
6209 fndecl);
6210 else
6211 fatal_error (input_location, "calls to functions of type %qT require"
6212 " the SVE ISA extension", fntype);
6213 }
6214 }
6215
6216 static void
6217 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6218 const function_arg_info &arg)
6219 {
6220 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6221 if (pcum->pcs_variant == ARM_PCS_AAPCS64
6222 || pcum->pcs_variant == ARM_PCS_SIMD
6223 || pcum->pcs_variant == ARM_PCS_SVE)
6224 {
6225 aarch64_layout_arg (pcum_v, arg);
6226 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6227 != (pcum->aapcs_stack_words != 0));
6228 pcum->aapcs_arg_processed = false;
6229 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6230 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6231 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6232 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6233 pcum->aapcs_stack_words = 0;
6234 pcum->aapcs_reg = NULL_RTX;
6235 }
6236 }
6237
6238 bool
6239 aarch64_function_arg_regno_p (unsigned regno)
6240 {
6241 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6242 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6243 }
6244
6245 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
6246 PARM_BOUNDARY bits of alignment, but will be given anything up
6247 to STACK_BOUNDARY bits if the type requires it. This makes sure
6248 that both before and after the layout of each argument, the Next
6249 Stacked Argument Address (NSAA) will have a minimum alignment of
6250 8 bytes. */
6251
6252 static unsigned int
6253 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6254 {
6255 bool abi_break;
6256 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6257 &abi_break);
6258   if (abi_break && warn_psabi)
6259 inform (input_location, "parameter passing for argument of type "
6260 "%qT changed in GCC 9.1", type);
6261
6262 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6263 }
6264
6265 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
6266
6267 static fixed_size_mode
6268 aarch64_get_reg_raw_mode (int regno)
6269 {
6270 if (TARGET_SVE && FP_REGNUM_P (regno))
6271 /* Don't use the SVE part of the register for __builtin_apply and
6272 __builtin_return. The SVE registers aren't used by the normal PCS,
6273 so using them there would be a waste of time. The PCS extensions
6274 for SVE types are fundamentally incompatible with the
6275 __builtin_return/__builtin_apply interface. */
6276 return as_a <fixed_size_mode> (V16QImode);
6277 return default_get_reg_raw_mode (regno);
6278 }
6279
6280 /* Implement TARGET_FUNCTION_ARG_PADDING.
6281
6282    Small aggregate types are placed at the lowest memory address.
6283
6284 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
6285
6286 static pad_direction
6287 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6288 {
6289 /* On little-endian targets, the least significant byte of every stack
6290 argument is passed at the lowest byte address of the stack slot. */
6291 if (!BYTES_BIG_ENDIAN)
6292 return PAD_UPWARD;
6293
6294 /* Otherwise, integral, floating-point and pointer types are padded downward:
6295 the least significant byte of a stack argument is passed at the highest
6296 byte address of the stack slot. */
6297 if (type
6298 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6299 || POINTER_TYPE_P (type))
6300 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6301 return PAD_DOWNWARD;
6302
6303 /* Everything else padded upward, i.e. data in first byte of stack slot. */
6304 return PAD_UPWARD;
6305 }
6306
6307 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6308
6309    It specifies padding for the last (and possibly the only)
6310    element of a block move between registers and memory. Viewing
6311    the block as it sits in memory, padding upward means that the
6312    last element is padded after its most significant byte, while
6313    downward padding means that the last element is padded on its
6314    least significant byte side.
6315
6316 Small aggregates and small complex types are always padded
6317 upwards.
6318
6319 We don't need to worry about homogeneous floating-point or
6320 short-vector aggregates; their move is not affected by the
6321 padding direction determined here. Regardless of endianness,
6322 each element of such an aggregate is put in the least
6323 significant bits of a fp/simd register.
6324
6325 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6326 register has useful data, and return the opposite if the most
6327 significant byte does. */
6328
6329 bool
6330 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6331 bool first ATTRIBUTE_UNUSED)
6332 {
6333
6334 /* Aside from pure scalable types, small composite types are always
6335 padded upward. */
6336 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6337 {
6338 HOST_WIDE_INT size;
6339 if (type)
6340 size = int_size_in_bytes (type);
6341 else
6342 /* No frontends can create types with variable-sized modes, so we
6343 shouldn't be asked to pass or return them. */
6344 size = GET_MODE_SIZE (mode).to_constant ();
6345 if (size < 2 * UNITS_PER_WORD)
6346 {
6347 pure_scalable_type_info pst_info;
6348 if (pst_info.analyze_registers (type))
6349 return false;
6350 return true;
6351 }
6352 }
6353
6354 /* Otherwise, use the default padding. */
6355 return !BYTES_BIG_ENDIAN;
6356 }
6357
6358 static scalar_int_mode
6359 aarch64_libgcc_cmp_return_mode (void)
6360 {
6361 return SImode;
6362 }
6363
6364 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6365
6366 /* We use the 12-bit shifted immediate arithmetic instructions so values
6367    must be a multiple of (1 << 12), i.e. 4096. */
6368 #define ARITH_FACTOR 4096
6369
6370 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6371 #error Cannot use simple address calculation for stack probing
6372 #endif
6373
6374 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6375 inclusive. These are offsets from the current stack pointer. */
6376
6377 static void
6378 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6379 {
6380 HOST_WIDE_INT size;
6381 if (!poly_size.is_constant (&size))
6382 {
6383 sorry ("stack probes for SVE frames");
6384 return;
6385 }
6386
6387 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
6388
6389 /* See the same assertion on PROBE_INTERVAL above. */
6390 gcc_assert ((first % ARITH_FACTOR) == 0);
6391
6392 /* See if we have a constant small number of probes to generate. If so,
6393 that's the easy case. */
6394 if (size <= PROBE_INTERVAL)
6395 {
6396 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6397
6398 emit_set_insn (reg1,
6399 plus_constant (Pmode,
6400 stack_pointer_rtx, -(first + base)));
6401 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6402 }
6403
6404 /* The run-time loop is made up of 8 insns in the generic case while the
6405      compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
6406 else if (size <= 4 * PROBE_INTERVAL)
6407 {
6408 HOST_WIDE_INT i, rem;
6409
6410 emit_set_insn (reg1,
6411 plus_constant (Pmode,
6412 stack_pointer_rtx,
6413 -(first + PROBE_INTERVAL)));
6414 emit_stack_probe (reg1);
6415
6416 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6417 it exceeds SIZE. If only two probes are needed, this will not
6418 generate any code. Then probe at FIRST + SIZE. */
6419 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6420 {
6421 emit_set_insn (reg1,
6422 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6423 emit_stack_probe (reg1);
6424 }
6425
6426 rem = size - (i - PROBE_INTERVAL);
6427 if (rem > 256)
6428 {
6429 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6430
6431 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6432 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6433 }
6434 else
6435 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6436 }
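  /* As a worked example of the case above: with FIRST == 0, SIZE == 10000
     and PROBE_INTERVAL == 4096, probes are emitted at SP - 4096 and
     SP - 8192; the remaining 1808 bytes exceed 256, so REG1 is stepped
     down by a further 4096 bytes and the final probe is emitted at a
     positive offset of 2288 from it, i.e. at SP - 10000.  */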
6437
6438 /* Otherwise, do the same as above, but in a loop. Note that we must be
6439 extra careful with variables wrapping around because we might be at
6440 the very top (or the very bottom) of the address space and we have
6441 to be able to handle this case properly; in particular, we use an
6442 equality test for the loop condition. */
6443 else
6444 {
6445 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
6446
6447 /* Step 1: round SIZE to the previous multiple of the interval. */
6448
6449 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6450
6451
6452 /* Step 2: compute initial and final value of the loop counter. */
6453
6454 /* TEST_ADDR = SP + FIRST. */
6455 emit_set_insn (reg1,
6456 plus_constant (Pmode, stack_pointer_rtx, -first));
6457
6458 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6459 HOST_WIDE_INT adjustment = - (first + rounded_size);
6460 if (! aarch64_uimm12_shift (adjustment))
6461 {
6462 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6463 true, Pmode);
6464 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6465 }
6466 else
6467 emit_set_insn (reg2,
6468 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6469
6470 /* Step 3: the loop
6471
6472 do
6473 {
6474 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6475 probe at TEST_ADDR
6476 }
6477 while (TEST_ADDR != LAST_ADDR)
6478
6479 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6480 until it is equal to ROUNDED_SIZE. */
6481
6482 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6483
6484
6485 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6486 that SIZE is equal to ROUNDED_SIZE. */
6487
6488 if (size != rounded_size)
6489 {
6490 HOST_WIDE_INT rem = size - rounded_size;
6491
6492 if (rem > 256)
6493 {
6494 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6495
6496 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6497 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6498 }
6499 else
6500 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6501 }
6502 }
6503
6504 /* Make sure nothing is scheduled before we are done. */
6505 emit_insn (gen_blockage ());
6506 }
6507
6508 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6509 absolute addresses. */
6510
6511 const char *
6512 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6513 {
6514 static int labelno = 0;
6515 char loop_lab[32];
6516 rtx xops[2];
6517
6518 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6519
6520 /* Loop. */
6521 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6522
6523 HOST_WIDE_INT stack_clash_probe_interval
6524 = 1 << param_stack_clash_protection_guard_size;
6525
6526 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6527 xops[0] = reg1;
6528 HOST_WIDE_INT interval;
6529 if (flag_stack_clash_protection)
6530 interval = stack_clash_probe_interval;
6531 else
6532 interval = PROBE_INTERVAL;
6533
6534 gcc_assert (aarch64_uimm12_shift (interval));
6535 xops[1] = GEN_INT (interval);
6536
6537 output_asm_insn ("sub\t%0, %0, %1", xops);
6538
6539 /* If doing stack clash protection then we probe up by the ABI specified
6540 amount. We do this because we're dropping full pages at a time in the
6541 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6542 if (flag_stack_clash_protection)
6543 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6544 else
6545 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6546
6547 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6548 by this amount for each iteration. */
6549 output_asm_insn ("str\txzr, [%0, %1]", xops);
6550
6551 /* Test if TEST_ADDR == LAST_ADDR. */
6552 xops[1] = reg2;
6553 output_asm_insn ("cmp\t%0, %1", xops);
6554
6555 /* Branch. */
6556 fputs ("\tb.ne\t", asm_out_file);
6557 assemble_name_raw (asm_out_file, loop_lab);
6558 fputc ('\n', asm_out_file);
6559
6560 return "";
6561 }
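/* The loop printed above therefore has the rough shape below, shown here
   without stack-clash protection and with the default 4 KiB probe interval;
   <r1> and <r2> stand for the two probe registers passed in:

	.LPSRL0:
	sub	<r1>, <r1>, #4096
	str	xzr, [<r1>]
	cmp	<r1>, <r2>
	b.ne	.LPSRL0
 */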
6562
6563 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6564 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6565 of GUARD_SIZE. When a probe is emitted it is done at most
6566 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6567 at most MIN_PROBE_THRESHOLD. By the end of this function
6568 BASE = BASE - ADJUSTMENT. */
6569
6570 const char *
6571 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6572 rtx min_probe_threshold, rtx guard_size)
6573 {
6574 /* This function is not allowed to use any instruction generation function
6575 like gen_ and friends. If you do you'll likely ICE during CFG validation,
6576 so instead emit the code you want using output_asm_insn. */
6577 gcc_assert (flag_stack_clash_protection);
6578 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6579 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6580
6581 /* The minimum required allocation before the residual requires probing. */
6582 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6583
6584 /* Clamp the value down to the nearest value that can be used with a cmp. */
6585 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6586 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6587
6588 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6589 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6590
6591 static int labelno = 0;
6592 char loop_start_lab[32];
6593 char loop_end_lab[32];
6594 rtx xops[2];
6595
6596 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6597 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6598
6599 /* Emit loop start label. */
6600 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6601
6602 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6603 xops[0] = adjustment;
6604 xops[1] = probe_offset_value_rtx;
6605 output_asm_insn ("cmp\t%0, %1", xops);
6606
6607 /* Branch to end if not enough adjustment to probe. */
6608 fputs ("\tb.lt\t", asm_out_file);
6609 assemble_name_raw (asm_out_file, loop_end_lab);
6610 fputc ('\n', asm_out_file);
6611
6612 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6613 xops[0] = base;
6614 xops[1] = probe_offset_value_rtx;
6615 output_asm_insn ("sub\t%0, %0, %1", xops);
6616
6617 /* Probe at BASE. */
6618 xops[1] = const0_rtx;
6619 output_asm_insn ("str\txzr, [%0, %1]", xops);
6620
6621 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6622 xops[0] = adjustment;
6623 xops[1] = probe_offset_value_rtx;
6624 output_asm_insn ("sub\t%0, %0, %1", xops);
6625
6626 /* Branch to start if still more bytes to allocate. */
6627 fputs ("\tb\t", asm_out_file);
6628 assemble_name_raw (asm_out_file, loop_start_lab);
6629 fputc ('\n', asm_out_file);
6630
6631   /* Nothing left to probe; leave the loop. */
6632 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6633
6634 /* BASE = BASE - ADJUSTMENT. */
6635 xops[0] = base;
6636 xops[1] = adjustment;
6637 output_asm_insn ("sub\t%0, %0, %1", xops);
6638 return "";
6639 }
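/* Putting the pieces above together, the emitted sequence has the rough
   shape below, where <adj>, <base> and <guard> stand for ADJUSTMENT, BASE
   and the clamped residual probe guard:

	.SVLPSPL0:
	cmp	<adj>, <guard>
	b.lt	.SVLPEND0
	sub	<base>, <base>, <guard>
	str	xzr, [<base>]
	sub	<adj>, <adj>, <guard>
	b	.SVLPSPL0
	.SVLPEND0:
	sub	<base>, <base>, <adj>
 */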
6640
6641 /* Determine whether a frame chain needs to be generated. */
6642 static bool
6643 aarch64_needs_frame_chain (void)
6644 {
6645 /* Force a frame chain for EH returns so the return address is at FP+8. */
6646 if (frame_pointer_needed || crtl->calls_eh_return)
6647 return true;
6648
6649 /* A leaf function cannot have calls or write LR. */
6650 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6651
6652 /* Don't use a frame chain in leaf functions if leaf frame pointers
6653 are disabled. */
6654 if (flag_omit_leaf_frame_pointer && is_leaf)
6655 return false;
6656
6657 return aarch64_use_frame_pointer;
6658 }
6659
6660 /* Mark the registers that need to be saved by the callee and calculate
6661 the size of the callee-saved registers area and frame record (both FP
6662 and LR may be omitted). */
6663 static void
6664 aarch64_layout_frame (void)
6665 {
6666 poly_int64 offset = 0;
6667 int regno, last_fp_reg = INVALID_REGNUM;
6668 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6669 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6670 bool frame_related_fp_reg_p = false;
6671 aarch64_frame &frame = cfun->machine->frame;
6672
6673 frame.emit_frame_chain = aarch64_needs_frame_chain ();
6674
6675 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6676 the mid-end is doing. */
6677 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6678
6679 #define SLOT_NOT_REQUIRED (-2)
6680 #define SLOT_REQUIRED (-1)
6681
6682 frame.wb_candidate1 = INVALID_REGNUM;
6683 frame.wb_candidate2 = INVALID_REGNUM;
6684 frame.spare_pred_reg = INVALID_REGNUM;
6685
6686 /* First mark all the registers that really need to be saved... */
6687 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6688 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6689
6690 /* ... that includes the eh data registers (if needed)... */
6691 if (crtl->calls_eh_return)
6692 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6693 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6694
6695 /* ... and any callee saved register that dataflow says is live. */
6696 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6697 if (df_regs_ever_live_p (regno)
6698 && !fixed_regs[regno]
6699 && (regno == R30_REGNUM
6700 || !crtl->abi->clobbers_full_reg_p (regno)))
6701 frame.reg_offset[regno] = SLOT_REQUIRED;
6702
6703 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6704 if (df_regs_ever_live_p (regno)
6705 && !fixed_regs[regno]
6706 && !crtl->abi->clobbers_full_reg_p (regno))
6707 {
6708 frame.reg_offset[regno] = SLOT_REQUIRED;
6709 last_fp_reg = regno;
6710 if (aarch64_emit_cfi_for_reg_p (regno))
6711 frame_related_fp_reg_p = true;
6712 }
6713
6714 /* Big-endian SVE frames need a spare predicate register in order
6715 to save Z8-Z15. Decide which register they should use. Prefer
6716 an unused argument register if possible, so that we don't force P4
6717 to be saved unnecessarily. */
6718 if (frame_related_fp_reg_p
6719 && crtl->abi->id () == ARM_PCS_SVE
6720 && BYTES_BIG_ENDIAN)
6721 {
6722 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6723 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6724 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6725 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6726 break;
6727 gcc_assert (regno <= P7_REGNUM);
6728 frame.spare_pred_reg = regno;
6729 df_set_regs_ever_live (regno, true);
6730 }
6731
6732 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6733 if (df_regs_ever_live_p (regno)
6734 && !fixed_regs[regno]
6735 && !crtl->abi->clobbers_full_reg_p (regno))
6736 frame.reg_offset[regno] = SLOT_REQUIRED;
6737
6738 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
6739 LR counts as an implicit probe which allows us to maintain the invariant
6740 described in the comment at expand_prologue. */
6741 gcc_assert (crtl->is_leaf
6742 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6743
6744 /* Now assign stack slots for the registers. Start with the predicate
6745 registers, since predicate LDR and STR have a relatively small
6746 offset range. These saves happen below the hard frame pointer. */
6747 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6748 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6749 {
6750 frame.reg_offset[regno] = offset;
6751 offset += BYTES_PER_SVE_PRED;
6752 }
6753
6754 if (maybe_ne (offset, 0))
6755 {
6756 /* If we have any vector registers to save above the predicate registers,
6757 	 the offset of the vector register save slots needs to be a multiple
6758 of the vector size. This lets us use the immediate forms of LDR/STR
6759 (or LD1/ST1 for big-endian).
6760
6761 A vector register is 8 times the size of a predicate register,
6762 and we need to save a maximum of 12 predicate registers, so the
6763 first vector register will be at either #1, MUL VL or #2, MUL VL.
6764
6765 If we don't have any vector registers to save, and we know how
6766 big the predicate save area is, we can just round it up to the
6767 next 16-byte boundary. */
6768 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6769 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6770 else
6771 {
6772 if (known_le (offset, vector_save_size))
6773 offset = vector_save_size;
6774 else if (known_le (offset, vector_save_size * 2))
6775 offset = vector_save_size * 2;
6776 else
6777 gcc_unreachable ();
6778 }
6779 }
6780
6781 /* If we need to save any SVE vector registers, add them next. */
6782 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6783 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6784 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6785 {
6786 frame.reg_offset[regno] = offset;
6787 offset += vector_save_size;
6788 }
6789
6790 /* OFFSET is now the offset of the hard frame pointer from the bottom
6791 of the callee save area. */
6792 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6793 frame.below_hard_fp_saved_regs_size = offset;
6794 if (frame.emit_frame_chain)
6795 {
6796 /* FP and LR are placed in the linkage record. */
6797 frame.reg_offset[R29_REGNUM] = offset;
6798 frame.wb_candidate1 = R29_REGNUM;
6799 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6800 frame.wb_candidate2 = R30_REGNUM;
6801 offset += 2 * UNITS_PER_WORD;
6802 }
6803
6804 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6805 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6806 {
6807 frame.reg_offset[regno] = offset;
6808 if (frame.wb_candidate1 == INVALID_REGNUM)
6809 frame.wb_candidate1 = regno;
6810 else if (frame.wb_candidate2 == INVALID_REGNUM)
6811 frame.wb_candidate2 = regno;
6812 offset += UNITS_PER_WORD;
6813 }
6814
6815 poly_int64 max_int_offset = offset;
6816 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6817 bool has_align_gap = maybe_ne (offset, max_int_offset);
6818
6819 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6820 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6821 {
6822 /* If there is an alignment gap between integer and fp callee-saves,
6823 allocate the last fp register to it if possible. */
6824 if (regno == last_fp_reg
6825 && has_align_gap
6826 && known_eq (vector_save_size, 8)
6827 && multiple_p (offset, 16))
6828 {
6829 frame.reg_offset[regno] = max_int_offset;
6830 break;
6831 }
6832
6833 frame.reg_offset[regno] = offset;
6834 if (frame.wb_candidate1 == INVALID_REGNUM)
6835 frame.wb_candidate1 = regno;
6836 else if (frame.wb_candidate2 == INVALID_REGNUM
6837 && frame.wb_candidate1 >= V0_REGNUM)
6838 frame.wb_candidate2 = regno;
6839 offset += vector_save_size;
6840 }
6841
6842 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6843
6844 frame.saved_regs_size = offset;
6845
6846 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6847
6848 poly_int64 above_outgoing_args
6849 = aligned_upper_bound (varargs_and_saved_regs_size
6850 + get_frame_size (),
6851 STACK_BOUNDARY / BITS_PER_UNIT);
6852
6853 frame.hard_fp_offset
6854 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6855
6856 /* Both these values are already aligned. */
6857 gcc_assert (multiple_p (crtl->outgoing_args_size,
6858 STACK_BOUNDARY / BITS_PER_UNIT));
6859 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6860
6861 frame.locals_offset = frame.saved_varargs_size;
6862
6863 frame.initial_adjust = 0;
6864 frame.final_adjust = 0;
6865 frame.callee_adjust = 0;
6866 frame.sve_callee_adjust = 0;
6867 frame.callee_offset = 0;
6868
6869 HOST_WIDE_INT max_push_offset = 0;
6870 if (frame.wb_candidate2 != INVALID_REGNUM)
6871 max_push_offset = 512;
6872 else if (frame.wb_candidate1 != INVALID_REGNUM)
6873 max_push_offset = 256;
6874
6875 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6876 HOST_WIDE_INT const_saved_regs_size;
6877 if (frame.frame_size.is_constant (&const_size)
6878 && const_size < max_push_offset
6879 && known_eq (frame.hard_fp_offset, const_size))
6880 {
6881 /* Simple, small frame with no outgoing arguments:
6882
6883 stp reg1, reg2, [sp, -frame_size]!
6884 stp reg3, reg4, [sp, 16] */
6885 frame.callee_adjust = const_size;
6886 }
6887 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6888 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6889 && const_outgoing_args_size + const_saved_regs_size < 512
6890 /* We could handle this case even with outgoing args, provided
6891 that the number of args left us with valid offsets for all
6892 predicate and vector save slots. It's such a rare case that
6893 it hardly seems worth the effort though. */
6894 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6895 && !(cfun->calls_alloca
6896 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6897 && const_fp_offset < max_push_offset))
6898 {
6899 /* Frame with small outgoing arguments:
6900
6901 sub sp, sp, frame_size
6902 stp reg1, reg2, [sp, outgoing_args_size]
6903 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6904 frame.initial_adjust = frame.frame_size;
6905 frame.callee_offset = const_outgoing_args_size;
6906 }
6907 else if (saves_below_hard_fp_p
6908 && known_eq (frame.saved_regs_size,
6909 frame.below_hard_fp_saved_regs_size))
6910 {
6911 /* Frame in which all saves are SVE saves:
6912
6913 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6914 save SVE registers relative to SP
6915 sub sp, sp, outgoing_args_size */
6916 frame.initial_adjust = (frame.hard_fp_offset
6917 + frame.below_hard_fp_saved_regs_size);
6918 frame.final_adjust = crtl->outgoing_args_size;
6919 }
6920 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6921 && const_fp_offset < max_push_offset)
6922 {
6923 /* Frame with large outgoing arguments or SVE saves, but with
6924 a small local area:
6925
6926 stp reg1, reg2, [sp, -hard_fp_offset]!
6927 stp reg3, reg4, [sp, 16]
6928 [sub sp, sp, below_hard_fp_saved_regs_size]
6929 [save SVE registers relative to SP]
6930 sub sp, sp, outgoing_args_size */
6931 frame.callee_adjust = const_fp_offset;
6932 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6933 frame.final_adjust = crtl->outgoing_args_size;
6934 }
6935 else
6936 {
6937 /* Frame with large local area and outgoing arguments or SVE saves,
6938 using frame pointer:
6939
6940 sub sp, sp, hard_fp_offset
6941 stp x29, x30, [sp, 0]
6942 add x29, sp, 0
6943 stp reg3, reg4, [sp, 16]
6944 [sub sp, sp, below_hard_fp_saved_regs_size]
6945 [save SVE registers relative to SP]
6946 sub sp, sp, outgoing_args_size */
6947 frame.initial_adjust = frame.hard_fp_offset;
6948 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6949 frame.final_adjust = crtl->outgoing_args_size;
6950 }
6951
6952 /* Make sure the individual adjustments add up to the full frame size. */
6953 gcc_assert (known_eq (frame.initial_adjust
6954 + frame.callee_adjust
6955 + frame.sve_callee_adjust
6956 + frame.final_adjust, frame.frame_size));
6957
6958 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
6959 {
6960 /* We've decided not to associate any register saves with the initial
6961 stack allocation. */
6962 frame.wb_candidate1 = INVALID_REGNUM;
6963 frame.wb_candidate2 = INVALID_REGNUM;
6964 }
6965
6966 frame.laid_out = true;
6967 }
6968
6969 /* Return true if the register REGNO is saved on entry to
6970 the current function. */
6971
6972 static bool
6973 aarch64_register_saved_on_entry (int regno)
6974 {
6975 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6976 }
6977
6978 /* Return the next register at or above REGNO, up to LIMIT, that the
6979 callee needs to save. */
6980
6981 static unsigned
6982 aarch64_next_callee_save (unsigned regno, unsigned limit)
6983 {
6984 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6985 regno ++;
6986 return regno;
6987 }
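
/* Callers typically walk the callee-saved registers with a loop of the
   form (see aarch64_save_callee_saves and aarch64_restore_callee_saves
   below):

     for (regno = aarch64_next_callee_save (start, limit);
          regno <= limit;
          regno = aarch64_next_callee_save (regno + 1, limit))
       ...handle register REGNO...  */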
6988
6989 /* Push the register number REGNO of mode MODE to the stack with write-back
6990 adjusting the stack by ADJUSTMENT. */
6991
6992 static void
6993 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6994 HOST_WIDE_INT adjustment)
6995 {
6996 rtx base_rtx = stack_pointer_rtx;
6997 rtx insn, reg, mem;
6998
6999 reg = gen_rtx_REG (mode, regno);
7000 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7001 plus_constant (Pmode, base_rtx, -adjustment));
7002 mem = gen_frame_mem (mode, mem);
7003
7004 insn = emit_move_insn (mem, reg);
7005 RTX_FRAME_RELATED_P (insn) = 1;
7006 }
7007
7008 /* Generate and return an instruction to store the pair of registers
7009 REG and REG2 of mode MODE to location BASE with write-back adjusting
7010 the stack location BASE by ADJUSTMENT. */
7011
7012 static rtx
7013 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7014 HOST_WIDE_INT adjustment)
7015 {
7016 switch (mode)
7017 {
7018 case E_DImode:
7019 return gen_storewb_pairdi_di (base, base, reg, reg2,
7020 GEN_INT (-adjustment),
7021 GEN_INT (UNITS_PER_WORD - adjustment));
7022 case E_DFmode:
7023 return gen_storewb_pairdf_di (base, base, reg, reg2,
7024 GEN_INT (-adjustment),
7025 GEN_INT (UNITS_PER_WORD - adjustment));
7026 case E_TFmode:
7027 return gen_storewb_pairtf_di (base, base, reg, reg2,
7028 GEN_INT (-adjustment),
7029 GEN_INT (UNITS_PER_VREG - adjustment));
7030 default:
7031 gcc_unreachable ();
7032 }
7033 }
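
/* For example, with BASE being the stack pointer, DImode registers x19
   and x20 and an ADJUSTMENT of 32, the pattern above corresponds to:

     stp x19, x20, [sp, -32]!

   i.e. x19 is stored at the new SP, x20 at SP + 8, and SP is
   pre-decremented by 32.  (Illustrative only; the actual operands
   depend on the caller.)  */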
7034
7035 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7036 stack pointer by ADJUSTMENT. */
7037
7038 static void
7039 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
7040 {
7041 rtx_insn *insn;
7042 machine_mode mode = aarch64_reg_save_mode (regno1);
7043
7044 if (regno2 == INVALID_REGNUM)
7045 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7046
7047 rtx reg1 = gen_rtx_REG (mode, regno1);
7048 rtx reg2 = gen_rtx_REG (mode, regno2);
7049
7050 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7051 reg2, adjustment));
7052 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
7053 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7054 RTX_FRAME_RELATED_P (insn) = 1;
7055 }
7056
7057 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
7058 adjusting it by ADJUSTMENT afterwards. */
7059
7060 static rtx
7061 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7062 HOST_WIDE_INT adjustment)
7063 {
7064 switch (mode)
7065 {
7066 case E_DImode:
7067 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
7068 GEN_INT (UNITS_PER_WORD));
7069 case E_DFmode:
7070 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
7071 GEN_INT (UNITS_PER_WORD));
7072 case E_TFmode:
7073 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7074 GEN_INT (UNITS_PER_VREG));
7075 default:
7076 gcc_unreachable ();
7077 }
7078 }
7079
7080 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7081 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7082 into CFI_OPS. */
7083
7084 static void
7085 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7086 rtx *cfi_ops)
7087 {
7088 machine_mode mode = aarch64_reg_save_mode (regno1);
7089 rtx reg1 = gen_rtx_REG (mode, regno1);
7090
7091 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7092
7093 if (regno2 == INVALID_REGNUM)
7094 {
7095 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7096 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
7097 emit_move_insn (reg1, gen_frame_mem (mode, mem));
7098 }
7099 else
7100 {
7101 rtx reg2 = gen_rtx_REG (mode, regno2);
7102 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7103 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7104 reg2, adjustment));
7105 }
7106 }
7107
7108 /* Generate and return a store pair instruction of mode MODE to store
7109 register REG1 to MEM1 and register REG2 to MEM2. */
7110
7111 static rtx
7112 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
7113 rtx reg2)
7114 {
7115 switch (mode)
7116 {
7117 case E_DImode:
7118 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
7119
7120 case E_DFmode:
7121 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
7122
7123 case E_TFmode:
7124 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7125
7126 case E_V4SImode:
7127 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
7128
7129 case E_V16QImode:
7130 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
7131
7132 default:
7133 gcc_unreachable ();
7134 }
7135 }
7136
7137 /* Generate and return a load pair instruction of mode MODE to load register
7138 REG1 from MEM1 and register REG2 from MEM2. */
7139
7140 static rtx
7141 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
7142 rtx mem2)
7143 {
7144 switch (mode)
7145 {
7146 case E_DImode:
7147 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
7148
7149 case E_DFmode:
7150 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
7151
7152 case E_TFmode:
7153 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7154
7155 case E_V4SImode:
7156 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
7157
7158 default:
7159 gcc_unreachable ();
7160 }
7161 }
7162
7163 /* Return TRUE if return address signing should be enabled for the current
7164 function, otherwise return FALSE. */
7165
7166 bool
7167 aarch64_return_address_signing_enabled (void)
7168 {
7169 /* This function should only be called after the frame has been laid out. */
7170 gcc_assert (cfun->machine->frame.laid_out);
7171
7172 /* Turn return address signing off in any function that uses
7173 __builtin_eh_return. The address passed to __builtin_eh_return
7174 is not signed, so either it would have to be signed (using the
7175 original SP) or the code path that uses it would have to avoid
7176 authenticating it. Currently eh_return introduces a return-to-anywhere
7177 gadget no matter what we do here, since it uses RET with a
7178 user-provided address. An ideal fix would be an indirect branch,
7179 which can be protected with BTI j (to some extent). */
7180 if (crtl->calls_eh_return)
7181 return false;
7182
7183 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
7184 function if its LR is pushed onto the stack. */
7185 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7186 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
7187 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
7188 }
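
/* For example, with -mbranch-protection=pac-ret (which selects the
   AARCH64_FUNCTION_NON_LEAF scope), a function that saves LR is
   typically signed with PACIASP in the prologue and authenticated with
   AUTIASP or a combined RETAA in the epilogue; see
   aarch64_expand_prologue and aarch64_expand_epilogue below.  */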
7189
7190 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
7191 bool
7192 aarch64_bti_enabled (void)
7193 {
7194 return (aarch64_enable_bti == 1);
7195 }
7196
7197 /* The caller is going to use ST1D or LD1D to save or restore an SVE
7198 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7199 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
7200
7201 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7202 or LD1D address
7203
7204 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
7205 if the variable isn't already nonnull
7206
7207 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7208 Handle this case using a temporary base register that is suitable for
7209 all offsets in that range. Use ANCHOR_REG as this base register if it
7210 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
7211
7212 static inline void
7213 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7214 rtx &anchor_reg, poly_int64 &offset,
7215 rtx &ptrue)
7216 {
7217 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7218 {
7219 /* This is the maximum valid offset of the anchor from the base.
7220 Lower values would be valid too. */
7221 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7222 if (!anchor_reg)
7223 {
7224 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7225 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7226 gen_int_mode (anchor_offset, Pmode)));
7227 }
7228 base_rtx = anchor_reg;
7229 offset -= anchor_offset;
7230 }
7231 if (!ptrue)
7232 {
7233 int pred_reg = cfun->machine->frame.spare_pred_reg;
7234 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7235 CONSTM1_RTX (VNx16BImode));
7236 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7237 }
7238 }
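
/* For example, if OFFSET is 12 * GET_MODE_SIZE (MODE), the code above
   creates (or reuses) an anchor at BASE_RTX + 16 * GET_MODE_SIZE (MODE)
   and rewrites the access as anchor - 4 * GET_MODE_SIZE (MODE), which is
   within the signed 4-bit scaled immediate range [-8, 7] accepted by
   ST1D and LD1D (see offset_4bit_signed_scaled_p below).  */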
7239
7240 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7241 is saved at BASE + OFFSET. */
7242
7243 static void
7244 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7245 rtx base, poly_int64 offset)
7246 {
7247 rtx mem = gen_frame_mem (GET_MODE (reg),
7248 plus_constant (Pmode, base, offset));
7249 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7250 }
7251
7252 /* Emit code to save the callee-saved registers from register number START
7253 to LIMIT to the stack at the location starting at offset START_OFFSET,
7254 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
7255 is true if the hard frame pointer has been set up. */
7256
7257 static void
7258 aarch64_save_callee_saves (poly_int64 start_offset,
7259 unsigned start, unsigned limit, bool skip_wb,
7260 bool hard_fp_valid_p)
7261 {
7262 rtx_insn *insn;
7263 unsigned regno;
7264 unsigned regno2;
7265 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7266
7267 for (regno = aarch64_next_callee_save (start, limit);
7268 regno <= limit;
7269 regno = aarch64_next_callee_save (regno + 1, limit))
7270 {
7271 rtx reg, mem;
7272 poly_int64 offset;
7273 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7274
7275 if (skip_wb
7276 && (regno == cfun->machine->frame.wb_candidate1
7277 || regno == cfun->machine->frame.wb_candidate2))
7278 continue;
7279
7280 if (cfun->machine->reg_is_wrapped_separately[regno])
7281 continue;
7282
7283 machine_mode mode = aarch64_reg_save_mode (regno);
7284 reg = gen_rtx_REG (mode, regno);
7285 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7286 rtx base_rtx = stack_pointer_rtx;
7287 poly_int64 sp_offset = offset;
7288
7289 HOST_WIDE_INT const_offset;
7290 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7291 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7292 offset, ptrue);
7293 else if (GP_REGNUM_P (regno)
7294 && (!offset.is_constant (&const_offset) || const_offset >= 512))
7295 {
7296 gcc_assert (known_eq (start_offset, 0));
7297 poly_int64 fp_offset
7298 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7299 if (hard_fp_valid_p)
7300 base_rtx = hard_frame_pointer_rtx;
7301 else
7302 {
7303 if (!anchor_reg)
7304 {
7305 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7306 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7307 gen_int_mode (fp_offset, Pmode)));
7308 }
7309 base_rtx = anchor_reg;
7310 }
7311 offset -= fp_offset;
7312 }
7313 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7314 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7315
7316 if (!aarch64_sve_mode_p (mode)
7317 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7318 && !cfun->machine->reg_is_wrapped_separately[regno2]
7319 && known_eq (GET_MODE_SIZE (mode),
7320 cfun->machine->frame.reg_offset[regno2]
7321 - cfun->machine->frame.reg_offset[regno]))
7322 {
7323 rtx reg2 = gen_rtx_REG (mode, regno2);
7324 rtx mem2;
7325
7326 offset += GET_MODE_SIZE (mode);
7327 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7328 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7329 reg2));
7330
7331 /* The first part of a frame-related parallel insn is
7332 always assumed to be relevant to the frame
7333 calculations; subsequent parts are only
7334 frame-related if explicitly marked. */
7335 if (aarch64_emit_cfi_for_reg_p (regno2))
7336 {
7337 if (need_cfa_note_p)
7338 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7339 sp_offset + GET_MODE_SIZE (mode));
7340 else
7341 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7342 }
7343
7344 regno = regno2;
7345 }
7346 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7347 {
7348 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7349 need_cfa_note_p = true;
7350 }
7351 else if (aarch64_sve_mode_p (mode))
7352 insn = emit_insn (gen_rtx_SET (mem, reg));
7353 else
7354 insn = emit_move_insn (mem, reg);
7355
7356 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7357 if (frame_related_p && need_cfa_note_p)
7358 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7359 }
7360 }
7361
7362 /* Emit code to restore the callee registers from register number START
7363 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7364 skipping any write-back candidates if SKIP_WB is true. Write the
7365 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7366
7367 static void
7368 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7369 unsigned limit, bool skip_wb, rtx *cfi_ops)
7370 {
7371 unsigned regno;
7372 unsigned regno2;
7373 poly_int64 offset;
7374 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7375
7376 for (regno = aarch64_next_callee_save (start, limit);
7377 regno <= limit;
7378 regno = aarch64_next_callee_save (regno + 1, limit))
7379 {
7380 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7381 if (cfun->machine->reg_is_wrapped_separately[regno])
7382 continue;
7383
7384 rtx reg, mem;
7385
7386 if (skip_wb
7387 && (regno == cfun->machine->frame.wb_candidate1
7388 || regno == cfun->machine->frame.wb_candidate2))
7389 continue;
7390
7391 machine_mode mode = aarch64_reg_save_mode (regno);
7392 reg = gen_rtx_REG (mode, regno);
7393 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7394 rtx base_rtx = stack_pointer_rtx;
7395 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7396 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7397 offset, ptrue);
7398 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7399
7400 if (!aarch64_sve_mode_p (mode)
7401 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7402 && !cfun->machine->reg_is_wrapped_separately[regno2]
7403 && known_eq (GET_MODE_SIZE (mode),
7404 cfun->machine->frame.reg_offset[regno2]
7405 - cfun->machine->frame.reg_offset[regno]))
7406 {
7407 rtx reg2 = gen_rtx_REG (mode, regno2);
7408 rtx mem2;
7409
7410 offset += GET_MODE_SIZE (mode);
7411 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7412 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7413
7414 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7415 regno = regno2;
7416 }
7417 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7418 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7419 else if (aarch64_sve_mode_p (mode))
7420 emit_insn (gen_rtx_SET (reg, mem));
7421 else
7422 emit_move_insn (reg, mem);
7423 if (frame_related_p)
7424 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7425 }
7426 }
7427
7428 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7429 of MODE. */
7430
7431 static inline bool
7432 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7433 {
7434 HOST_WIDE_INT multiple;
7435 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7436 && IN_RANGE (multiple, -8, 7));
7437 }
7438
7439 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7440 of MODE. */
7441
7442 static inline bool
7443 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7444 {
7445 HOST_WIDE_INT multiple;
7446 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7447 && IN_RANGE (multiple, 0, 63));
7448 }
7449
7450 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7451 of MODE. */
7452
7453 bool
7454 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7455 {
7456 HOST_WIDE_INT multiple;
7457 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7458 && IN_RANGE (multiple, -64, 63));
7459 }
7460
7461 /* Return true if OFFSET is a signed 9-bit value. */
7462
7463 bool
7464 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7465 poly_int64 offset)
7466 {
7467 HOST_WIDE_INT const_offset;
7468 return (offset.is_constant (&const_offset)
7469 && IN_RANGE (const_offset, -256, 255));
7470 }
7471
7472 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7473 of MODE. */
7474
7475 static inline bool
7476 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7477 {
7478 HOST_WIDE_INT multiple;
7479 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7480 && IN_RANGE (multiple, -256, 255));
7481 }
7482
7483 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7484 of MODE. */
7485
7486 static inline bool
7487 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7488 {
7489 HOST_WIDE_INT multiple;
7490 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7491 && IN_RANGE (multiple, 0, 4095));
7492 }
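
/* As a worked example, for DImode (8-byte) slots:
   offset_4bit_signed_scaled_p accepts byte offsets -64 .. 56,
   offset_9bit_signed_scaled_p accepts -2048 .. 2040 and
   offset_12bit_unsigned_scaled_p accepts 0 .. 32760 (4095 * 8), all in
   multiples of 8.  The last of these matches the unsigned scaled
   immediate form of LDR/STR and is what aarch64_get_separate_components
   below uses for non-SVE saves.  */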
7493
7494 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7495
7496 static sbitmap
7497 aarch64_get_separate_components (void)
7498 {
7499 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7500 bitmap_clear (components);
7501
7502 /* The registers we need saved to the frame. */
7503 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7504 if (aarch64_register_saved_on_entry (regno))
7505 {
7506 /* Punt on saves and restores that use ST1D and LD1D. We could
7507 try to be smarter, but it would involve making sure that the
7508 spare predicate register itself is safe to use at the save
7509 and restore points. Also, when a frame pointer is being used,
7510 the slots are often out of reach of ST1D and LD1D anyway. */
7511 machine_mode mode = aarch64_reg_save_mode (regno);
7512 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7513 continue;
7514
7515 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7516
7517 /* If the register is saved in the first SVE save slot, we use
7518 it as a stack probe for -fstack-clash-protection. */
7519 if (flag_stack_clash_protection
7520 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7521 && known_eq (offset, 0))
7522 continue;
7523
7524 /* Get the offset relative to the register we'll use. */
7525 if (frame_pointer_needed)
7526 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7527 else
7528 offset += crtl->outgoing_args_size;
7529
7530 /* Check that we can access the stack slot of the register with one
7531 direct load with no adjustments needed. */
7532 if (aarch64_sve_mode_p (mode)
7533 ? offset_9bit_signed_scaled_p (mode, offset)
7534 : offset_12bit_unsigned_scaled_p (mode, offset))
7535 bitmap_set_bit (components, regno);
7536 }
7537
7538 /* Don't mess with the hard frame pointer. */
7539 if (frame_pointer_needed)
7540 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7541
7542 /* If the spare predicate register used by big-endian SVE code
7543 is call-preserved, it must be saved in the main prologue
7544 before any saves that use it. */
7545 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7546 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7547
7548 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7549 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7550 /* If registers have been chosen to be stored/restored with
7551 writeback don't interfere with them to avoid having to output explicit
7552 stack adjustment instructions. */
7553 if (reg2 != INVALID_REGNUM)
7554 bitmap_clear_bit (components, reg2);
7555 if (reg1 != INVALID_REGNUM)
7556 bitmap_clear_bit (components, reg1);
7557
7558 bitmap_clear_bit (components, LR_REGNUM);
7559 bitmap_clear_bit (components, SP_REGNUM);
7560
7561 return components;
7562 }
7563
7564 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7565
7566 static sbitmap
7567 aarch64_components_for_bb (basic_block bb)
7568 {
7569 bitmap in = DF_LIVE_IN (bb);
7570 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7571 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7572
7573 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7574 bitmap_clear (components);
7575
7576 /* Clobbered registers don't generate values in any meaningful sense,
7577 since nothing after the clobber can rely on their value. And we can't
7578 say that partially-clobbered registers are unconditionally killed,
7579 because whether they're killed or not depends on the mode of the
7580 value they're holding. Thus partially call-clobbered registers
7581 appear in neither the kill set nor the gen set.
7582
7583 Check manually for any calls that clobber more of a register than the
7584 current function can. */
7585 function_abi_aggregator callee_abis;
7586 rtx_insn *insn;
7587 FOR_BB_INSNS (bb, insn)
7588 if (CALL_P (insn))
7589 callee_abis.note_callee_abi (insn_callee_abi (insn));
7590 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7591
7592 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7593 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7594 if (!fixed_regs[regno]
7595 && !crtl->abi->clobbers_full_reg_p (regno)
7596 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7597 || bitmap_bit_p (in, regno)
7598 || bitmap_bit_p (gen, regno)
7599 || bitmap_bit_p (kill, regno)))
7600 {
7601 bitmap_set_bit (components, regno);
7602
7603 /* If there is a callee-save at an adjacent offset, add it too
7604 to increase the use of LDP/STP. */
7605 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7606 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7607
7608 if (regno2 <= LAST_SAVED_REGNUM)
7609 {
7610 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7611 if (regno < regno2
7612 ? known_eq (offset + 8, offset2)
7613 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7614 bitmap_set_bit (components, regno2);
7615 }
7616 }
7617
7618 return components;
7619 }
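
/* For example, if only x19 is live in the block but x20 is saved in the
   adjacent slot 8 bytes above (and x19's slot is 16-byte aligned), x20
   is added to the set as well, so that the two components can be saved
   and restored with a single STP/LDP.  */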
7620
7621 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7622 Nothing to do for aarch64. */
7623
7624 static void
7625 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7626 {
7627 }
7628
7629 /* Return the next set bit in BMP from START onwards. Return the total number
7630 of bits in BMP if no set bit is found at or after START. */
7631
7632 static unsigned int
7633 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7634 {
7635 unsigned int nbits = SBITMAP_SIZE (bmp);
7636 if (start == nbits)
7637 return start;
7638
7639 gcc_assert (start < nbits);
7640 for (unsigned int i = start; i < nbits; i++)
7641 if (bitmap_bit_p (bmp, i))
7642 return i;
7643
7644 return nbits;
7645 }
7646
7647 /* Do the work for aarch64_emit_prologue_components and
7648 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7649 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7650 for these components or the epilogue sequence. That is, it determines
7651 whether we should emit stores or loads and what kind of CFA notes to attach
7652 to the insns. Otherwise the logic for the two sequences is very
7653 similar. */
7654
7655 static void
7656 aarch64_process_components (sbitmap components, bool prologue_p)
7657 {
7658 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7659 ? HARD_FRAME_POINTER_REGNUM
7660 : STACK_POINTER_REGNUM);
7661
7662 unsigned last_regno = SBITMAP_SIZE (components);
7663 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7664 rtx_insn *insn = NULL;
7665
7666 while (regno != last_regno)
7667 {
7668 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7669 machine_mode mode = aarch64_reg_save_mode (regno);
7670
7671 rtx reg = gen_rtx_REG (mode, regno);
7672 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7673 if (frame_pointer_needed)
7674 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7675 else
7676 offset += crtl->outgoing_args_size;
7677
7678 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7679 rtx mem = gen_frame_mem (mode, addr);
7680
7681 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7682 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7683 /* No more registers to handle after REGNO.
7684 Emit a single save/restore and exit. */
7685 if (regno2 == last_regno)
7686 {
7687 insn = emit_insn (set);
7688 if (frame_related_p)
7689 {
7690 RTX_FRAME_RELATED_P (insn) = 1;
7691 if (prologue_p)
7692 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7693 else
7694 add_reg_note (insn, REG_CFA_RESTORE, reg);
7695 }
7696 break;
7697 }
7698
7699 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7700 /* The next register is not of the same class or its offset is not
7701 mergeable with the current one into a pair. */
7702 if (aarch64_sve_mode_p (mode)
7703 || !satisfies_constraint_Ump (mem)
7704 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7705 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7706 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7707 GET_MODE_SIZE (mode)))
7708 {
7709 insn = emit_insn (set);
7710 if (frame_related_p)
7711 {
7712 RTX_FRAME_RELATED_P (insn) = 1;
7713 if (prologue_p)
7714 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7715 else
7716 add_reg_note (insn, REG_CFA_RESTORE, reg);
7717 }
7718
7719 regno = regno2;
7720 continue;
7721 }
7722
7723 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7724
7725 /* REGNO2 can be saved/restored in a pair with REGNO. */
7726 rtx reg2 = gen_rtx_REG (mode, regno2);
7727 if (frame_pointer_needed)
7728 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7729 else
7730 offset2 += crtl->outgoing_args_size;
7731 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7732 rtx mem2 = gen_frame_mem (mode, addr2);
7733 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7734 : gen_rtx_SET (reg2, mem2);
7735
7736 if (prologue_p)
7737 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7738 else
7739 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7740
7741 if (frame_related_p || frame_related2_p)
7742 {
7743 RTX_FRAME_RELATED_P (insn) = 1;
7744 if (prologue_p)
7745 {
7746 if (frame_related_p)
7747 add_reg_note (insn, REG_CFA_OFFSET, set);
7748 if (frame_related2_p)
7749 add_reg_note (insn, REG_CFA_OFFSET, set2);
7750 }
7751 else
7752 {
7753 if (frame_related_p)
7754 add_reg_note (insn, REG_CFA_RESTORE, reg);
7755 if (frame_related2_p)
7756 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7757 }
7758 }
7759
7760 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7761 }
7762 }
7763
7764 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7765
7766 static void
7767 aarch64_emit_prologue_components (sbitmap components)
7768 {
7769 aarch64_process_components (components, true);
7770 }
7771
7772 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7773
7774 static void
7775 aarch64_emit_epilogue_components (sbitmap components)
7776 {
7777 aarch64_process_components (components, false);
7778 }
7779
7780 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7781
7782 static void
7783 aarch64_set_handled_components (sbitmap components)
7784 {
7785 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7786 if (bitmap_bit_p (components, regno))
7787 cfun->machine->reg_is_wrapped_separately[regno] = true;
7788 }
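
/* Taken together, the hooks above let the shrink-wrapping pass treat
   individual callee-saved registers as separately wrapped components:
   aarch64_get_separate_components decides which saves qualify,
   aarch64_components_for_bb reports which of them each basic block
   needs, and aarch64_process_components emits the corresponding saves
   and restores on only those paths rather than in the main prologue
   and epilogue.  */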
7789
7790 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7791 determine the probe offset for alloca. */
7792
7793 static HOST_WIDE_INT
7794 aarch64_stack_clash_protection_alloca_probe_range (void)
7795 {
7796 return STACK_CLASH_CALLER_GUARD;
7797 }
7798
7799
7800 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7801 registers. If POLY_SIZE is not large enough to require a probe, this
7802 function will only adjust the stack. FRAME_RELATED_P indicates whether the
7803 allocation is frame related. FINAL_ADJUSTMENT_P indicates whether we are
7804 allocating the space for the outgoing arguments. If we are, then we ensure
7805 that any allocation larger than the ABI-defined buffer is probed, so that
7806 the invariant of having a 1KB buffer for the next callee is
7807 maintained.
7808
7809 We emit barriers after each stack adjustment to prevent optimizations from
7810 breaking the invariant that we never drop the stack more than a page. This
7811 invariant is needed to make it easier to correctly handle asynchronous
7812 events: if we were to allow the stack to be dropped by more than a page
7813 and then catch up with multiple probes, a signal taken somewhere in
7814 between would leave the signal handler not knowing the state of the
7815 stack and unable to assume anything about which pages have been probed. */
7816
7817 static void
7818 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7819 poly_int64 poly_size,
7820 bool frame_related_p,
7821 bool final_adjustment_p)
7822 {
7823 HOST_WIDE_INT guard_size
7824 = 1 << param_stack_clash_protection_guard_size;
7825 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7826 HOST_WIDE_INT min_probe_threshold
7827 = (final_adjustment_p
7828 ? guard_used_by_caller
7829 : guard_size - guard_used_by_caller);
7830 /* When doing the final adjustment for the outgoing arguments, take into
7831 account any unprobed space there is above the current SP. There are
7832 two cases:
7833
7834 - When saving SVE registers below the hard frame pointer, we force
7835 the lowest save to take place in the prologue before doing the final
7836 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7837 This acts as a probe at SP, so there is no unprobed space.
7838
7839 - When there are no SVE register saves, we use the store of the link
7840 register as a probe. We can't assume that LR was saved at position 0
7841 though, so treat any space below it as unprobed. */
7842 if (final_adjustment_p
7843 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7844 {
7845 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7846 if (known_ge (lr_offset, 0))
7847 min_probe_threshold -= lr_offset.to_constant ();
7848 else
7849 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7850 }
7851
7852 poly_int64 frame_size = cfun->machine->frame.frame_size;
7853
7854 /* We should always have a positive probe threshold. */
7855 gcc_assert (min_probe_threshold > 0);
7856
7857 if (flag_stack_clash_protection && !final_adjustment_p)
7858 {
7859 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7860 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7861 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7862
7863 if (known_eq (frame_size, 0))
7864 {
7865 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7866 }
7867 else if (known_lt (initial_adjust + sve_callee_adjust,
7868 guard_size - guard_used_by_caller)
7869 && known_lt (final_adjust, guard_used_by_caller))
7870 {
7871 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7872 }
7873 }
7874
7875 /* If SIZE is not large enough to require probing, just adjust the stack and
7876 exit. */
7877 if (known_lt (poly_size, min_probe_threshold)
7878 || !flag_stack_clash_protection)
7879 {
7880 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7881 return;
7882 }
7883
7884 HOST_WIDE_INT size;
7885 /* Handle the SVE non-constant case first. */
7886 if (!poly_size.is_constant (&size))
7887 {
7888 if (dump_file)
7889 {
7890 fprintf (dump_file, "Stack clash SVE prologue: ");
7891 print_dec (poly_size, dump_file);
7892 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7893 }
7894
7895 /* First calculate the number of bytes we're actually spilling. */
7896 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7897 poly_size, temp1, temp2, false, true);
7898
7899 rtx_insn *insn = get_last_insn ();
7900
7901 if (frame_related_p)
7902 {
7903 /* This is done to provide unwinding information for the stack
7904 adjustments we're about to do. However, to prevent the optimizers
7905 from removing the R11 move and leaving the CFA note (which would be
7906 very wrong), we tie the old and new stack pointers together.
7907 The tie expands to nothing, but the optimizers will not touch
7908 the instruction. */
7909 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7910 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7911 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7912
7913 /* We want the CFA independent of the stack pointer for the
7914 duration of the loop. */
7915 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7916 RTX_FRAME_RELATED_P (insn) = 1;
7917 }
7918
7919 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7920 rtx guard_const = gen_int_mode (guard_size, Pmode);
7921
7922 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7923 stack_pointer_rtx, temp1,
7924 probe_const, guard_const));
7925
7926 /* Now reset the CFA register if needed. */
7927 if (frame_related_p)
7928 {
7929 add_reg_note (insn, REG_CFA_DEF_CFA,
7930 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7931 gen_int_mode (poly_size, Pmode)));
7932 RTX_FRAME_RELATED_P (insn) = 1;
7933 }
7934
7935 return;
7936 }
7937
7938 if (dump_file)
7939 fprintf (dump_file,
7940 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7941 " bytes, probing will be required.\n", size);
7942
7943 /* Round size down to a multiple of guard_size, and calculate the
7944 residual as the difference between the original size and the rounded
7945 size. */
7946 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7947 HOST_WIDE_INT residual = size - rounded_size;
7948
7949 /* We can handle a small number of allocations/probes inline. Otherwise
7950 punt to a loop. */
7951 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7952 {
7953 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7954 {
7955 aarch64_sub_sp (NULL, temp2, guard_size, true);
7956 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7957 guard_used_by_caller));
7958 emit_insn (gen_blockage ());
7959 }
7960 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7961 }
7962 else
7963 {
7964 /* Compute the ending address. */
7965 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7966 temp1, NULL, false, true);
7967 rtx_insn *insn = get_last_insn ();
7968
7969 /* For the initial allocation, we don't have a frame pointer
7970 set up, so we always need CFI notes. If we're doing the
7971 final allocation, then we may have a frame pointer, in which
7972 case it is the CFA, otherwise we need CFI notes.
7973
7974 We can determine which allocation we are doing by looking at
7975 the value of FRAME_RELATED_P since the final allocations are not
7976 frame related. */
7977 if (frame_related_p)
7978 {
7979 /* We want the CFA independent of the stack pointer for the
7980 duration of the loop. */
7981 add_reg_note (insn, REG_CFA_DEF_CFA,
7982 plus_constant (Pmode, temp1, rounded_size));
7983 RTX_FRAME_RELATED_P (insn) = 1;
7984 }
7985
7986 /* This allocates and probes the stack. Note that this re-uses some of
7987 the existing Ada stack protection code. However we are guaranteed not
7988 to enter the non-loop or residual branches of that code.
7989
7990 The non-loop part won't be entered because if our allocation amount
7991 doesn't require a loop, the case above would handle it.
7992
7993 The residual branch won't be entered because TEMP1 is a multiple of
7994 the allocation size. The residual will always be 0. As such, the only
7995 part we are actually using from that code is the loop setup. The
7996 actual probing is done in aarch64_output_probe_stack_range. */
7997 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7998 stack_pointer_rtx, temp1));
7999
8000 /* Now reset the CFA register if needed. */
8001 if (frame_related_p)
8002 {
8003 add_reg_note (insn, REG_CFA_DEF_CFA,
8004 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8005 RTX_FRAME_RELATED_P (insn) = 1;
8006 }
8007
8008 emit_insn (gen_blockage ());
8009 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8010 }
8011
8012 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
8013 be probed. This maintains the requirement that each page is probed at
8014 least once. For initial probing we probe only if the allocation is
8015 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
8016 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
8017 GUARD_SIZE. This ensures that any allocation large enough to trigger
8018 a probe here gets at least one, and that any allocation too small for
8019 this code to emit anything will have had its page probed by the save
8020 of FP/LR, either in this function or in a callee. If we don't have any
8021 callees then we won't have more stack adjustments and so
8022 are still safe. */
8023 if (residual)
8024 {
8025 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8026 /* If we're doing final adjustments, and we've done any full page
8027 allocations then any residual needs to be probed. */
8028 if (final_adjustment_p && rounded_size != 0)
8029 min_probe_threshold = 0;
8030 /* If doing a small final adjustment, we always probe at offset 0.
8031 This is done to avoid issues when LR is not at position 0 or when
8032 the final adjustment is smaller than the probing offset. */
8033 else if (final_adjustment_p && rounded_size == 0)
8034 residual_probe_offset = 0;
8035
8036 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8037 if (residual >= min_probe_threshold)
8038 {
8039 if (dump_file)
8040 fprintf (dump_file,
8041 "Stack clash AArch64 prologue residuals: "
8042 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8043 "\n", residual);
8044
8045 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8046 residual_probe_offset));
8047 emit_insn (gen_blockage ());
8048 }
8049 }
8050 }
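
/* As an illustrative sketch (assuming the default 64 KiB guard and a
   constant allocation small enough to be unrolled), the code above
   emits a sequence along the lines of:

     sub sp, sp, 65536
     str xzr, [sp, 1024]       <-- probe at STACK_CLASH_CALLER_GUARD
     sub sp, sp, 65536
     str xzr, [sp, 1024]
     sub sp, sp, residual
     [str xzr, [sp, 1024]      if residual >= min_probe_threshold]

   with scheduling barriers after each adjustment.  */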
8051
8052 /* Return 1 if the register is used by the epilogue. We need to say the
8053 return register is used, but only after epilogue generation is complete.
8054 Note that in the case of sibcalls, the values "used by the epilogue" are
8055 considered live at the start of the called function.
8056
8057 For SIMD functions we need to return 1 for FP registers that are saved and
8058 restored by a function but are not zero in call_used_regs. If we do not do
8059 this, optimizations may remove the restore of the register. */
8060
8061 int
8062 aarch64_epilogue_uses (int regno)
8063 {
8064 if (epilogue_completed)
8065 {
8066 if (regno == LR_REGNUM)
8067 return 1;
8068 }
8069 return 0;
8070 }
8071
8072 /* AArch64 stack frames generated by this compiler look like:
8073
8074 +-------------------------------+
8075 | |
8076 | incoming stack arguments |
8077 | |
8078 +-------------------------------+
8079 | | <-- incoming stack pointer (aligned)
8080 | callee-allocated save area |
8081 | for register varargs |
8082 | |
8083 +-------------------------------+
8084 | local variables | <-- frame_pointer_rtx
8085 | |
8086 +-------------------------------+
8087 | padding | \
8088 +-------------------------------+ |
8089 | callee-saved registers | | frame.saved_regs_size
8090 +-------------------------------+ |
8091 | LR' | |
8092 +-------------------------------+ |
8093 | FP' | |
8094 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
8095 | SVE vector registers | | \
8096 +-------------------------------+ | | below_hard_fp_saved_regs_size
8097 | SVE predicate registers | / /
8098 +-------------------------------+
8099 | dynamic allocation |
8100 +-------------------------------+
8101 | padding |
8102 +-------------------------------+
8103 | outgoing stack arguments | <-- arg_pointer
8104 | |
8105 +-------------------------------+
8106 | | <-- stack_pointer_rtx (aligned)
8107
8108 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8109 but leave frame_pointer_rtx and hard_frame_pointer_rtx
8110 unchanged.
8111
8112 By default for stack-clash we assume the guard is at least 64KB, but this
8113 value is configurable to either 4KB or 64KB. We also force the guard size to
8114 be the same as the probing interval and both values are kept in sync.
8115
8116 With those assumptions the callee can allocate up to 63KB (or 3KB depending
8117 on the guard size) of stack space without probing.
8118
8119 When probing is needed, we emit a probe at the start of the prologue
8120 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8121
8122 We have to track how much space has been allocated and the only stores
8123 to the stack we track as implicit probes are the FP/LR stores.
8124
8125 For outgoing arguments we probe if the size is larger than 1KB, such that
8126 the ABI specified buffer is maintained for the next callee.
8127
8128 The following registers are reserved during frame layout and should not be
8129 used for any other purpose:
8130
8131 - r11: Used by stack clash protection when SVE is enabled, and also
8132 as an anchor register when saving and restoring registers
8133 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8134 - r14 and r15: Used for speculation tracking.
8135 - r16(IP0), r17(IP1): Used by indirect tailcalls.
8136 - r30(LR), r29(FP): Used by standard frame layout.
8137
8138 These registers must be avoided in frame layout related code unless the
8139 explicit intention is to interact with one of the features listed above. */
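
/* As an illustrative example (the exact code depends on the options and
   registers used), a small non-leaf function with no local variables and
   no outgoing stack arguments only needs to save the FP/LR pair, giving
   frame_size == 16.  This hits the "simple, small frame" case above and
   produces a prologue along the lines of:

     stp x29, x30, [sp, -16]!
     mov x29, sp

   with the matching epilogue popping the pair with writeback before
   returning.  */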
8140
8141 /* Generate the prologue instructions for entry into a function.
8142 Establish the stack frame by decreasing the stack pointer with a
8143 properly calculated size and, if necessary, create a frame record
8144 filled with the values of LR and previous frame pointer. The
8145 current FP is also set up if it is in use. */
8146
8147 void
8148 aarch64_expand_prologue (void)
8149 {
8150 poly_int64 frame_size = cfun->machine->frame.frame_size;
8151 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8152 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8153 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8154 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8155 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8156 poly_int64 below_hard_fp_saved_regs_size
8157 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8158 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8159 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8160 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
8161 rtx_insn *insn;
8162
8163 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8164 {
8165 /* Fold the SVE allocation into the initial allocation.
8166 We don't do this in aarch64_layout_arg to avoid pessimizing
8167 the epilogue code. */
8168 initial_adjust += sve_callee_adjust;
8169 sve_callee_adjust = 0;
8170 }
8171
8172 /* Sign return address for functions. */
8173 if (aarch64_return_address_signing_enabled ())
8174 {
8175 switch (aarch64_ra_sign_key)
8176 {
8177 case AARCH64_KEY_A:
8178 insn = emit_insn (gen_paciasp ());
8179 break;
8180 case AARCH64_KEY_B:
8181 insn = emit_insn (gen_pacibsp ());
8182 break;
8183 default:
8184 gcc_unreachable ();
8185 }
8186 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8187 RTX_FRAME_RELATED_P (insn) = 1;
8188 }
8189
8190 if (flag_stack_usage_info)
8191 current_function_static_stack_size = constant_lower_bound (frame_size);
8192
8193 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8194 {
8195 if (crtl->is_leaf && !cfun->calls_alloca)
8196 {
8197 if (maybe_gt (frame_size, PROBE_INTERVAL)
8198 && maybe_gt (frame_size, get_stack_check_protect ()))
8199 aarch64_emit_probe_stack_range (get_stack_check_protect (),
8200 (frame_size
8201 - get_stack_check_protect ()));
8202 }
8203 else if (maybe_gt (frame_size, 0))
8204 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
8205 }
8206
8207 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8208 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8209
8210 /* In theory we should never have both an initial adjustment
8211 and a callee save adjustment. Verify that is the case since the
8212 code below does not handle it for -fstack-clash-protection. */
8213 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8214
8215 /* Will only probe if the initial adjustment is larger than the guard
8216 less the amount of the guard reserved for use by the caller's
8217 outgoing args. */
8218 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8219 true, false);
8220
8221 if (callee_adjust != 0)
8222 aarch64_push_regs (reg1, reg2, callee_adjust);
8223
8224 /* The offset of the frame chain record (if any) from the current SP. */
8225 poly_int64 chain_offset = (initial_adjust + callee_adjust
8226 - cfun->machine->frame.hard_fp_offset);
8227 gcc_assert (known_ge (chain_offset, 0));
8228
8229 /* The offset of the bottom of the save area from the current SP. */
8230 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8231
8232 if (emit_frame_chain)
8233 {
8234 if (callee_adjust == 0)
8235 {
8236 reg1 = R29_REGNUM;
8237 reg2 = R30_REGNUM;
8238 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8239 false, false);
8240 }
8241 else
8242 gcc_assert (known_eq (chain_offset, 0));
8243 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8244 stack_pointer_rtx, chain_offset,
8245 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8246 if (frame_pointer_needed && !frame_size.is_constant ())
8247 {
8248 /* Variable-sized frames need to describe the save slot
8249 address using DW_CFA_expression rather than DW_CFA_offset.
8250 This means that, without taking further action, the
8251 locations of the registers that we've already saved would
8252 remain based on the stack pointer even after we redefine
8253 the CFA based on the frame pointer. We therefore need new
8254 DW_CFA_expressions to re-express the save slots with addresses
8255 based on the frame pointer. */
8256 rtx_insn *insn = get_last_insn ();
8257 gcc_assert (RTX_FRAME_RELATED_P (insn));
8258
8259 /* Add an explicit CFA definition if this was previously
8260 implicit. */
8261 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8262 {
8263 rtx src = plus_constant (Pmode, stack_pointer_rtx,
8264 callee_offset);
8265 add_reg_note (insn, REG_CFA_ADJUST_CFA,
8266 gen_rtx_SET (hard_frame_pointer_rtx, src));
8267 }
8268
8269 /* Change the save slot expressions for the registers that
8270 we've already saved. */
8271 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8272 hard_frame_pointer_rtx, UNITS_PER_WORD);
8273 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8274 hard_frame_pointer_rtx, 0);
8275 }
8276 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8277 }
8278
8279 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8280 callee_adjust != 0 || emit_frame_chain,
8281 emit_frame_chain);
8282 if (maybe_ne (sve_callee_adjust, 0))
8283 {
8284 gcc_assert (!flag_stack_clash_protection
8285 || known_eq (initial_adjust, 0));
8286 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8287 sve_callee_adjust,
8288 !frame_pointer_needed, false);
8289 saved_regs_offset += sve_callee_adjust;
8290 }
8291 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8292 false, emit_frame_chain);
8293 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8294 callee_adjust != 0 || emit_frame_chain,
8295 emit_frame_chain);
8296
8297 /* We may need to probe the final adjustment if it is larger than the guard
8298 that is assumed by the callee. */
8299 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8300 !frame_pointer_needed, true);
8301 }
8302
8303 /* Return TRUE if we can use a simple_return insn.
8304
8305 This function checks whether the callee-saved register area is empty,
8306 which means that no restore actions are needed. The pro_and_epilogue
8307 pass uses this to check whether shrink-wrapping is feasible. */
8308
8309 bool
8310 aarch64_use_return_insn_p (void)
8311 {
8312 if (!reload_completed)
8313 return false;
8314
8315 if (crtl->profile)
8316 return false;
8317
8318 return known_eq (cfun->machine->frame.frame_size, 0);
8319 }
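
/* For example, a leaf function that needs no register saves and no stack
   allocation has frame_size == 0, so no epilogue is needed and the
   function can return with a bare RET (a simple_return).  */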
8320
8321 /* Generate the epilogue instructions for returning from a function.
8322 This is almost exactly the reverse of the prologue sequence, except
8323 that we need to insert barriers to avoid scheduling loads that read
8324 from a deallocated stack, and we optimize the unwind records by
8325 emitting them all together if possible. */
8326 void
8327 aarch64_expand_epilogue (bool for_sibcall)
8328 {
8329 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8330 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8331 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8332 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8333 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8334 poly_int64 below_hard_fp_saved_regs_size
8335 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8336 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8337 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8338 rtx cfi_ops = NULL;
8339 rtx_insn *insn;
8340 /* A stack clash protection prologue may not have left EP0_REGNUM or
8341 EP1_REGNUM in a usable state. The same is true for allocations
8342 with an SVE component, since we then need both temporary registers
8343 for each allocation. For stack clash we are in a usable state if
8344 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8345 HOST_WIDE_INT guard_size
8346 = 1 << param_stack_clash_protection_guard_size;
8347 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8348
8349 /* We can re-use the registers when:
8350
8351 (a) the deallocation amount is the same as the corresponding
8352 allocation amount (which is false if we combine the initial
8353 and SVE callee save allocations in the prologue); and
8354
8355 (b) the allocation amount doesn't need a probe (which is false
8356 if the amount is guard_size - guard_used_by_caller or greater).
8357
8358 In such situations the register should remain live with the correct
8359 value. */
8360 bool can_inherit_p = (initial_adjust.is_constant ()
8361 && final_adjust.is_constant ()
8362 && (!flag_stack_clash_protection
8363 || (known_lt (initial_adjust,
8364 guard_size - guard_used_by_caller)
8365 && known_eq (sve_callee_adjust, 0))));
8366
8367 /* We need to add memory barrier to prevent read from deallocated stack. */
8368 bool need_barrier_p
8369 = maybe_ne (get_frame_size ()
8370 + cfun->machine->frame.saved_varargs_size, 0);
8371
8372 /* Emit a barrier to prevent loads from a deallocated stack. */
8373 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8374 || cfun->calls_alloca
8375 || crtl->calls_eh_return)
8376 {
8377 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8378 need_barrier_p = false;
8379 }
8380
8381 /* Restore the stack pointer from the frame pointer if it may not
8382 be the same as the stack pointer. */
8383 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8384 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8385 if (frame_pointer_needed
8386 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8387 /* If writeback is used when restoring callee-saves, the CFA
8388 is restored on the instruction doing the writeback. */
8389 aarch64_add_offset (Pmode, stack_pointer_rtx,
8390 hard_frame_pointer_rtx,
8391 -callee_offset - below_hard_fp_saved_regs_size,
8392 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8393 else
8394 /* The case where we need to re-use the register here is very rare, so
8395 avoid the complicated condition and just always emit a move if the
8396 immediate doesn't fit. */
8397 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8398
8399 /* Restore the vector registers before the predicate registers,
8400 so that we can use P4 as a temporary for big-endian SVE frames. */
8401 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8402 callee_adjust != 0, &cfi_ops);
8403 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8404 false, &cfi_ops);
8405 if (maybe_ne (sve_callee_adjust, 0))
8406 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8407 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8408 R0_REGNUM, R30_REGNUM,
8409 callee_adjust != 0, &cfi_ops);
8410
8411 if (need_barrier_p)
8412 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8413
8414 if (callee_adjust != 0)
8415 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8416
8417 /* If we have no register restore information, the CFA must have been
8418 defined in terms of the stack pointer since the end of the prologue. */
8419 gcc_assert (cfi_ops || !frame_pointer_needed);
8420
8421 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8422 {
8423 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
8424 insn = get_last_insn ();
8425 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8426 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8427 RTX_FRAME_RELATED_P (insn) = 1;
8428 cfi_ops = NULL;
8429 }
8430
8431   /* The liveness of EP0_REGNUM cannot be trusted across function calls either,
8432      so restrict the emit_move optimization to leaf functions.  */
8433 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8434 (!can_inherit_p || !crtl->is_leaf
8435 || df_regs_ever_live_p (EP0_REGNUM)));
8436
8437 if (cfi_ops)
8438 {
8439 /* Emit delayed restores and reset the CFA to be SP. */
8440 insn = get_last_insn ();
8441 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8442 REG_NOTES (insn) = cfi_ops;
8443 RTX_FRAME_RELATED_P (insn) = 1;
8444 }
8445
8446 /* We prefer to emit the combined return/authenticate instruction RETAA,
8447 however there are three cases in which we must instead emit an explicit
8448 authentication instruction.
8449
8450 1) Sibcalls don't return in a normal way, so if we're about to call one
8451 we must authenticate.
8452
8453 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8454 generating code for !TARGET_ARMV8_3 we can't use it and must
8455 explicitly authenticate.
8456
8457 3) On an eh_return path we make extra stack adjustments to update the
8458 canonical frame address to be the exception handler's CFA. We want
8459 to authenticate using the CFA of the function which calls eh_return.
8460 */
8461 if (aarch64_return_address_signing_enabled ()
8462 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8463 {
8464 switch (aarch64_ra_sign_key)
8465 {
8466 case AARCH64_KEY_A:
8467 insn = emit_insn (gen_autiasp ());
8468 break;
8469 case AARCH64_KEY_B:
8470 insn = emit_insn (gen_autibsp ());
8471 break;
8472 default:
8473 gcc_unreachable ();
8474 }
8475 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8476 RTX_FRAME_RELATED_P (insn) = 1;
8477 }
8478
8479 /* Stack adjustment for exception handler. */
8480 if (crtl->calls_eh_return && !for_sibcall)
8481 {
8482 /* We need to unwind the stack by the offset computed by
8483 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8484 to be SP; letting the CFA move during this adjustment
8485 is just as correct as retaining the CFA from the body
8486 of the function. Therefore, do nothing special. */
8487 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8488 }
8489
8490 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8491 if (!for_sibcall)
8492 emit_jump_insn (ret_rtx);
8493 }
8494
8495 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8496 normally or return to a previous frame after unwinding.
8497
8498 An EH return uses a single shared return sequence. The epilogue is
8499 exactly like a normal epilogue except that it has an extra input
8500 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8501 that must be applied after the frame has been destroyed. An extra label
8502 is inserted before the epilogue which initializes this register to zero,
8503 and this is the entry point for a normal return.
8504
8505 An actual EH return updates the return address, initializes the stack
8506 adjustment and jumps directly into the epilogue (bypassing the zeroing
8507 of the adjustment). Since the return address is typically saved on the
8508 stack when a function makes a call, the saved LR must be updated outside
8509 the epilogue.
8510
8511 This poses problems as the store is generated well before the epilogue,
8512 so the offset of LR is not known yet. Also optimizations will remove the
8513 store as it appears dead, even after the epilogue is generated (as the
8514 base or offset for loading LR is different in many cases).
8515
8516 To avoid these problems this implementation forces the frame pointer
8517 in eh_return functions so that the location of LR is fixed and known early.
8518 It also marks the store volatile, so no optimization is permitted to
8519 remove the store. */
8520 rtx
8521 aarch64_eh_return_handler_rtx (void)
8522 {
8523 rtx tmp = gen_frame_mem (Pmode,
8524 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8525
8526 /* Mark the store volatile, so no optimization is permitted to remove it. */
8527 MEM_VOLATILE_P (tmp) = true;
8528 return tmp;
8529 }
8530
8531 /* Output code to add DELTA to the first argument, and then jump
8532 to FUNCTION. Used for C++ multiple inheritance. */
8533 static void
8534 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8535 HOST_WIDE_INT delta,
8536 HOST_WIDE_INT vcall_offset,
8537 tree function)
8538 {
8539 /* The this pointer is always in x0. Note that this differs from
8540      Arm where the this pointer may be bumped to r1 if r0 is required
8541 to return a pointer to an aggregate. On AArch64 a result value
8542 pointer will be in x8. */
8543 int this_regno = R0_REGNUM;
8544 rtx this_rtx, temp0, temp1, addr, funexp;
8545 rtx_insn *insn;
8546 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8547
8548 if (aarch64_bti_enabled ())
8549 emit_insn (gen_bti_c());
8550
8551 reload_completed = 1;
8552 emit_note (NOTE_INSN_PROLOGUE_END);
8553
8554 this_rtx = gen_rtx_REG (Pmode, this_regno);
8555 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8556 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8557
8558 if (vcall_offset == 0)
8559 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8560 else
8561 {
8562 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8563
8564 addr = this_rtx;
8565 if (delta != 0)
8566 {
8567 if (delta >= -256 && delta < 256)
8568 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8569 plus_constant (Pmode, this_rtx, delta));
8570 else
8571 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8572 temp1, temp0, false);
8573 }
8574
8575 if (Pmode == ptr_mode)
8576 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8577 else
8578 aarch64_emit_move (temp0,
8579 gen_rtx_ZERO_EXTEND (Pmode,
8580 gen_rtx_MEM (ptr_mode, addr)));
8581
8582 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8583 addr = plus_constant (Pmode, temp0, vcall_offset);
8584 else
8585 {
8586 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8587 Pmode);
8588 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8589 }
8590
8591 if (Pmode == ptr_mode)
8592 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
8593 else
8594 aarch64_emit_move (temp1,
8595 gen_rtx_SIGN_EXTEND (Pmode,
8596 gen_rtx_MEM (ptr_mode, addr)));
8597
8598 emit_insn (gen_add2_insn (this_rtx, temp1));
8599 }
8600
8601 /* Generate a tail call to the target function. */
8602 if (!TREE_USED (function))
8603 {
8604 assemble_external (function);
8605 TREE_USED (function) = 1;
8606 }
8607 funexp = XEXP (DECL_RTL (function), 0);
8608 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8609 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8610 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8611 SIBLING_CALL_P (insn) = 1;
8612
8613 insn = get_insns ();
8614 shorten_branches (insn);
8615
8616 assemble_start_function (thunk, fnname);
8617 final_start_function (insn, file, 1);
8618 final (insn, file, 1);
8619 final_end_function ();
8620 assemble_end_function (thunk, fnname);
8621
8622 /* Stop pretending to be a post-reload pass. */
8623 reload_completed = 0;
8624 }
8625
8626 static bool
8627 aarch64_tls_referenced_p (rtx x)
8628 {
8629 if (!TARGET_HAVE_TLS)
8630 return false;
8631 subrtx_iterator::array_type array;
8632 FOR_EACH_SUBRTX (iter, array, x, ALL)
8633 {
8634 const_rtx x = *iter;
8635 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
8636 return true;
8637 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8638 TLS offsets, not real symbol references. */
8639 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8640 iter.skip_subrtxes ();
8641 }
8642 return false;
8643 }
8644
8645
8646 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8647 a left shift of 0 or 12 bits. */
8648 bool
8649 aarch64_uimm12_shift (HOST_WIDE_INT val)
8650 {
8651 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8652 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8653 );
8654 }
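
/* For example, 0xfff and 0x123000 both satisfy this check (shift 0 and
   shift 12 respectively), whereas 0x1001 does not, since its set bits do not
   fall entirely within either 12-bit field.  */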
8655
8656 /* Return the largest value no greater than VAL that fits as a 12-bit unsigned
8657    immediate with a left shift of 0 or 12.  */
8658 static HOST_WIDE_INT
8659 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8660 {
8661 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8662 handle correctly. */
8663 gcc_assert ((val & 0xffffff) == val);
8664
8665 if (((val & 0xfff) << 0) == val)
8666 return val;
8667
8668 return val & (0xfff << 12);
8669 }
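
/* As a worked example, 0x123456 does not fit in the low 12 bits alone, so the
   function returns 0x123456 & (0xfff << 12) == 0x123000, which is the largest
   representable value not exceeding the original.  */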
8670
8671 /* Return true if val is an immediate that can be loaded into a
8672 register by a MOVZ instruction. */
8673 static bool
8674 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8675 {
8676 if (GET_MODE_SIZE (mode) > 4)
8677 {
8678 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8679 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8680 	return true;
8681 }
8682 else
8683 {
8684 /* Ignore sign extension. */
8685 val &= (HOST_WIDE_INT) 0xffffffff;
8686 }
8687 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8688 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8689 }
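
/* For instance, 0x1234 and 0x12340000 are both accepted for SImode (they
   correspond to "movz wN, #0x1234" and "movz wN, #0x1234, lsl #16"), whereas
   0x12345678 is rejected because it has nonzero bits in two 16-bit chunks.  */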
8690
8691 /* Test whether:
8692
8693 X = (X & AND_VAL) | IOR_VAL;
8694
8695 can be implemented using:
8696
8697 MOVK X, #(IOR_VAL >> shift), LSL #shift
8698
8699 Return the shift if so, otherwise return -1. */
8700 int
8701 aarch64_movk_shift (const wide_int_ref &and_val,
8702 const wide_int_ref &ior_val)
8703 {
8704 unsigned int precision = and_val.get_precision ();
8705 unsigned HOST_WIDE_INT mask = 0xffff;
8706 for (unsigned int shift = 0; shift < precision; shift += 16)
8707 {
8708 if (and_val == ~mask && (ior_val & mask) == ior_val)
8709 return shift;
8710 mask <<= 16;
8711 }
8712 return -1;
8713 }
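
/* For example, with AND_VAL == ~0xffff0000 and IOR_VAL == 0x12340000 in DImode
   the function returns 16, matching "movk xN, #0x1234, lsl #16".  If the set
   bits of IOR_VAL straddle a 16-bit boundary, no single MOVK can insert them
   and -1 is returned.  */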
8714
8715 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
8716 64-bit (DImode) integer. */
8717
8718 static unsigned HOST_WIDE_INT
8719 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8720 {
8721 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8722 while (size < 64)
8723 {
8724 val &= (HOST_WIDE_INT_1U << size) - 1;
8725 val |= val << size;
8726 size *= 2;
8727 }
8728 return val;
8729 }
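
/* For example, the HImode value 0x8f replicates to 0x008f008f008f008f and
   the SImode value 0xf000000f replicates to 0xf000000ff000000f.  */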
8730
8731 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8732
8733 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8734 {
8735 0x0000000100000001ull,
8736 0x0001000100010001ull,
8737 0x0101010101010101ull,
8738 0x1111111111111111ull,
8739 0x5555555555555555ull,
8740 };
8741
8742
8743 /* Return true if val is a valid bitmask immediate. */
8744
8745 bool
8746 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8747 {
8748 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8749 int bits;
8750
8751 /* Check for a single sequence of one bits and return quickly if so.
8752      The special cases of all ones and all zeroes return false.  */
8753 val = aarch64_replicate_bitmask_imm (val_in, mode);
8754 tmp = val + (val & -val);
8755
8756 if (tmp == (tmp & -tmp))
8757 return (val + 1) > 1;
8758
8759 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8760 if (mode == SImode)
8761 val = (val << 32) | (val & 0xffffffff);
8762
8763 /* Invert if the immediate doesn't start with a zero bit - this means we
8764 only need to search for sequences of one bits. */
8765 if (val & 1)
8766 val = ~val;
8767
8768 /* Find the first set bit and set tmp to val with the first sequence of one
8769 bits removed. Return success if there is a single sequence of ones. */
8770 first_one = val & -val;
8771 tmp = val & (val + first_one);
8772
8773 if (tmp == 0)
8774 return true;
8775
8776 /* Find the next set bit and compute the difference in bit position. */
8777 next_one = tmp & -tmp;
8778 bits = clz_hwi (first_one) - clz_hwi (next_one);
8779 mask = val ^ tmp;
8780
8781 /* Check the bit position difference is a power of 2, and that the first
8782 sequence of one bits fits within 'bits' bits. */
8783 if ((mask >> bits) != 0 || bits != (bits & -bits))
8784 return false;
8785
8786 /* Check the sequence of one bits is repeated 64/bits times. */
8787 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8788 }
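
/* For instance, 0x0000ffff0000ffff is accepted in DImode (a run of 16 ones
   repeated with a 32-bit period), while 0x0000fffe0000ffff is rejected
   because the two runs are not identical.  */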
8789
8790 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8791    Assumed precondition: VAL_IN is not zero.  */
8792
8793 unsigned HOST_WIDE_INT
8794 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8795 {
8796 int lowest_bit_set = ctz_hwi (val_in);
8797 int highest_bit_set = floor_log2 (val_in);
8798 gcc_assert (val_in != 0);
8799
8800 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8801 (HOST_WIDE_INT_1U << lowest_bit_set));
8802 }
8803
8804 /* Create a constant in which all bits outside the range from the lowest set
8805    bit to the highest set bit of VAL_IN are set to 1.  */
8806
8807 unsigned HOST_WIDE_INT
8808 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8809 {
8810 return val_in | ~aarch64_and_split_imm1 (val_in);
8811 }
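
/* As an illustration, for VAL_IN == 0x0ff00ff0, aarch64_and_split_imm1
   returns 0x0ffffff0 (a mask covering the span of set bits) and
   aarch64_and_split_imm2 returns 0xfffffffffff00fff, so that
   imm1 & imm2 == VAL_IN and an AND with VAL_IN can be performed as two ANDs
   with encodable bitmask immediates.  */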
8812
8813 /* Return true if VAL_IN is not a valid immediate for a single instruction
8814    but an AND with VAL_IN can instead be performed as two ANDs with bitmask
8815    immediates.  */
8814
8815 bool
8816 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8817 {
8818 scalar_int_mode int_mode;
8819 if (!is_a <scalar_int_mode> (mode, &int_mode))
8820 return false;
8821
8822 if (aarch64_bitmask_imm (val_in, int_mode))
8823 return false;
8824
8825 if (aarch64_move_imm (val_in, int_mode))
8826 return false;
8827
8828 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8829
8830 return aarch64_bitmask_imm (imm2, int_mode);
8831 }
8832
8833 /* Return true if val is an immediate that can be loaded into a
8834 register in a single instruction. */
8835 bool
8836 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8837 {
8838 scalar_int_mode int_mode;
8839 if (!is_a <scalar_int_mode> (mode, &int_mode))
8840 return false;
8841
8842 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8843     return true;
8844 return aarch64_bitmask_imm (val, int_mode);
8845 }
8846
8847 static bool
8848 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8849 {
8850 if (GET_CODE (x) == HIGH)
8851 return true;
8852
8853 /* There's no way to calculate VL-based values using relocations. */
8854 subrtx_iterator::array_type array;
8855 FOR_EACH_SUBRTX (iter, array, x, ALL)
8856 if (GET_CODE (*iter) == CONST_POLY_INT)
8857 return true;
8858
8859 poly_int64 offset;
8860 rtx base = strip_offset_and_salt (x, &offset);
8861 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
8862 {
8863 /* We checked for POLY_INT_CST offsets above. */
8864 if (aarch64_classify_symbol (base, offset.to_constant ())
8865 != SYMBOL_FORCE_TO_MEM)
8866 return true;
8867 else
8868 /* Avoid generating a 64-bit relocation in ILP32; leave
8869 to aarch64_expand_mov_immediate to handle it properly. */
8870 return mode != ptr_mode;
8871 }
8872
8873 return aarch64_tls_referenced_p (x);
8874 }
8875
8876 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8877 The expansion for a table switch is quite expensive due to the number
8878    of instructions, the table lookup and the hard-to-predict indirect jump.
8879    When optimizing for speed at -O3 or above, use the per-core tuning if
8880 set, otherwise use tables for > 16 cases as a tradeoff between size and
8881 performance. When optimizing for size, use the default setting. */
8882
8883 static unsigned int
8884 aarch64_case_values_threshold (void)
8885 {
8886 /* Use the specified limit for the number of cases before using jump
8887 tables at higher optimization levels. */
8888 if (optimize > 2
8889 && selected_cpu->tune->max_case_values != 0)
8890 return selected_cpu->tune->max_case_values;
8891 else
8892 return optimize_size ? default_case_values_threshold () : 17;
8893 }
8894
8895 /* Return true if register REGNO is a valid index register.
8896 STRICT_P is true if REG_OK_STRICT is in effect. */
8897
8898 bool
8899 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8900 {
8901 if (!HARD_REGISTER_NUM_P (regno))
8902 {
8903 if (!strict_p)
8904 return true;
8905
8906 if (!reg_renumber)
8907 return false;
8908
8909 regno = reg_renumber[regno];
8910 }
8911 return GP_REGNUM_P (regno);
8912 }
8913
8914 /* Return true if register REGNO is a valid base register.
8915 STRICT_P is true if REG_OK_STRICT is in effect. */
8916
8917 bool
8918 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8919 {
8920 if (!HARD_REGISTER_NUM_P (regno))
8921 {
8922 if (!strict_p)
8923 return true;
8924
8925 if (!reg_renumber)
8926 return false;
8927
8928 regno = reg_renumber[regno];
8929 }
8930
8931 /* The fake registers will be eliminated to either the stack or
8932 hard frame pointer, both of which are usually valid base registers.
8933 Reload deals with the cases where the eliminated form isn't valid. */
8934 return (GP_REGNUM_P (regno)
8935 || regno == SP_REGNUM
8936 || regno == FRAME_POINTER_REGNUM
8937 || regno == ARG_POINTER_REGNUM);
8938 }
8939
8940 /* Return true if X is a valid base register.
8941 STRICT_P is true if REG_OK_STRICT is in effect. */
8942
8943 static bool
8944 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8945 {
8946 if (!strict_p
8947 && SUBREG_P (x)
8948 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8949 x = SUBREG_REG (x);
8950
8951 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8952 }
8953
8954 /* Return true if address offset is a valid index. If it is, fill in INFO
8955 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8956
8957 static bool
8958 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8959 machine_mode mode, bool strict_p)
8960 {
8961 enum aarch64_address_type type;
8962 rtx index;
8963 int shift;
8964
8965 /* (reg:P) */
8966 if ((REG_P (x) || SUBREG_P (x))
8967 && GET_MODE (x) == Pmode)
8968 {
8969 type = ADDRESS_REG_REG;
8970 index = x;
8971 shift = 0;
8972 }
8973 /* (sign_extend:DI (reg:SI)) */
8974 else if ((GET_CODE (x) == SIGN_EXTEND
8975 || GET_CODE (x) == ZERO_EXTEND)
8976 && GET_MODE (x) == DImode
8977 && GET_MODE (XEXP (x, 0)) == SImode)
8978 {
8979 type = (GET_CODE (x) == SIGN_EXTEND)
8980 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8981 index = XEXP (x, 0);
8982 shift = 0;
8983 }
8984 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8985 else if (GET_CODE (x) == MULT
8986 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8987 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8988 && GET_MODE (XEXP (x, 0)) == DImode
8989 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8990 && CONST_INT_P (XEXP (x, 1)))
8991 {
8992 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8993 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8994 index = XEXP (XEXP (x, 0), 0);
8995 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8996 }
8997 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8998 else if (GET_CODE (x) == ASHIFT
8999 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9000 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9001 && GET_MODE (XEXP (x, 0)) == DImode
9002 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9003 && CONST_INT_P (XEXP (x, 1)))
9004 {
9005 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9006 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9007 index = XEXP (XEXP (x, 0), 0);
9008 shift = INTVAL (XEXP (x, 1));
9009 }
9010 /* (and:DI (mult:DI (reg:DI) (const_int scale))
9011 (const_int 0xffffffff<<shift)) */
9012 else if (GET_CODE (x) == AND
9013 && GET_MODE (x) == DImode
9014 && GET_CODE (XEXP (x, 0)) == MULT
9015 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9016 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9017 && CONST_INT_P (XEXP (x, 1)))
9018 {
9019 type = ADDRESS_REG_UXTW;
9020 index = XEXP (XEXP (x, 0), 0);
9021 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9022 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9023 shift = -1;
9024 }
9025 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9026 (const_int 0xffffffff<<shift)) */
9027 else if (GET_CODE (x) == AND
9028 && GET_MODE (x) == DImode
9029 && GET_CODE (XEXP (x, 0)) == ASHIFT
9030 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9031 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9032 && CONST_INT_P (XEXP (x, 1)))
9033 {
9034 type = ADDRESS_REG_UXTW;
9035 index = XEXP (XEXP (x, 0), 0);
9036 shift = INTVAL (XEXP (XEXP (x, 0), 1));
9037 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9038 shift = -1;
9039 }
9040 /* (mult:P (reg:P) (const_int scale)) */
9041 else if (GET_CODE (x) == MULT
9042 && GET_MODE (x) == Pmode
9043 && GET_MODE (XEXP (x, 0)) == Pmode
9044 && CONST_INT_P (XEXP (x, 1)))
9045 {
9046 type = ADDRESS_REG_REG;
9047 index = XEXP (x, 0);
9048 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9049 }
9050 /* (ashift:P (reg:P) (const_int shift)) */
9051 else if (GET_CODE (x) == ASHIFT
9052 && GET_MODE (x) == Pmode
9053 && GET_MODE (XEXP (x, 0)) == Pmode
9054 && CONST_INT_P (XEXP (x, 1)))
9055 {
9056 type = ADDRESS_REG_REG;
9057 index = XEXP (x, 0);
9058 shift = INTVAL (XEXP (x, 1));
9059 }
9060 else
9061 return false;
9062
9063 if (!strict_p
9064 && SUBREG_P (index)
9065 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
9066 index = SUBREG_REG (index);
9067
9068 if (aarch64_sve_data_mode_p (mode))
9069 {
9070 if (type != ADDRESS_REG_REG
9071 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9072 return false;
9073 }
9074 else
9075 {
9076 if (shift != 0
9077 && !(IN_RANGE (shift, 1, 3)
9078 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9079 return false;
9080 }
9081
9082 if (REG_P (index)
9083 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9084 {
9085 info->type = type;
9086 info->offset = index;
9087 info->shift = shift;
9088 return true;
9089 }
9090
9091 return false;
9092 }
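
/* For example, when Pmode is DImode, the index expression
   (ashift:DI (reg:DI xN) (const_int 3)) used for a DImode access is
   classified as ADDRESS_REG_REG with shift 3, corresponding to an address
   operand such as [xM, xN, lsl #3].  */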
9093
9094 /* Return true if MODE is one of the modes for which we
9095 support LDP/STP operations. */
9096
9097 static bool
9098 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9099 {
9100 return mode == SImode || mode == DImode
9101 || mode == SFmode || mode == DFmode
9102 || (aarch64_vector_mode_supported_p (mode)
9103 && (known_eq (GET_MODE_SIZE (mode), 8)
9104 || (known_eq (GET_MODE_SIZE (mode), 16)
9105 && (aarch64_tune_params.extra_tuning_flags
9106 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
9107 }
9108
9109 /* Return true if REGNO is a virtual pointer register, or an eliminable
9110 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
9111 include stack_pointer or hard_frame_pointer. */
9112 static bool
9113 virt_or_elim_regno_p (unsigned regno)
9114 {
9115 return ((regno >= FIRST_VIRTUAL_REGISTER
9116 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9117 || regno == FRAME_POINTER_REGNUM
9118 || regno == ARG_POINTER_REGNUM);
9119 }
9120
9121 /* Return true if X is a valid address of type TYPE for machine mode MODE.
9122 If it is, fill in INFO appropriately. STRICT_P is true if
9123 REG_OK_STRICT is in effect. */
9124
9125 bool
9126 aarch64_classify_address (struct aarch64_address_info *info,
9127 rtx x, machine_mode mode, bool strict_p,
9128 aarch64_addr_query_type type)
9129 {
9130 enum rtx_code code = GET_CODE (x);
9131 rtx op0, op1;
9132 poly_int64 offset;
9133
9134 HOST_WIDE_INT const_size;
9135
9136 /* Whether a vector mode is partial doesn't affect address legitimacy.
9137 Partial vectors like VNx8QImode allow the same indexed addressing
9138 mode and MUL VL addressing mode as full vectors like VNx16QImode;
9139 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
9140 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9141 vec_flags &= ~VEC_PARTIAL;
9142
9143 /* On BE, we use load/store pair for all large int mode load/stores.
9144 TI/TFmode may also use a load/store pair. */
9145 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
9146 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
9147 || type == ADDR_QUERY_LDP_STP_N
9148 || mode == TImode
9149 || mode == TFmode
9150 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
9151
9152   /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
9153      to the actual size of the memory being loaded/stored and the mode used
9154      for the addressing is half of that size.  */
9155 if (type == ADDR_QUERY_LDP_STP_N
9156 && known_eq (GET_MODE_SIZE (mode), 16))
9157 mode = DFmode;
9158
9159 bool allow_reg_index_p = (!load_store_pair_p
9160 && (known_lt (GET_MODE_SIZE (mode), 16)
9161 || vec_flags == VEC_ADVSIMD
9162 || vec_flags & VEC_SVE_DATA));
9163
9164 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9165 [Rn, #offset, MUL VL]. */
9166 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9167 && (code != REG && code != PLUS))
9168 return false;
9169
9170 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9171 REG addressing. */
9172 if (advsimd_struct_p
9173 && !BYTES_BIG_ENDIAN
9174 && (code != POST_INC && code != REG))
9175 return false;
9176
9177 gcc_checking_assert (GET_MODE (x) == VOIDmode
9178 || SCALAR_INT_MODE_P (GET_MODE (x)));
9179
9180 switch (code)
9181 {
9182 case REG:
9183 case SUBREG:
9184 info->type = ADDRESS_REG_IMM;
9185 info->base = x;
9186 info->offset = const0_rtx;
9187 info->const_offset = 0;
9188 return aarch64_base_register_rtx_p (x, strict_p);
9189
9190 case PLUS:
9191 op0 = XEXP (x, 0);
9192 op1 = XEXP (x, 1);
9193
9194 if (! strict_p
9195 && REG_P (op0)
9196 && virt_or_elim_regno_p (REGNO (op0))
9197 && poly_int_rtx_p (op1, &offset))
9198 {
9199 info->type = ADDRESS_REG_IMM;
9200 info->base = op0;
9201 info->offset = op1;
9202 info->const_offset = offset;
9203
9204 return true;
9205 }
9206
9207 if (maybe_ne (GET_MODE_SIZE (mode), 0)
9208 && aarch64_base_register_rtx_p (op0, strict_p)
9209 && poly_int_rtx_p (op1, &offset))
9210 {
9211 info->type = ADDRESS_REG_IMM;
9212 info->base = op0;
9213 info->offset = op1;
9214 info->const_offset = offset;
9215
9216 /* TImode and TFmode values are allowed in both pairs of X
9217 registers and individual Q registers. The available
9218 address modes are:
9219 X,X: 7-bit signed scaled offset
9220 Q: 9-bit signed offset
9221 We conservatively require an offset representable in either mode.
9222 When performing the check for pairs of X registers i.e. LDP/STP
9223 pass down DImode since that is the natural size of the LDP/STP
9224 instruction memory accesses. */
9225 if (mode == TImode || mode == TFmode)
9226 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9227 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9228 || offset_12bit_unsigned_scaled_p (mode, offset)));
9229
9230 	  /* A 7-bit offset check because OImode will emit an ldp/stp
9231 instruction (only big endian will get here).
9232 For ldp/stp instructions, the offset is scaled for the size of a
9233 single element of the pair. */
9234 if (mode == OImode)
9235 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9236
9237 	  /* Three 9/12-bit offset checks because CImode will emit three
9238 ldr/str instructions (only big endian will get here). */
9239 if (mode == CImode)
9240 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9241 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9242 offset + 32)
9243 || offset_12bit_unsigned_scaled_p (V16QImode,
9244 offset + 32)));
9245
9246 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
9247 instructions (only big endian will get here). */
9248 if (mode == XImode)
9249 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9250 && aarch64_offset_7bit_signed_scaled_p (TImode,
9251 offset + 32));
9252
9253 /* Make "m" use the LD1 offset range for SVE data modes, so
9254 that pre-RTL optimizers like ivopts will work to that
9255 instead of the wider LDR/STR range. */
9256 if (vec_flags == VEC_SVE_DATA)
9257 return (type == ADDR_QUERY_M
9258 ? offset_4bit_signed_scaled_p (mode, offset)
9259 : offset_9bit_signed_scaled_p (mode, offset));
9260
9261 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9262 {
9263 poly_int64 end_offset = (offset
9264 + GET_MODE_SIZE (mode)
9265 - BYTES_PER_SVE_VECTOR);
9266 return (type == ADDR_QUERY_M
9267 ? offset_4bit_signed_scaled_p (mode, offset)
9268 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9269 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9270 end_offset)));
9271 }
9272
9273 if (vec_flags == VEC_SVE_PRED)
9274 return offset_9bit_signed_scaled_p (mode, offset);
9275
9276 if (load_store_pair_p)
9277 return ((known_eq (GET_MODE_SIZE (mode), 4)
9278 || known_eq (GET_MODE_SIZE (mode), 8)
9279 || known_eq (GET_MODE_SIZE (mode), 16))
9280 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9281 else
9282 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9283 || offset_12bit_unsigned_scaled_p (mode, offset));
9284 }
9285
9286 if (allow_reg_index_p)
9287 {
9288 /* Look for base + (scaled/extended) index register. */
9289 if (aarch64_base_register_rtx_p (op0, strict_p)
9290 && aarch64_classify_index (info, op1, mode, strict_p))
9291 {
9292 info->base = op0;
9293 return true;
9294 }
9295 if (aarch64_base_register_rtx_p (op1, strict_p)
9296 && aarch64_classify_index (info, op0, mode, strict_p))
9297 {
9298 info->base = op1;
9299 return true;
9300 }
9301 }
9302
9303 return false;
9304
9305 case POST_INC:
9306 case POST_DEC:
9307 case PRE_INC:
9308 case PRE_DEC:
9309 info->type = ADDRESS_REG_WB;
9310 info->base = XEXP (x, 0);
9311 info->offset = NULL_RTX;
9312 return aarch64_base_register_rtx_p (info->base, strict_p);
9313
9314 case POST_MODIFY:
9315 case PRE_MODIFY:
9316 info->type = ADDRESS_REG_WB;
9317 info->base = XEXP (x, 0);
9318 if (GET_CODE (XEXP (x, 1)) == PLUS
9319 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9320 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9321 && aarch64_base_register_rtx_p (info->base, strict_p))
9322 {
9323 info->offset = XEXP (XEXP (x, 1), 1);
9324 info->const_offset = offset;
9325
9326 /* TImode and TFmode values are allowed in both pairs of X
9327 registers and individual Q registers. The available
9328 address modes are:
9329 X,X: 7-bit signed scaled offset
9330 Q: 9-bit signed offset
9331 We conservatively require an offset representable in either mode.
9332 */
9333 if (mode == TImode || mode == TFmode)
9334 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9335 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9336
9337 if (load_store_pair_p)
9338 return ((known_eq (GET_MODE_SIZE (mode), 4)
9339 || known_eq (GET_MODE_SIZE (mode), 8)
9340 || known_eq (GET_MODE_SIZE (mode), 16))
9341 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9342 else
9343 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9344 }
9345 return false;
9346
9347 case CONST:
9348 case SYMBOL_REF:
9349 case LABEL_REF:
9350 /* load literal: pc-relative constant pool entry. Only supported
9351 for SI mode or larger. */
9352 info->type = ADDRESS_SYMBOLIC;
9353
9354 if (!load_store_pair_p
9355 && GET_MODE_SIZE (mode).is_constant (&const_size)
9356 && const_size >= 4)
9357 {
9358 poly_int64 offset;
9359 rtx sym = strip_offset_and_salt (x, &offset);
9360 return ((LABEL_REF_P (sym)
9361 || (SYMBOL_REF_P (sym)
9362 && CONSTANT_POOL_ADDRESS_P (sym)
9363 && aarch64_pcrelative_literal_loads)));
9364 }
9365 return false;
9366
9367 case LO_SUM:
9368 info->type = ADDRESS_LO_SUM;
9369 info->base = XEXP (x, 0);
9370 info->offset = XEXP (x, 1);
9371 if (allow_reg_index_p
9372 && aarch64_base_register_rtx_p (info->base, strict_p))
9373 {
9374 poly_int64 offset;
9375 HOST_WIDE_INT const_offset;
9376 rtx sym = strip_offset_and_salt (info->offset, &offset);
9377 if (SYMBOL_REF_P (sym)
9378 && offset.is_constant (&const_offset)
9379 && (aarch64_classify_symbol (sym, const_offset)
9380 == SYMBOL_SMALL_ABSOLUTE))
9381 {
9382 /* The symbol and offset must be aligned to the access size. */
9383 unsigned int align;
9384
9385 if (CONSTANT_POOL_ADDRESS_P (sym))
9386 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9387 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9388 {
9389 tree exp = SYMBOL_REF_DECL (sym);
9390 align = TYPE_ALIGN (TREE_TYPE (exp));
9391 align = aarch64_constant_alignment (exp, align);
9392 }
9393 else if (SYMBOL_REF_DECL (sym))
9394 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9395 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9396 && SYMBOL_REF_BLOCK (sym) != NULL)
9397 align = SYMBOL_REF_BLOCK (sym)->alignment;
9398 else
9399 align = BITS_PER_UNIT;
9400
9401 poly_int64 ref_size = GET_MODE_SIZE (mode);
9402 if (known_eq (ref_size, 0))
9403 ref_size = GET_MODE_SIZE (DImode);
9404
9405 return (multiple_p (const_offset, ref_size)
9406 && multiple_p (align / BITS_PER_UNIT, ref_size));
9407 }
9408 }
9409 return false;
9410
9411 default:
9412 return false;
9413 }
9414 }
9415
9416 /* Return true if the address X is valid for a PRFM instruction.
9417 STRICT_P is true if we should do strict checking with
9418 aarch64_classify_address. */
9419
9420 bool
9421 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9422 {
9423 struct aarch64_address_info addr;
9424
9425 /* PRFM accepts the same addresses as DImode... */
9426 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9427 if (!res)
9428 return false;
9429
9430 /* ... except writeback forms. */
9431 return addr.type != ADDRESS_REG_WB;
9432 }
9433
9434 bool
9435 aarch64_symbolic_address_p (rtx x)
9436 {
9437 poly_int64 offset;
9438 x = strip_offset_and_salt (x, &offset);
9439 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
9440 }
9441
9442 /* Classify the base of symbolic expression X. */
9443
9444 enum aarch64_symbol_type
9445 aarch64_classify_symbolic_expression (rtx x)
9446 {
9447 rtx offset;
9448
9449 split_const (x, &x, &offset);
9450 return aarch64_classify_symbol (x, INTVAL (offset));
9451 }
9452
9453
9454 /* Return TRUE if X is a legitimate address for accessing memory in
9455 mode MODE. */
9456 static bool
9457 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9458 {
9459 struct aarch64_address_info addr;
9460
9461 return aarch64_classify_address (&addr, x, mode, strict_p);
9462 }
9463
9464 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9465 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
9466 bool
9467 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9468 aarch64_addr_query_type type)
9469 {
9470 struct aarch64_address_info addr;
9471
9472 return aarch64_classify_address (&addr, x, mode, strict_p, type);
9473 }
9474
9475 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9476
9477 static bool
9478 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9479 poly_int64 orig_offset,
9480 machine_mode mode)
9481 {
9482 HOST_WIDE_INT size;
9483 if (GET_MODE_SIZE (mode).is_constant (&size))
9484 {
9485 HOST_WIDE_INT const_offset, second_offset;
9486
9487 /* A general SVE offset is A * VQ + B. Remove the A component from
9488 coefficient 0 in order to get the constant B. */
9489 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9490
9491 /* Split an out-of-range address displacement into a base and
9492 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9493 range otherwise to increase opportunities for sharing the base
9494 address of different sizes. Unaligned accesses use the signed
9495 9-bit range, TImode/TFmode use the intersection of signed
9496 scaled 7-bit and signed 9-bit offset. */
9497 if (mode == TImode || mode == TFmode)
9498 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9499 else if ((const_offset & (size - 1)) != 0)
9500 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9501 else
9502 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9503
9504 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9505 return false;
9506
9507 /* Split the offset into second_offset and the rest. */
9508 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9509 *offset2 = gen_int_mode (second_offset, Pmode);
9510 return true;
9511 }
9512 else
9513 {
9514 /* Get the mode we should use as the basis of the range. For structure
9515 modes this is the mode of one vector. */
9516 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9517 machine_mode step_mode
9518 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9519
9520 /* Get the "mul vl" multiplier we'd like to use. */
9521 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9522 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9523 if (vec_flags & VEC_SVE_DATA)
9524 /* LDR supports a 9-bit range, but the move patterns for
9525 structure modes require all vectors to be in range of the
9526 	   same base.  The simplest way of accommodating that while still
9527 promoting reuse of anchor points between different modes is
9528 to use an 8-bit range unconditionally. */
9529 vnum = ((vnum + 128) & 255) - 128;
9530 else
9531 /* Predicates are only handled singly, so we might as well use
9532 the full range. */
9533 vnum = ((vnum + 256) & 511) - 256;
9534 if (vnum == 0)
9535 return false;
9536
9537 /* Convert the "mul vl" multiplier into a byte offset. */
9538 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9539 if (known_eq (second_offset, orig_offset))
9540 return false;
9541
9542 /* Split the offset into second_offset and the rest. */
9543 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9544 *offset2 = gen_int_mode (second_offset, Pmode);
9545 return true;
9546 }
9547 }
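
/* As a concrete non-SVE example, a DImode access at constant offset 0x10010
   is out of range for a single LDR/STR, so the offset is split into an anchor
   part of 0x10000 (*OFFSET1) and an in-range scaled part of 0x10
   (*OFFSET2).  */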
9548
9549 /* Return the binary representation of floating point constant VALUE in INTVAL.
9550 If the value cannot be converted, return false without setting INTVAL.
9551    The conversion is done in the mode of VALUE.  */
9552 bool
9553 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9554 {
9555
9556 /* We make a general exception for 0. */
9557 if (aarch64_float_const_zero_rtx_p (value))
9558 {
9559 *intval = 0;
9560 return true;
9561 }
9562
9563 scalar_float_mode mode;
9564 if (!CONST_DOUBLE_P (value)
9565 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9566 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9567 /* Only support up to DF mode. */
9568 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9569 return false;
9570
9571 unsigned HOST_WIDE_INT ival = 0;
9572
9573 long res[2];
9574 real_to_target (res,
9575 CONST_DOUBLE_REAL_VALUE (value),
9576 REAL_MODE_FORMAT (mode));
9577
9578 if (mode == DFmode)
9579 {
9580 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9581 ival = zext_hwi (res[order], 32);
9582 ival |= (zext_hwi (res[1 - order], 32) << 32);
9583 }
9584 else
9585 ival = zext_hwi (res[0], 32);
9586
9587 *intval = ival;
9588 return true;
9589 }
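
/* For example, the DFmode constant 1.0 yields 0x3ff0000000000000 and the
   SFmode constant 2.0 yields 0x40000000.  */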
9590
9591 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9592 single MOV(+MOVK) followed by an FMOV. */
9593 bool
9594 aarch64_float_const_rtx_p (rtx x)
9595 {
9596 machine_mode mode = GET_MODE (x);
9597 if (mode == VOIDmode)
9598 return false;
9599
9600 /* Determine whether it's cheaper to write float constants as
9601 mov/movk pairs over ldr/adrp pairs. */
9602 unsigned HOST_WIDE_INT ival;
9603
9604 if (CONST_DOUBLE_P (x)
9605 && SCALAR_FLOAT_MODE_P (mode)
9606 && aarch64_reinterpret_float_as_int (x, &ival))
9607 {
9608 scalar_int_mode imode = (mode == HFmode
9609 ? SImode
9610 : int_mode_for_mode (mode).require ());
9611 int num_instr = aarch64_internal_mov_immediate
9612 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9613 return num_instr < 3;
9614 }
9615
9616 return false;
9617 }
9618
9619 /* Return TRUE if rtx X is the immediate constant 0.0.  */
9620 bool
9621 aarch64_float_const_zero_rtx_p (rtx x)
9622 {
9623 if (GET_MODE (x) == VOIDmode)
9624 return false;
9625
9626 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9627 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9628 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9629 }
9630
9631 /* Return TRUE if rtx X is an immediate constant that fits in a single
9632 MOVI immediate operation. */
9633 bool
9634 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9635 {
9636 if (!TARGET_SIMD)
9637 return false;
9638
9639 machine_mode vmode;
9640 scalar_int_mode imode;
9641 unsigned HOST_WIDE_INT ival;
9642
9643 if (CONST_DOUBLE_P (x)
9644 && SCALAR_FLOAT_MODE_P (mode))
9645 {
9646 if (!aarch64_reinterpret_float_as_int (x, &ival))
9647 return false;
9648
9649 /* We make a general exception for 0. */
9650 if (aarch64_float_const_zero_rtx_p (x))
9651 return true;
9652
9653 imode = int_mode_for_mode (mode).require ();
9654 }
9655 else if (CONST_INT_P (x)
9656 && is_a <scalar_int_mode> (mode, &imode))
9657 ival = INTVAL (x);
9658 else
9659 return false;
9660
9661   /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
9662      use a 128-bit vector mode.  */
9663 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9664
9665 vmode = aarch64_simd_container_mode (imode, width);
9666 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9667
9668 return aarch64_simd_valid_immediate (v_op, NULL);
9669 }
9670
9671
9672 /* Return the fixed registers used for condition codes. */
9673
9674 static bool
9675 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9676 {
9677 *p1 = CC_REGNUM;
9678 *p2 = INVALID_REGNUM;
9679 return true;
9680 }
9681
9682 /* This function is used by the call expanders of the machine description.
9683 RESULT is the register in which the result is returned. It's NULL for
9684 "call" and "sibcall".
9685 MEM is the location of the function call.
9686 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9687    SIBCALL indicates whether this function call is a normal call or a sibling
9688    call; a different call pattern is generated accordingly.  */
9689
9690 void
9691 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9692 {
9693 rtx call, callee, tmp;
9694 rtvec vec;
9695 machine_mode mode;
9696
9697 gcc_assert (MEM_P (mem));
9698 callee = XEXP (mem, 0);
9699 mode = GET_MODE (callee);
9700 gcc_assert (mode == Pmode);
9701
9702 /* Decide if we should generate indirect calls by loading the
9703 address of the callee into a register before performing
9704 the branch-and-link. */
9705 if (SYMBOL_REF_P (callee)
9706 ? (aarch64_is_long_call_p (callee)
9707 || aarch64_is_noplt_call_p (callee))
9708 : !REG_P (callee))
9709 XEXP (mem, 0) = force_reg (mode, callee);
9710
9711 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9712
9713 if (result != NULL_RTX)
9714 call = gen_rtx_SET (result, call);
9715
9716 if (sibcall)
9717 tmp = ret_rtx;
9718 else
9719 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9720
9721 gcc_assert (CONST_INT_P (callee_abi));
9722 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9723 UNSPEC_CALLEE_ABI);
9724
9725 vec = gen_rtvec (3, call, callee_abi, tmp);
9726 call = gen_rtx_PARALLEL (VOIDmode, vec);
9727
9728 aarch64_emit_call_insn (call);
9729 }
9730
9731 /* Emit call insn with PAT and do aarch64-specific handling. */
9732
9733 void
9734 aarch64_emit_call_insn (rtx pat)
9735 {
9736 rtx insn = emit_call_insn (pat);
9737
9738 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9739 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9740 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9741 }
9742
9743 machine_mode
9744 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9745 {
9746 machine_mode mode_x = GET_MODE (x);
9747 rtx_code code_x = GET_CODE (x);
9748
9749 /* All floating point compares return CCFP if it is an equality
9750 comparison, and CCFPE otherwise. */
9751 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9752 {
9753 switch (code)
9754 {
9755 case EQ:
9756 case NE:
9757 case UNORDERED:
9758 case ORDERED:
9759 case UNLT:
9760 case UNLE:
9761 case UNGT:
9762 case UNGE:
9763 case UNEQ:
9764 return CCFPmode;
9765
9766 case LT:
9767 case LE:
9768 case GT:
9769 case GE:
9770 case LTGT:
9771 return CCFPEmode;
9772
9773 default:
9774 gcc_unreachable ();
9775 }
9776 }
9777
9778 /* Equality comparisons of short modes against zero can be performed
9779 using the TST instruction with the appropriate bitmask. */
9780 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9781 && (code == EQ || code == NE)
9782 && (mode_x == HImode || mode_x == QImode))
9783 return CC_NZmode;
9784
9785 /* Similarly, comparisons of zero_extends from shorter modes can
9786 be performed using an ANDS with an immediate mask. */
9787 if (y == const0_rtx && code_x == ZERO_EXTEND
9788 && (mode_x == SImode || mode_x == DImode)
9789 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9790 && (code == EQ || code == NE))
9791 return CC_NZmode;
9792
9793 if ((mode_x == SImode || mode_x == DImode)
9794 && y == const0_rtx
9795 && (code == EQ || code == NE || code == LT || code == GE)
9796 && (code_x == PLUS || code_x == MINUS || code_x == AND
9797 || code_x == NEG
9798 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9799 && CONST_INT_P (XEXP (x, 2)))))
9800 return CC_NZmode;
9801
9802 /* A compare with a shifted operand. Because of canonicalization,
9803 the comparison will have to be swapped when we emit the assembly
9804 code. */
9805 if ((mode_x == SImode || mode_x == DImode)
9806 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
9807 && (code_x == ASHIFT || code_x == ASHIFTRT
9808 || code_x == LSHIFTRT
9809 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9810 return CC_SWPmode;
9811
9812 /* Similarly for a negated operand, but we can only do this for
9813 equalities. */
9814 if ((mode_x == SImode || mode_x == DImode)
9815 && (REG_P (y) || SUBREG_P (y))
9816 && (code == EQ || code == NE)
9817 && code_x == NEG)
9818 return CC_Zmode;
9819
9820 /* A test for unsigned overflow from an addition. */
9821 if ((mode_x == DImode || mode_x == TImode)
9822 && (code == LTU || code == GEU)
9823 && code_x == PLUS
9824 && rtx_equal_p (XEXP (x, 0), y))
9825 return CC_Cmode;
9826
9827 /* A test for unsigned overflow from an add with carry. */
9828 if ((mode_x == DImode || mode_x == TImode)
9829 && (code == LTU || code == GEU)
9830 && code_x == PLUS
9831 && CONST_SCALAR_INT_P (y)
9832 && (rtx_mode_t (y, mode_x)
9833 == (wi::shwi (1, mode_x)
9834 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9835 return CC_ADCmode;
9836
9837 /* A test for signed overflow. */
9838 if ((mode_x == DImode || mode_x == TImode)
9839 && code == NE
9840 && code_x == PLUS
9841 && GET_CODE (y) == SIGN_EXTEND)
9842 return CC_Vmode;
9843
9844 /* For everything else, return CCmode. */
9845 return CCmode;
9846 }
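
/* For example, comparing (plus:DI a b) against const0_rtx with code NE
   selects CC_NZmode, so the addition itself can set the flags with an ADDS
   instruction; floating-point equality comparisons select CCFPmode while
   ordering comparisons such as LT select CCFPEmode.  */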
9847
9848 static int
9849 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9850
9851 int
9852 aarch64_get_condition_code (rtx x)
9853 {
9854 machine_mode mode = GET_MODE (XEXP (x, 0));
9855 enum rtx_code comp_code = GET_CODE (x);
9856
9857 if (GET_MODE_CLASS (mode) != MODE_CC)
9858 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9859 return aarch64_get_condition_code_1 (mode, comp_code);
9860 }
9861
9862 static int
9863 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9864 {
9865 switch (mode)
9866 {
9867 case E_CCFPmode:
9868 case E_CCFPEmode:
9869 switch (comp_code)
9870 {
9871 case GE: return AARCH64_GE;
9872 case GT: return AARCH64_GT;
9873 case LE: return AARCH64_LS;
9874 case LT: return AARCH64_MI;
9875 case NE: return AARCH64_NE;
9876 case EQ: return AARCH64_EQ;
9877 case ORDERED: return AARCH64_VC;
9878 case UNORDERED: return AARCH64_VS;
9879 case UNLT: return AARCH64_LT;
9880 case UNLE: return AARCH64_LE;
9881 case UNGT: return AARCH64_HI;
9882 case UNGE: return AARCH64_PL;
9883 default: return -1;
9884 }
9885 break;
9886
9887 case E_CCmode:
9888 switch (comp_code)
9889 {
9890 case NE: return AARCH64_NE;
9891 case EQ: return AARCH64_EQ;
9892 case GE: return AARCH64_GE;
9893 case GT: return AARCH64_GT;
9894 case LE: return AARCH64_LE;
9895 case LT: return AARCH64_LT;
9896 case GEU: return AARCH64_CS;
9897 case GTU: return AARCH64_HI;
9898 case LEU: return AARCH64_LS;
9899 case LTU: return AARCH64_CC;
9900 default: return -1;
9901 }
9902 break;
9903
9904 case E_CC_SWPmode:
9905 switch (comp_code)
9906 {
9907 case NE: return AARCH64_NE;
9908 case EQ: return AARCH64_EQ;
9909 case GE: return AARCH64_LE;
9910 case GT: return AARCH64_LT;
9911 case LE: return AARCH64_GE;
9912 case LT: return AARCH64_GT;
9913 case GEU: return AARCH64_LS;
9914 case GTU: return AARCH64_CC;
9915 case LEU: return AARCH64_CS;
9916 case LTU: return AARCH64_HI;
9917 default: return -1;
9918 }
9919 break;
9920
9921 case E_CC_NZCmode:
9922 switch (comp_code)
9923 {
9924 case NE: return AARCH64_NE; /* = any */
9925 case EQ: return AARCH64_EQ; /* = none */
9926 case GE: return AARCH64_PL; /* = nfrst */
9927 case LT: return AARCH64_MI; /* = first */
9928 case GEU: return AARCH64_CS; /* = nlast */
9929 case GTU: return AARCH64_HI; /* = pmore */
9930 case LEU: return AARCH64_LS; /* = plast */
9931 case LTU: return AARCH64_CC; /* = last */
9932 default: return -1;
9933 }
9934 break;
9935
9936 case E_CC_NZmode:
9937 switch (comp_code)
9938 {
9939 case NE: return AARCH64_NE;
9940 case EQ: return AARCH64_EQ;
9941 case GE: return AARCH64_PL;
9942 case LT: return AARCH64_MI;
9943 default: return -1;
9944 }
9945 break;
9946
9947 case E_CC_Zmode:
9948 switch (comp_code)
9949 {
9950 case NE: return AARCH64_NE;
9951 case EQ: return AARCH64_EQ;
9952 default: return -1;
9953 }
9954 break;
9955
9956 case E_CC_Cmode:
9957 switch (comp_code)
9958 {
9959 case LTU: return AARCH64_CS;
9960 case GEU: return AARCH64_CC;
9961 default: return -1;
9962 }
9963 break;
9964
9965 case E_CC_ADCmode:
9966 switch (comp_code)
9967 {
9968 case GEU: return AARCH64_CS;
9969 case LTU: return AARCH64_CC;
9970 default: return -1;
9971 }
9972 break;
9973
9974 case E_CC_Vmode:
9975 switch (comp_code)
9976 {
9977 case NE: return AARCH64_VS;
9978 case EQ: return AARCH64_VC;
9979 default: return -1;
9980 }
9981 break;
9982
9983 default:
9984 return -1;
9985 }
9986
9987 return -1;
9988 }
9989
9990 bool
9991 aarch64_const_vec_all_same_in_range_p (rtx x,
9992 HOST_WIDE_INT minval,
9993 HOST_WIDE_INT maxval)
9994 {
9995 rtx elt;
9996 return (const_vec_duplicate_p (x, &elt)
9997 && CONST_INT_P (elt)
9998 && IN_RANGE (INTVAL (elt), minval, maxval));
9999 }
10000
10001 bool
10002 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10003 {
10004 return aarch64_const_vec_all_same_in_range_p (x, val, val);
10005 }
10006
10007 /* Return true if VEC is a constant in which every element is in the range
10008 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
10009
10010 static bool
10011 aarch64_const_vec_all_in_range_p (rtx vec,
10012 HOST_WIDE_INT minval,
10013 HOST_WIDE_INT maxval)
10014 {
10015 if (GET_CODE (vec) != CONST_VECTOR
10016 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10017 return false;
10018
10019 int nunits;
10020 if (!CONST_VECTOR_STEPPED_P (vec))
10021 nunits = const_vector_encoded_nelts (vec);
10022 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10023 return false;
10024
10025 for (int i = 0; i < nunits; i++)
10026 {
10027 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10028 if (!CONST_INT_P (vec_elem)
10029 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10030 return false;
10031 }
10032 return true;
10033 }
10034
10035 /* N Z C V. */
10036 #define AARCH64_CC_V 1
10037 #define AARCH64_CC_C (1 << 1)
10038 #define AARCH64_CC_Z (1 << 2)
10039 #define AARCH64_CC_N (1 << 3)
10040
10041 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
10042 static const int aarch64_nzcv_codes[] =
10043 {
10044 0, /* EQ, Z == 1. */
10045 AARCH64_CC_Z, /* NE, Z == 0. */
10046 0, /* CS, C == 1. */
10047 AARCH64_CC_C, /* CC, C == 0. */
10048 0, /* MI, N == 1. */
10049 AARCH64_CC_N, /* PL, N == 0. */
10050 0, /* VS, V == 1. */
10051 AARCH64_CC_V, /* VC, V == 0. */
10052   0,		/* HI, C == 1 && Z == 0.  */
10053 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
10054 AARCH64_CC_V, /* GE, N == V. */
10055 0, /* LT, N != V. */
10056 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
10057 0, /* LE, !(Z == 0 && N == V). */
10058 0, /* AL, Any. */
10059 0 /* NV, Any. */
10060 };
10061
10062 /* Print floating-point vector immediate operand X to F, negating it
10063 first if NEGATE is true. Return true on success, false if it isn't
10064 a constant we can handle. */
10065
10066 static bool
10067 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10068 {
10069 rtx elt;
10070
10071 if (!const_vec_duplicate_p (x, &elt))
10072 return false;
10073
10074 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10075 if (negate)
10076 r = real_value_negate (&r);
10077
10078 /* Handle the SVE single-bit immediates specially, since they have a
10079 fixed form in the assembly syntax. */
10080 if (real_equal (&r, &dconst0))
10081 asm_fprintf (f, "0.0");
10082 else if (real_equal (&r, &dconst2))
10083 asm_fprintf (f, "2.0");
10084 else if (real_equal (&r, &dconst1))
10085 asm_fprintf (f, "1.0");
10086 else if (real_equal (&r, &dconsthalf))
10087 asm_fprintf (f, "0.5");
10088 else
10089 {
10090 const int buf_size = 20;
10091 char float_buf[buf_size] = {'\0'};
10092 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10093 1, GET_MODE (elt));
10094 asm_fprintf (f, "%s", float_buf);
10095 }
10096
10097 return true;
10098 }
10099
10100 /* Return the equivalent letter for size. */
10101 static char
10102 sizetochar (int size)
10103 {
10104 switch (size)
10105 {
10106 case 64: return 'd';
10107 case 32: return 's';
10108 case 16: return 'h';
10109 case 8 : return 'b';
10110 default: gcc_unreachable ();
10111 }
10112 }
10113
10114 /* Print operand X to file F in a target specific manner according to CODE.
10115 The acceptable formatting commands given by CODE are:
10116 'c': An integer or symbol address without a preceding #
10117 sign.
10118 'C': Take the duplicated element in a vector constant
10119 and print it in hex.
10120 'D': Take the duplicated element in a vector constant
10121 and print it as an unsigned integer, in decimal.
10122 'e': Print the sign/zero-extend size as a character 8->b,
10123 16->h, 32->w. Can also be used for masks:
10124 0xff->b, 0xffff->h, 0xffffffff->w.
10125 'I': If the operand is a duplicated vector constant,
10126 replace it with the duplicated scalar. If the
10127 operand is then a floating-point constant, replace
10128 it with the integer bit representation. Print the
10129 transformed constant as a signed decimal number.
10130 'p': Prints N such that 2^N == X (X must be power of 2 and
10131 const int).
10132 'P': Print the number of non-zero bits in X (a const_int).
10133 'H': Print the higher numbered register of a pair (TImode)
10134 of regs.
10135 'm': Print a condition (eq, ne, etc).
10136 'M': Same as 'm', but invert condition.
10137 'N': Take the duplicated element in a vector constant
10138 and print the negative of it in decimal.
10139 'b/h/s/d/q': Print a scalar FP/SIMD register name.
10140 'S/T/U/V': Print a FP/SIMD register name for a register list.
10141 The register printed is the FP/SIMD register name
10142 of X + 0/1/2/3 for S/T/U/V.
10143 'R': Print a scalar Integer/FP/SIMD register name + 1.
10144 'X': Print bottom 16 bits of integer constant in hex.
10145 'w/x': Print a general register name or the zero register
10146 (32-bit or 64-bit).
10147      '0':		Print a normal operand; if it's a general register,
10148 then we assume DImode.
10149 'k': Print NZCV for conditional compare instructions.
10150 'A': Output address constant representing the first
10151 argument of X, specifying a relocation offset
10152 if appropriate.
10153 'L': Output constant address specified by X
10154 with a relocation offset if appropriate.
10155 'G': Prints address of X, specifying a PC relative
10156 relocation mode if appropriate.
10157 'y': Output address of LDP or STP - this is used for
10158 some LDP/STPs which don't use a PARALLEL in their
10159 pattern (so the mode needs to be adjusted).
10160 'z': Output address of a typical LDP or STP. */
10161
10162 static void
10163 aarch64_print_operand (FILE *f, rtx x, int code)
10164 {
10165 rtx elt;
10166 switch (code)
10167 {
10168 case 'c':
10169 if (CONST_INT_P (x))
10170 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10171 else
10172 {
10173 poly_int64 offset;
10174 rtx base = strip_offset_and_salt (x, &offset);
10175 if (SYMBOL_REF_P (base))
10176 output_addr_const (f, x);
10177 else
10178 output_operand_lossage ("unsupported operand for code '%c'", code);
10179 }
10180 break;
10181
10182 case 'e':
10183 {
10184 x = unwrap_const_vec_duplicate (x);
10185 if (!CONST_INT_P (x))
10186 {
10187 output_operand_lossage ("invalid operand for '%%%c'", code);
10188 return;
10189 }
10190
10191 HOST_WIDE_INT val = INTVAL (x);
10192 if ((val & ~7) == 8 || val == 0xff)
10193 fputc ('b', f);
10194 else if ((val & ~7) == 16 || val == 0xffff)
10195 fputc ('h', f);
10196 else if ((val & ~7) == 32 || val == 0xffffffff)
10197 fputc ('w', f);
10198 else
10199 {
10200 output_operand_lossage ("invalid operand for '%%%c'", code);
10201 return;
10202 }
10203 }
10204 break;
10205
10206 case 'p':
10207 {
10208 int n;
10209
10210 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10211 {
10212 output_operand_lossage ("invalid operand for '%%%c'", code);
10213 return;
10214 }
10215
10216 asm_fprintf (f, "%d", n);
10217 }
10218 break;
10219
10220 case 'P':
10221 if (!CONST_INT_P (x))
10222 {
10223 output_operand_lossage ("invalid operand for '%%%c'", code);
10224 return;
10225 }
10226
10227 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10228 break;
10229
10230 case 'H':
10231 if (x == const0_rtx)
10232 {
10233 asm_fprintf (f, "xzr");
10234 break;
10235 }
10236
10237 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10238 {
10239 output_operand_lossage ("invalid operand for '%%%c'", code);
10240 return;
10241 }
10242
10243 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10244 break;
10245
10246 case 'I':
10247 {
10248 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10249 if (CONST_INT_P (x))
10250 asm_fprintf (f, "%wd", INTVAL (x));
10251 else
10252 {
10253 output_operand_lossage ("invalid operand for '%%%c'", code);
10254 return;
10255 }
10256 break;
10257 }
10258
10259 case 'M':
10260 case 'm':
10261 {
10262 int cond_code;
10263 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10264 if (x == const_true_rtx)
10265 {
10266 if (code == 'M')
10267 fputs ("nv", f);
10268 return;
10269 }
10270
10271 if (!COMPARISON_P (x))
10272 {
10273 output_operand_lossage ("invalid operand for '%%%c'", code);
10274 return;
10275 }
10276
10277 cond_code = aarch64_get_condition_code (x);
10278 gcc_assert (cond_code >= 0);
10279 if (code == 'M')
10280 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10281 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10282 fputs (aarch64_sve_condition_codes[cond_code], f);
10283 else
10284 fputs (aarch64_condition_codes[cond_code], f);
10285 }
10286 break;
10287
10288 case 'N':
10289 if (!const_vec_duplicate_p (x, &elt))
10290 {
10291 output_operand_lossage ("invalid vector constant");
10292 return;
10293 }
10294
10295 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10296 asm_fprintf (f, "%wd", -INTVAL (elt));
10297 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10298 && aarch64_print_vector_float_operand (f, x, true))
10299 ;
10300 else
10301 {
10302 output_operand_lossage ("invalid vector constant");
10303 return;
10304 }
10305 break;
10306
10307 case 'b':
10308 case 'h':
10309 case 's':
10310 case 'd':
10311 case 'q':
10312 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10313 {
10314 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10315 return;
10316 }
10317 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10318 break;
10319
10320 case 'S':
10321 case 'T':
10322 case 'U':
10323 case 'V':
10324 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10325 {
10326 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10327 return;
10328 }
10329 asm_fprintf (f, "%c%d",
10330 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10331 REGNO (x) - V0_REGNUM + (code - 'S'));
10332 break;
10333
10334 case 'R':
10335 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10336 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10337 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10338 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10339 else
10340 output_operand_lossage ("incompatible register operand for '%%%c'",
10341 code);
10342 break;
10343
10344 case 'X':
10345 if (!CONST_INT_P (x))
10346 {
10347 output_operand_lossage ("invalid operand for '%%%c'", code);
10348 return;
10349 }
10350 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10351 break;
10352
10353 case 'C':
10354 {
10355 /* Print a replicated constant in hex. */
10356 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10357 {
10358 output_operand_lossage ("invalid operand for '%%%c'", code);
10359 return;
10360 }
10361 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10362 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10363 }
10364 break;
10365
10366 case 'D':
10367 {
10368 /* Print a replicated constant in decimal, treating it as
10369 unsigned. */
10370 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10371 {
10372 output_operand_lossage ("invalid operand for '%%%c'", code);
10373 return;
10374 }
10375 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10376 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10377 }
10378 break;
10379
10380 case 'w':
10381 case 'x':
10382 if (x == const0_rtx
10383 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10384 {
10385 asm_fprintf (f, "%czr", code);
10386 break;
10387 }
10388
10389 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10390 {
10391 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10392 break;
10393 }
10394
10395 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10396 {
10397 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10398 break;
10399 }
10400
10401 /* Fall through */
10402
10403 case 0:
10404 if (x == NULL)
10405 {
10406 output_operand_lossage ("missing operand");
10407 return;
10408 }
10409
10410 switch (GET_CODE (x))
10411 {
10412 case REG:
10413 if (aarch64_sve_data_mode_p (GET_MODE (x)))
10414 {
10415 if (REG_NREGS (x) == 1)
10416 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10417 else
10418 {
10419 char suffix
10420 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10421 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10422 REGNO (x) - V0_REGNUM, suffix,
10423 END_REGNO (x) - V0_REGNUM - 1, suffix);
10424 }
10425 }
10426 else
10427 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10428 break;
10429
10430 case MEM:
10431 output_address (GET_MODE (x), XEXP (x, 0));
10432 break;
10433
10434 case LABEL_REF:
10435 case SYMBOL_REF:
10436 output_addr_const (asm_out_file, x);
10437 break;
10438
10439 case CONST_INT:
10440 asm_fprintf (f, "%wd", INTVAL (x));
10441 break;
10442
10443 case CONST:
10444 if (!VECTOR_MODE_P (GET_MODE (x)))
10445 {
10446 output_addr_const (asm_out_file, x);
10447 break;
10448 }
10449 /* fall through */
10450
10451 case CONST_VECTOR:
10452 if (!const_vec_duplicate_p (x, &elt))
10453 {
10454 output_operand_lossage ("invalid vector constant");
10455 return;
10456 }
10457
10458 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10459 asm_fprintf (f, "%wd", INTVAL (elt));
10460 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10461 && aarch64_print_vector_float_operand (f, x, false))
10462 ;
10463 else
10464 {
10465 output_operand_lossage ("invalid vector constant");
10466 return;
10467 }
10468 break;
10469
10470 case CONST_DOUBLE:
10471 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10472 be getting CONST_DOUBLEs holding integers. */
10473 gcc_assert (GET_MODE (x) != VOIDmode);
10474 if (aarch64_float_const_zero_rtx_p (x))
10475 {
10476 fputc ('0', f);
10477 break;
10478 }
10479 else if (aarch64_float_const_representable_p (x))
10480 {
10481 #define buf_size 20
10482 char float_buf[buf_size] = {'\0'};
10483 real_to_decimal_for_mode (float_buf,
10484 CONST_DOUBLE_REAL_VALUE (x),
10485 buf_size, buf_size,
10486 1, GET_MODE (x));
10487 asm_fprintf (asm_out_file, "%s", float_buf);
10488 break;
10489 #undef buf_size
10490 }
10491 output_operand_lossage ("invalid constant");
10492 return;
10493 default:
10494 output_operand_lossage ("invalid operand");
10495 return;
10496 }
10497 break;
10498
10499 case 'A':
10500 if (GET_CODE (x) == HIGH)
10501 x = XEXP (x, 0);
10502
10503 switch (aarch64_classify_symbolic_expression (x))
10504 {
10505 case SYMBOL_SMALL_GOT_4G:
10506 asm_fprintf (asm_out_file, ":got:");
10507 break;
10508
10509 case SYMBOL_SMALL_TLSGD:
10510 asm_fprintf (asm_out_file, ":tlsgd:");
10511 break;
10512
10513 case SYMBOL_SMALL_TLSDESC:
10514 asm_fprintf (asm_out_file, ":tlsdesc:");
10515 break;
10516
10517 case SYMBOL_SMALL_TLSIE:
10518 asm_fprintf (asm_out_file, ":gottprel:");
10519 break;
10520
10521 case SYMBOL_TLSLE24:
10522 asm_fprintf (asm_out_file, ":tprel:");
10523 break;
10524
10525 case SYMBOL_TINY_GOT:
10526 gcc_unreachable ();
10527 break;
10528
10529 default:
10530 break;
10531 }
10532 output_addr_const (asm_out_file, x);
10533 break;
10534
10535 case 'L':
10536 switch (aarch64_classify_symbolic_expression (x))
10537 {
10538 case SYMBOL_SMALL_GOT_4G:
10539 asm_fprintf (asm_out_file, ":lo12:");
10540 break;
10541
10542 case SYMBOL_SMALL_TLSGD:
10543 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10544 break;
10545
10546 case SYMBOL_SMALL_TLSDESC:
10547 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10548 break;
10549
10550 case SYMBOL_SMALL_TLSIE:
10551 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10552 break;
10553
10554 case SYMBOL_TLSLE12:
10555 asm_fprintf (asm_out_file, ":tprel_lo12:");
10556 break;
10557
10558 case SYMBOL_TLSLE24:
10559 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10560 break;
10561
10562 case SYMBOL_TINY_GOT:
10563 asm_fprintf (asm_out_file, ":got:");
10564 break;
10565
10566 case SYMBOL_TINY_TLSIE:
10567 asm_fprintf (asm_out_file, ":gottprel:");
10568 break;
10569
10570 default:
10571 break;
10572 }
10573 output_addr_const (asm_out_file, x);
10574 break;
10575
10576 case 'G':
10577 switch (aarch64_classify_symbolic_expression (x))
10578 {
10579 case SYMBOL_TLSLE24:
10580 asm_fprintf (asm_out_file, ":tprel_hi12:");
10581 break;
10582 default:
10583 break;
10584 }
10585 output_addr_const (asm_out_file, x);
10586 break;
10587
10588 case 'k':
10589 {
10590 HOST_WIDE_INT cond_code;
10591
10592 if (!CONST_INT_P (x))
10593 {
10594 output_operand_lossage ("invalid operand for '%%%c'", code);
10595 return;
10596 }
10597
10598 cond_code = INTVAL (x);
10599 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10600 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10601 }
10602 break;
10603
10604 case 'y':
10605 case 'z':
10606 {
10607 machine_mode mode = GET_MODE (x);
10608
10609 if (!MEM_P (x)
10610 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10611 {
10612 output_operand_lossage ("invalid operand for '%%%c'", code);
10613 return;
10614 }
10615
10616 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10617 code == 'y'
10618 ? ADDR_QUERY_LDP_STP_N
10619 : ADDR_QUERY_LDP_STP))
10620 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10621 }
10622 break;
10623
10624 default:
10625 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10626 return;
10627 }
10628 }
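
/* Illustrative example (editorial sketch, not taken from aarch64.md):
   the modifiers documented above are referenced from output templates,
   so a hypothetical pattern emitting a 32-bit add could use the
   template

	"add\t%w0, %w1, %w2"

   where each %w operand goes through the 'w' case above and prints a
   W register, wzr or wsp as appropriate.  */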
10629
10630 /* Print address 'x' of a memory access with mode 'mode'.
10631 'type' is the aarch64_addr_query_type context required by
10632 aarch64_classify_address; it distinguishes normal accesses from LDP/STP. */
10633 static bool
10634 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10635 aarch64_addr_query_type type)
10636 {
10637 struct aarch64_address_info addr;
10638 unsigned int size, vec_flags;
10639
10640 /* Check all addresses are Pmode - including ILP32. */
10641 if (GET_MODE (x) != Pmode
10642 && (!CONST_INT_P (x)
10643 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10644 {
10645 output_operand_lossage ("invalid address mode");
10646 return false;
10647 }
10648
10649 if (aarch64_classify_address (&addr, x, mode, true, type))
10650 switch (addr.type)
10651 {
10652 case ADDRESS_REG_IMM:
10653 if (known_eq (addr.const_offset, 0))
10654 {
10655 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10656 return true;
10657 }
10658
10659 vec_flags = aarch64_classify_vector_mode (mode);
10660 if (vec_flags & VEC_ANY_SVE)
10661 {
10662 HOST_WIDE_INT vnum
10663 = exact_div (addr.const_offset,
10664 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10665 asm_fprintf (f, "[%s, #%wd, mul vl]",
10666 reg_names[REGNO (addr.base)], vnum);
10667 return true;
10668 }
10669
10670 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10671 INTVAL (addr.offset));
10672 return true;
10673
10674 case ADDRESS_REG_REG:
10675 if (addr.shift == 0)
10676 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10677 reg_names [REGNO (addr.offset)]);
10678 else
10679 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10680 reg_names [REGNO (addr.offset)], addr.shift);
10681 return true;
10682
10683 case ADDRESS_REG_UXTW:
10684 if (addr.shift == 0)
10685 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10686 REGNO (addr.offset) - R0_REGNUM);
10687 else
10688 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10689 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10690 return true;
10691
10692 case ADDRESS_REG_SXTW:
10693 if (addr.shift == 0)
10694 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10695 REGNO (addr.offset) - R0_REGNUM);
10696 else
10697 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10698 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10699 return true;
10700
10701 case ADDRESS_REG_WB:
10702 /* Writeback is only supported for fixed-width modes. */
10703 size = GET_MODE_SIZE (mode).to_constant ();
10704 switch (GET_CODE (x))
10705 {
10706 case PRE_INC:
10707 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10708 return true;
10709 case POST_INC:
10710 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10711 return true;
10712 case PRE_DEC:
10713 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10714 return true;
10715 case POST_DEC:
10716 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10717 return true;
10718 case PRE_MODIFY:
10719 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10720 INTVAL (addr.offset));
10721 return true;
10722 case POST_MODIFY:
10723 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10724 INTVAL (addr.offset));
10725 return true;
10726 default:
10727 break;
10728 }
10729 break;
10730
10731 case ADDRESS_LO_SUM:
10732 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10733 output_addr_const (f, addr.offset);
10734 asm_fprintf (f, "]");
10735 return true;
10736
10737 case ADDRESS_SYMBOLIC:
10738 output_addr_const (f, x);
10739 return true;
10740 }
10741
10742 return false;
10743 }
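
/* Illustrative examples (editorial sketch) of the syntax produced by
   the cases above, with hypothetical registers and offsets:

	ADDRESS_REG_IMM			[x0]  or  [x0, 16]
	ADDRESS_REG_IMM (SVE)		[x0, #2, mul vl]
	ADDRESS_REG_REG			[x0, x1, lsl 3]
	ADDRESS_REG_UXTW		[x0, w1, uxtw 2]
	ADDRESS_REG_WB (POST_INC)	[x0], 16
	ADDRESS_LO_SUM			[x0, #:lo12:symbol]  */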
10744
10745 /* Print address 'x' of a memory access with mode 'mode'. */
10746 static void
10747 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10748 {
10749 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10750 output_addr_const (f, x);
10751 }
10752
10753 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
10754
10755 static bool
10756 aarch64_output_addr_const_extra (FILE *file, rtx x)
10757 {
10758 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
10759 {
10760 output_addr_const (file, XVECEXP (x, 0, 0));
10761 return true;
10762 }
10763 return false;
10764 }
10765
10766 bool
10767 aarch64_label_mentioned_p (rtx x)
10768 {
10769 const char *fmt;
10770 int i;
10771
10772 if (LABEL_REF_P (x))
10773 return true;
10774
10775 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10776 referencing instruction, but they are constant offsets, not
10777 symbols. */
10778 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10779 return false;
10780
10781 fmt = GET_RTX_FORMAT (GET_CODE (x));
10782 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10783 {
10784 if (fmt[i] == 'E')
10785 {
10786 int j;
10787
10788 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10789 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10790 return 1;
10791 }
10792 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10793 return 1;
10794 }
10795
10796 return 0;
10797 }
10798
10799 /* Implement REGNO_REG_CLASS. */
10800
10801 enum reg_class
10802 aarch64_regno_regclass (unsigned regno)
10803 {
10804 if (STUB_REGNUM_P (regno))
10805 return STUB_REGS;
10806
10807 if (GP_REGNUM_P (regno))
10808 return GENERAL_REGS;
10809
10810 if (regno == SP_REGNUM)
10811 return STACK_REG;
10812
10813 if (regno == FRAME_POINTER_REGNUM
10814 || regno == ARG_POINTER_REGNUM)
10815 return POINTER_REGS;
10816
10817 if (FP_REGNUM_P (regno))
10818 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10819 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10820
10821 if (PR_REGNUM_P (regno))
10822 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10823
10824 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10825 return FFR_REGS;
10826
10827 return NO_REGS;
10828 }
10829
10830 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10831 If OFFSET is out of range, return an offset of an anchor point
10832 that is in range. Return 0 otherwise. */
10833
10834 static HOST_WIDE_INT
10835 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10836 machine_mode mode)
10837 {
10838 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10839 if (size > 16)
10840 return (offset + 0x400) & ~0x7f0;
10841
10842 /* For offsets that aren't a multiple of the access size, the limit is
10843 -256...255. */
10844 if (offset & (size - 1))
10845 {
10846 /* BLKmode typically uses LDP of X-registers. */
10847 if (mode == BLKmode)
10848 return (offset + 512) & ~0x3ff;
10849 return (offset + 0x100) & ~0x1ff;
10850 }
10851
10852 /* Small negative offsets are supported. */
10853 if (IN_RANGE (offset, -256, 0))
10854 return 0;
10855
10856 if (mode == TImode || mode == TFmode)
10857 return (offset + 0x100) & ~0x1ff;
10858
10859 /* Use a 12-bit offset scaled by the access size. */
10860 return offset & (~0xfff * size);
10861 }
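
/* Worked example (editorial sketch): for a 4-byte access at offset
   0x13456, the offset is a multiple of the access size, is not small
   and negative, and the mode is not TImode/TFmode, so the function
   above returns 0x13456 & (~0xfff * 4) = 0x10000; the residual offset
   0x3456 then fits the scaled unsigned 12-bit LDR/STR range.  For an
   unaligned offset such as 0x205 with size 4 it instead returns
   (0x205 + 0x100) & ~0x1ff = 0x200, leaving a residual of 5 inside
   the -256..255 unscaled range.  */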
10862
10863 static rtx
10864 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10865 {
10866 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10867 where mask is selected by alignment and size of the offset.
10868 We try to pick as large a range for the offset as possible to
10869 maximize the chance of a CSE. However, for aligned addresses
10870 we limit the range to 4k so that structures with different sized
10871 elements are likely to use the same base. We need to be careful
10872 not to split a CONST for some forms of address expression, otherwise
10873 it will generate sub-optimal code. */
10874
10875 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10876 {
10877 rtx base = XEXP (x, 0);
10878 rtx offset_rtx = XEXP (x, 1);
10879 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10880
10881 if (GET_CODE (base) == PLUS)
10882 {
10883 rtx op0 = XEXP (base, 0);
10884 rtx op1 = XEXP (base, 1);
10885
10886 /* Force any scaling into a temp for CSE. */
10887 op0 = force_reg (Pmode, op0);
10888 op1 = force_reg (Pmode, op1);
10889
10890 /* Let the pointer register be in op0. */
10891 if (REG_POINTER (op1))
10892 std::swap (op0, op1);
10893
10894 /* If the pointer is virtual or frame related, then we know that
10895 virtual register instantiation or register elimination is going
10896 to apply a second constant. We want the two constants folded
10897 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10898 if (virt_or_elim_regno_p (REGNO (op0)))
10899 {
10900 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10901 NULL_RTX, true, OPTAB_DIRECT);
10902 return gen_rtx_PLUS (Pmode, base, op1);
10903 }
10904
10905 /* Otherwise, in order to encourage CSE (and thence loop strength
10906 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10907 base = expand_binop (Pmode, add_optab, op0, op1,
10908 NULL_RTX, true, OPTAB_DIRECT);
10909 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10910 }
10911
10912 HOST_WIDE_INT size;
10913 if (GET_MODE_SIZE (mode).is_constant (&size))
10914 {
10915 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10916 mode);
10917 if (base_offset != 0)
10918 {
10919 base = plus_constant (Pmode, base, base_offset);
10920 base = force_operand (base, NULL_RTX);
10921 return plus_constant (Pmode, base, offset - base_offset);
10922 }
10923 }
10924 }
10925
10926 return x;
10927 }
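
/* Illustrative example (editorial sketch): legitimizing
   (plus (reg X) (const_int 0x13456)) for an SImode access splits the
   constant at the anchor chosen by aarch64_anchor_offset, emitting

	tmp = X + 0x10000

   and returning (plus tmp 0x3456), so that nearby accesses from the
   same base register can CSE the anchor computation.  */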
10928
10929 static reg_class_t
10930 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10931 reg_class_t rclass,
10932 machine_mode mode,
10933 secondary_reload_info *sri)
10934 {
10935 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10936 LDR and STR. See the comment at the head of aarch64-sve.md for
10937 more details about the big-endian handling. */
10938 if (reg_class_subset_p (rclass, FP_REGS)
10939 && !((REG_P (x) && HARD_REGISTER_P (x))
10940 || aarch64_simd_valid_immediate (x, NULL))
10941 && mode != VNx16QImode)
10942 {
10943 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10944 if ((vec_flags & VEC_SVE_DATA)
10945 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10946 {
10947 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10948 return NO_REGS;
10949 }
10950 }
10951
10952 /* If we have to disable direct literal pool loads and stores because the
10953 function is too big, then we need a scratch register. */
10954 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
10955 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10956 || targetm.vector_mode_supported_p (GET_MODE (x)))
10957 && !aarch64_pcrelative_literal_loads)
10958 {
10959 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10960 return NO_REGS;
10961 }
10962
10963 /* Without the TARGET_SIMD instructions we cannot move a Q register
10964 to a Q register directly. We need a scratch. */
10965 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10966 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10967 && reg_class_subset_p (rclass, FP_REGS))
10968 {
10969 sri->icode = code_for_aarch64_reload_mov (mode);
10970 return NO_REGS;
10971 }
10972
10973 /* A TFmode or TImode memory access should be handled via FP_REGS,
10974 because AArch64 has richer addressing modes for LDR/STR instructions
10975 than for LDP/STP instructions. */
10976 if (TARGET_FLOAT && rclass == GENERAL_REGS
10977 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10978 return FP_REGS;
10979
10980 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10981 return GENERAL_REGS;
10982
10983 return NO_REGS;
10984 }
10985
10986 static bool
10987 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10988 {
10989 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10990
10991 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10992 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10993 if (frame_pointer_needed)
10994 return to == HARD_FRAME_POINTER_REGNUM;
10995 return true;
10996 }
10997
10998 poly_int64
10999 aarch64_initial_elimination_offset (unsigned from, unsigned to)
11000 {
11001 if (to == HARD_FRAME_POINTER_REGNUM)
11002 {
11003 if (from == ARG_POINTER_REGNUM)
11004 return cfun->machine->frame.hard_fp_offset;
11005
11006 if (from == FRAME_POINTER_REGNUM)
11007 return cfun->machine->frame.hard_fp_offset
11008 - cfun->machine->frame.locals_offset;
11009 }
11010
11011 if (to == STACK_POINTER_REGNUM)
11012 {
11013 if (from == FRAME_POINTER_REGNUM)
11014 return cfun->machine->frame.frame_size
11015 - cfun->machine->frame.locals_offset;
11016 }
11017
11018 return cfun->machine->frame.frame_size;
11019 }
11020
11021
11022 /* Get return address without mangling. */
11023
11024 rtx
11025 aarch64_return_addr_rtx (void)
11026 {
11027 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11028 /* Note: aarch64_return_address_signing_enabled only
11029 works after cfun->machine->frame.laid_out is set,
11030 so here we don't know if the return address will
11031 be signed or not. */
11032 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11033 emit_move_insn (lr, val);
11034 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11035 return lr;
11036 }
11037
11038
11039 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
11040 previous frame. */
11041
11042 rtx
11043 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11044 {
11045 if (count != 0)
11046 return const0_rtx;
11047 return aarch64_return_addr_rtx ();
11048 }
11049
11050 static void
11051 aarch64_asm_trampoline_template (FILE *f)
11052 {
11053 /* Even if the current function doesn't have branch protection, some
11054 later function might, so since this template is only generated once
11055 we have to add a BTI just in case. */
11056 asm_fprintf (f, "\thint\t34 // bti c\n");
11057
11058 if (TARGET_ILP32)
11059 {
11060 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
11061 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
11062 }
11063 else
11064 {
11065 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
11066 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
11067 }
11068 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
11069
11070 /* We always emit a speculation barrier.
11071 This is because the same trampoline template is used for every nested
11072 function. Since nested functions are not particularly common or
11073 performance-critical, we don't worry too much about the extra
11074 instructions needed to copy it around.
11075 This is not yet a problem, since we have not yet implemented
11076 function-specific attributes to choose between hardening against
11077 straight-line speculation or not, but such attributes are likely to
11078 appear in the future. */
11079 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11080
11081 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11082 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11083 }
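
/* Editorial sketch of what the template above expands to in the
   default (non-ILP32) case, where IP1 is x17 and the static chain
   register is x18:

	hint	34		// bti c
	ldr	x17, .+20	// target function address
	ldr	x18, .+24	// static chain value
	br	x17
	dsb	sy
	isb
	.xword	0		// filled in by aarch64_trampoline_init
	.xword	0

   i.e. 24 bytes of code followed by two pointer-sized data slots.  */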
11084
11085 static void
11086 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11087 {
11088 rtx fnaddr, mem, a_tramp;
11089 const int tramp_code_sz = 24;
11090
11091 /* Don't need to copy the trailing D-words, we fill those in below. */
11092 /* We create our own memory address in Pmode so that `emit_block_move` can
11093 use parts of the backend which expect Pmode addresses. */
11094 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11095 emit_block_move (gen_rtx_MEM (BLKmode, temp),
11096 assemble_trampoline_template (),
11097 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11098 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
11099 fnaddr = XEXP (DECL_RTL (fndecl), 0);
11100 if (GET_MODE (fnaddr) != ptr_mode)
11101 fnaddr = convert_memory_address (ptr_mode, fnaddr);
11102 emit_move_insn (mem, fnaddr);
11103
11104 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
11105 emit_move_insn (mem, chain_value);
11106
11107 /* XXX We should really define a "clear_cache" pattern and use
11108 gen_clear_cache(). */
11109 a_tramp = XEXP (m_tramp, 0);
11110 maybe_emit_call_builtin___clear_cache (a_tramp,
11111 plus_constant (ptr_mode,
11112 a_tramp,
11113 TRAMPOLINE_SIZE));
11114 }
11115
11116 static unsigned char
11117 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
11118 {
11119 /* ??? Logically we should only need to provide a value when
11120 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11121 can hold MODE, but at the moment we need to handle all modes.
11122 Just ignore any runtime parts for registers that can't store them. */
11123 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
11124 unsigned int nregs, vec_flags;
11125 switch (regclass)
11126 {
11127 case STUB_REGS:
11128 case TAILCALL_ADDR_REGS:
11129 case POINTER_REGS:
11130 case GENERAL_REGS:
11131 case ALL_REGS:
11132 case POINTER_AND_FP_REGS:
11133 case FP_REGS:
11134 case FP_LO_REGS:
11135 case FP_LO8_REGS:
11136 vec_flags = aarch64_classify_vector_mode (mode);
11137 if ((vec_flags & VEC_SVE_DATA)
11138 && constant_multiple_p (GET_MODE_SIZE (mode),
11139 aarch64_vl_bytes (mode, vec_flags), &nregs))
11140 return nregs;
11141 return (vec_flags & VEC_ADVSIMD
11142 ? CEIL (lowest_size, UNITS_PER_VREG)
11143 : CEIL (lowest_size, UNITS_PER_WORD));
11144 case STACK_REG:
11145 case PR_REGS:
11146 case PR_LO_REGS:
11147 case PR_HI_REGS:
11148 case FFR_REGS:
11149 case PR_AND_FFR_REGS:
11150 return 1;
11151
11152 case NO_REGS:
11153 return 0;
11154
11155 default:
11156 break;
11157 }
11158 gcc_unreachable ();
11159 }
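
/* Worked examples (editorial sketch): TImode in GENERAL_REGS needs
   CEIL (16, UNITS_PER_WORD) = 2 registers; V4SImode in FP_REGS needs
   CEIL (16, UNITS_PER_VREG) = 1; an SVE data mode such as VNx4SImode
   is an exact multiple (1) of the vector length in bytes, so the
   constant_multiple_p path above returns 1.  */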
11160
11161 static reg_class_t
11162 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
11163 {
11164 if (regclass == POINTER_REGS)
11165 return GENERAL_REGS;
11166
11167 if (regclass == STACK_REG)
11168 {
11169 if (REG_P(x)
11170 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11171 return regclass;
11172
11173 return NO_REGS;
11174 }
11175
11176 /* Register elimination can result in a request for
11177 SP+constant->FP_REGS. We cannot support such operations, which
11178 use SP as the source and an FP_REG as the destination, so reject
11179 them right away. */
11180 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11181 {
11182 rtx lhs = XEXP (x, 0);
11183
11184 /* Look through a possible SUBREG introduced by ILP32. */
11185 if (SUBREG_P (lhs))
11186 lhs = SUBREG_REG (lhs);
11187
11188 gcc_assert (REG_P (lhs));
11189 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11190 POINTER_REGS));
11191 return NO_REGS;
11192 }
11193
11194 return regclass;
11195 }
11196
11197 void
11198 aarch64_asm_output_labelref (FILE* f, const char *name)
11199 {
11200 asm_fprintf (f, "%U%s", name);
11201 }
11202
11203 static void
11204 aarch64_elf_asm_constructor (rtx symbol, int priority)
11205 {
11206 if (priority == DEFAULT_INIT_PRIORITY)
11207 default_ctor_section_asm_out_constructor (symbol, priority);
11208 else
11209 {
11210 section *s;
11211 /* While priority is known to be in the range [0, 65535], so that 18
11212 bytes would be enough, the compiler might not know that. To avoid
11213 a -Wformat-truncation false positive, use a larger size. */
11214 char buf[23];
11215 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11216 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11217 switch_to_section (s);
11218 assemble_align (POINTER_SIZE);
11219 assemble_aligned_integer (POINTER_BYTES, symbol);
11220 }
11221 }
11222
11223 static void
11224 aarch64_elf_asm_destructor (rtx symbol, int priority)
11225 {
11226 if (priority == DEFAULT_INIT_PRIORITY)
11227 default_dtor_section_asm_out_destructor (symbol, priority);
11228 else
11229 {
11230 section *s;
11231 /* While priority is known to be in the range [0, 65535], so that 18
11232 bytes would be enough, the compiler might not know that. To avoid
11233 a -Wformat-truncation false positive, use a larger size. */
11234 char buf[23];
11235 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11236 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11237 switch_to_section (s);
11238 assemble_align (POINTER_SIZE);
11239 assemble_aligned_integer (POINTER_BYTES, symbol);
11240 }
11241 }
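
/* Illustrative example (editorial sketch): a constructor with
   priority 42 is emitted into a section named ".init_array.00042"
   (and a destructor into ".fini_array.00042"), so that the entries
   can be ordered by the zero-padded priority suffix at link time.  */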
11242
11243 const char*
11244 aarch64_output_casesi (rtx *operands)
11245 {
11246 char buf[100];
11247 char label[100];
11248 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11249 int index;
11250 static const char *const patterns[4][2] =
11251 {
11252 {
11253 "ldrb\t%w3, [%0,%w1,uxtw]",
11254 "add\t%3, %4, %w3, sxtb #2"
11255 },
11256 {
11257 "ldrh\t%w3, [%0,%w1,uxtw #1]",
11258 "add\t%3, %4, %w3, sxth #2"
11259 },
11260 {
11261 "ldr\t%w3, [%0,%w1,uxtw #2]",
11262 "add\t%3, %4, %w3, sxtw #2"
11263 },
11264 /* We assume that DImode is only generated when not optimizing and
11265 that we don't really need 64-bit address offsets. That would
11266 imply an object file with 8GB of code in a single function! */
11267 {
11268 "ldr\t%w3, [%0,%w1,uxtw #2]",
11269 "add\t%3, %4, %w3, sxtw #2"
11270 }
11271 };
11272
11273 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11274
11275 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11276 index = exact_log2 (GET_MODE_SIZE (mode));
11277
11278 gcc_assert (index >= 0 && index <= 3);
11279
11280 /* Need to implement table size reduction, by changing the code below. */
11281 output_asm_insn (patterns[index][0], operands);
11282 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11283 snprintf (buf, sizeof (buf),
11284 "adr\t%%4, %s", targetm.strip_name_encoding (label));
11285 output_asm_insn (buf, operands);
11286 output_asm_insn (patterns[index][1], operands);
11287 output_asm_insn ("br\t%3", operands);
11288 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11289 operands);
11290 assemble_label (asm_out_file, label);
11291 return "";
11292 }
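
/* Illustrative example (editorial sketch, hypothetical registers and
   label): for a dispatch table of HImode entries (index 1 above) the
   emitted sequence looks like

	ldrh	w3, [x0,w1,uxtw #1]
	adr	x4, .Lrtx4
	add	x3, x4, w3, sxth #2
	br	x3

   optionally followed by an SLS speculation barrier, with .Lrtx4
   placed immediately afterwards; the branch target is thus the
   address of .Lrtx4 plus the sign-extended table entry shifted left
   by two.  */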
11293
11294
11295 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11296 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11297 operator. */
11298
11299 int
11300 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11301 {
11302 if (shift >= 0 && shift <= 3)
11303 {
11304 int size;
11305 for (size = 8; size <= 32; size *= 2)
11306 {
11307 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11308 if (mask == bits << shift)
11309 return size;
11310 }
11311 }
11312 return 0;
11313 }
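
/* Worked example (editorial sketch): aarch64_uxt_size (2, 0x3fc)
   returns 8, because 0xff << 2 == 0x3fc, so (x << 2) & 0x3fc matches
   an extended-register UXTB with a left shift of 2.  A mask that is
   not a shifted 8/16/32-bit block, e.g. 0x7fc, makes it return 0.  */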
11314
11315 /* Constant pools are per-function only when PC-relative
11316 literal loads are enabled or we are using the large memory
11317 model. */
11318
11319 static inline bool
11320 aarch64_can_use_per_function_literal_pools_p (void)
11321 {
11322 return (aarch64_pcrelative_literal_loads
11323 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11324 }
11325
11326 static bool
11327 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11328 {
11329 /* We can't use blocks for constants when we're using a per-function
11330 constant pool. */
11331 return !aarch64_can_use_per_function_literal_pools_p ();
11332 }
11333
11334 /* Select appropriate section for constants depending
11335 on where we place literal pools. */
11336
11337 static section *
11338 aarch64_select_rtx_section (machine_mode mode,
11339 rtx x,
11340 unsigned HOST_WIDE_INT align)
11341 {
11342 if (aarch64_can_use_per_function_literal_pools_p ())
11343 return function_section (current_function_decl);
11344
11345 return default_elf_select_rtx_section (mode, x, align);
11346 }
11347
11348 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11349 void
11350 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11351 HOST_WIDE_INT offset)
11352 {
11353 /* When using per-function literal pools, we must ensure that any code
11354 section is aligned to the minimal instruction length, lest we get
11355 errors from the assembler re "unaligned instructions". */
11356 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11357 ASM_OUTPUT_ALIGN (f, 2);
11358 }
11359
11360 /* Costs. */
11361
11362 /* Helper function for rtx cost calculation. Strip a shift expression
11363 from X. Returns the inner operand if successful, or the original
11364 expression on failure. */
11365 static rtx
11366 aarch64_strip_shift (rtx x)
11367 {
11368 rtx op = x;
11369
11370 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11371 we can convert both to ROR during final output. */
11372 if ((GET_CODE (op) == ASHIFT
11373 || GET_CODE (op) == ASHIFTRT
11374 || GET_CODE (op) == LSHIFTRT
11375 || GET_CODE (op) == ROTATERT
11376 || GET_CODE (op) == ROTATE)
11377 && CONST_INT_P (XEXP (op, 1)))
11378 return XEXP (op, 0);
11379
11380 if (GET_CODE (op) == MULT
11381 && CONST_INT_P (XEXP (op, 1))
11382 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11383 return XEXP (op, 0);
11384
11385 return x;
11386 }
11387
11388 /* Helper function for rtx cost calculation. Strip an extend
11389 expression from X. Returns the inner operand if successful, or the
11390 original expression on failure. We deal with a number of possible
11391 canonicalization variations here. If STRIP_SHIFT is true, then
11392 we can strip off a shift also. */
11393 static rtx
11394 aarch64_strip_extend (rtx x, bool strip_shift)
11395 {
11396 scalar_int_mode mode;
11397 rtx op = x;
11398
11399 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11400 return op;
11401
11402 if (GET_CODE (op) == AND
11403 && GET_CODE (XEXP (op, 0)) == MULT
11404 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11405 && CONST_INT_P (XEXP (op, 1))
11406 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11407 INTVAL (XEXP (op, 1))) != 0)
11408 return XEXP (XEXP (op, 0), 0);
11409
11410 /* Now handle extended register, as this may also have an optional
11411 left shift by 1..4. */
11412 if (strip_shift
11413 && GET_CODE (op) == ASHIFT
11414 && CONST_INT_P (XEXP (op, 1))
11415 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11416 op = XEXP (op, 0);
11417
11418 if (GET_CODE (op) == ZERO_EXTEND
11419 || GET_CODE (op) == SIGN_EXTEND)
11420 op = XEXP (op, 0);
11421
11422 if (op != x)
11423 return op;
11424
11425 return x;
11426 }
11427
11428 /* Return true iff CODE is a shift supported in combination
11429 with arithmetic instructions. */
11430
11431 static bool
11432 aarch64_shift_p (enum rtx_code code)
11433 {
11434 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11435 }
11436
11437
11438 /* Return true iff X is a cheap shift without a sign extend. */
11439
11440 static bool
11441 aarch64_cheap_mult_shift_p (rtx x)
11442 {
11443 rtx op0, op1;
11444
11445 op0 = XEXP (x, 0);
11446 op1 = XEXP (x, 1);
11447
11448 if (!(aarch64_tune_params.extra_tuning_flags
11449 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11450 return false;
11451
11452 if (GET_CODE (op0) == SIGN_EXTEND)
11453 return false;
11454
11455 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11456 && UINTVAL (op1) <= 4)
11457 return true;
11458
11459 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11460 return false;
11461
11462 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11463
11464 if (l2 > 0 && l2 <= 4)
11465 return true;
11466
11467 return false;
11468 }
11469
11470 /* Helper function for rtx cost calculation. Calculate the cost of
11471 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11472 Return the calculated cost of the expression, recursing manually in to
11473 operands where needed. */
11474
11475 static int
11476 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11477 {
11478 rtx op0, op1;
11479 const struct cpu_cost_table *extra_cost
11480 = aarch64_tune_params.insn_extra_cost;
11481 int cost = 0;
11482 bool compound_p = (outer == PLUS || outer == MINUS);
11483 machine_mode mode = GET_MODE (x);
11484
11485 gcc_checking_assert (code == MULT);
11486
11487 op0 = XEXP (x, 0);
11488 op1 = XEXP (x, 1);
11489
11490 if (VECTOR_MODE_P (mode))
11491 {
11492 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11493 mode = GET_MODE_INNER (mode);
11494 if (vec_flags & VEC_ADVSIMD)
11495 {
11496 /* The by-element versions of the instruction have the same costs as
11497 the normal 3-vector version. So don't add the costs of the
11498 duplicate into the costs of the multiply. We make an assumption
11499 that the input to the VEC_DUPLICATE is already on the FP & SIMD
11500 side. This means costing of a MUL by element pre RA is a bit
11501 optimistic. */
11502 if (GET_CODE (op0) == VEC_DUPLICATE)
11503 op0 = XEXP (op0, 0);
11504 else if (GET_CODE (op1) == VEC_DUPLICATE)
11505 op1 = XEXP (op1, 0);
11506 }
11507 }
11508
11509 /* Integer multiply/fma. */
11510 if (GET_MODE_CLASS (mode) == MODE_INT)
11511 {
11512 /* The multiply will be canonicalized as a shift, cost it as such. */
11513 if (aarch64_shift_p (GET_CODE (x))
11514 || (CONST_INT_P (op1)
11515 && exact_log2 (INTVAL (op1)) > 0))
11516 {
11517 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11518 || GET_CODE (op0) == SIGN_EXTEND;
11519 if (speed)
11520 {
11521 if (compound_p)
11522 {
11523 /* If the shift is considered cheap,
11524 then don't add any cost. */
11525 if (aarch64_cheap_mult_shift_p (x))
11526 ;
11527 else if (REG_P (op1))
11528 /* ARITH + shift-by-register. */
11529 cost += extra_cost->alu.arith_shift_reg;
11530 else if (is_extend)
11531 /* ARITH + extended register. We don't have a cost field
11532 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11533 cost += extra_cost->alu.extend_arith;
11534 else
11535 /* ARITH + shift-by-immediate. */
11536 cost += extra_cost->alu.arith_shift;
11537 }
11538 else
11539 /* LSL (immediate). */
11540 cost += extra_cost->alu.shift;
11541
11542 }
11543 /* Strip extends as we will have costed them in the case above. */
11544 if (is_extend)
11545 op0 = aarch64_strip_extend (op0, true);
11546
11547 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11548
11549 return cost;
11550 }
11551
11552 /* MNEG or [US]MNEGL. Extract the NEG operand, mark this as a
11553 compound operation, and let the cases below handle it. After all,
11554 MNEG is a special-case alias of MSUB. */
11555 if (GET_CODE (op0) == NEG)
11556 {
11557 op0 = XEXP (op0, 0);
11558 compound_p = true;
11559 }
11560
11561 /* Integer multiplies or FMAs have zero/sign extending variants. */
11562 if ((GET_CODE (op0) == ZERO_EXTEND
11563 && GET_CODE (op1) == ZERO_EXTEND)
11564 || (GET_CODE (op0) == SIGN_EXTEND
11565 && GET_CODE (op1) == SIGN_EXTEND))
11566 {
11567 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11568 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11569
11570 if (speed)
11571 {
11572 if (compound_p)
11573 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
11574 cost += extra_cost->mult[0].extend_add;
11575 else
11576 /* MUL/SMULL/UMULL. */
11577 cost += extra_cost->mult[0].extend;
11578 }
11579
11580 return cost;
11581 }
11582
11583 /* This is either an integer multiply or a MADD. In both cases
11584 we want to recurse and cost the operands. */
11585 cost += rtx_cost (op0, mode, MULT, 0, speed);
11586 cost += rtx_cost (op1, mode, MULT, 1, speed);
11587
11588 if (speed)
11589 {
11590 if (compound_p)
11591 /* MADD/MSUB. */
11592 cost += extra_cost->mult[mode == DImode].add;
11593 else
11594 /* MUL. */
11595 cost += extra_cost->mult[mode == DImode].simple;
11596 }
11597
11598 return cost;
11599 }
11600 else
11601 {
11602 if (speed)
11603 {
11604 /* Floating-point FMA/FMUL can also support negations of the
11605 operands, unless the rounding mode is upward or downward in
11606 which case FNMUL is different than FMUL with operand negation. */
11607 bool neg0 = GET_CODE (op0) == NEG;
11608 bool neg1 = GET_CODE (op1) == NEG;
11609 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11610 {
11611 if (neg0)
11612 op0 = XEXP (op0, 0);
11613 if (neg1)
11614 op1 = XEXP (op1, 0);
11615 }
11616
11617 if (compound_p)
11618 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11619 cost += extra_cost->fp[mode == DFmode].fma;
11620 else
11621 /* FMUL/FNMUL. */
11622 cost += extra_cost->fp[mode == DFmode].mult;
11623 }
11624
11625 cost += rtx_cost (op0, mode, MULT, 0, speed);
11626 cost += rtx_cost (op1, mode, MULT, 1, speed);
11627 return cost;
11628 }
11629 }
11630
11631 static int
11632 aarch64_address_cost (rtx x,
11633 machine_mode mode,
11634 addr_space_t as ATTRIBUTE_UNUSED,
11635 bool speed)
11636 {
11637 enum rtx_code c = GET_CODE (x);
11638 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11639 struct aarch64_address_info info;
11640 int cost = 0;
11641 info.shift = 0;
11642
11643 if (!aarch64_classify_address (&info, x, mode, false))
11644 {
11645 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
11646 {
11647 /* This is a CONST or SYMBOL ref which will be split
11648 in a different way depending on the code model in use.
11649 Cost it through the generic infrastructure. */
11650 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11651 /* Divide through by the cost of one instruction to
11652 bring it to the same units as the address costs. */
11653 cost_symbol_ref /= COSTS_N_INSNS (1);
11654 /* The cost is then the cost of preparing the address,
11655 followed by an immediate (possibly 0) offset. */
11656 return cost_symbol_ref + addr_cost->imm_offset;
11657 }
11658 else
11659 {
11660 /* This is most likely a jump table from a case
11661 statement. */
11662 return addr_cost->register_offset;
11663 }
11664 }
11665
11666 switch (info.type)
11667 {
11668 case ADDRESS_LO_SUM:
11669 case ADDRESS_SYMBOLIC:
11670 case ADDRESS_REG_IMM:
11671 cost += addr_cost->imm_offset;
11672 break;
11673
11674 case ADDRESS_REG_WB:
11675 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11676 cost += addr_cost->pre_modify;
11677 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11678 cost += addr_cost->post_modify;
11679 else
11680 gcc_unreachable ();
11681
11682 break;
11683
11684 case ADDRESS_REG_REG:
11685 cost += addr_cost->register_offset;
11686 break;
11687
11688 case ADDRESS_REG_SXTW:
11689 cost += addr_cost->register_sextend;
11690 break;
11691
11692 case ADDRESS_REG_UXTW:
11693 cost += addr_cost->register_zextend;
11694 break;
11695
11696 default:
11697 gcc_unreachable ();
11698 }
11699
11700
11701 if (info.shift > 0)
11702 {
11703 /* For the sake of calculating the cost of the shifted register
11704 component, we can treat same sized modes in the same way. */
11705 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11706 cost += addr_cost->addr_scale_costs.hi;
11707 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11708 cost += addr_cost->addr_scale_costs.si;
11709 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11710 cost += addr_cost->addr_scale_costs.di;
11711 else
11712 /* We can't tell, or this is a 128-bit vector. */
11713 cost += addr_cost->addr_scale_costs.ti;
11714 }
11715
11716 return cost;
11717 }
11718
11719 /* Return the cost of a branch. If SPEED_P is true then the compiler is
11720 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11721 to be taken. */
11722
11723 int
11724 aarch64_branch_cost (bool speed_p, bool predictable_p)
11725 {
11726 /* When optimizing for speed, use the cost of unpredictable branches. */
11727 const struct cpu_branch_cost *branch_costs =
11728 aarch64_tune_params.branch_costs;
11729
11730 if (!speed_p || predictable_p)
11731 return branch_costs->predictable;
11732 else
11733 return branch_costs->unpredictable;
11734 }
11735
11736 /* Return true if X is a zero or sign extract
11737 usable in an ADD or SUB (extended register) instruction. */
11738 static bool
11739 aarch64_rtx_arith_op_extract_p (rtx x)
11740 {
11741 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11742 No shift. */
11743 if (GET_CODE (x) == SIGN_EXTEND
11744 || GET_CODE (x) == ZERO_EXTEND)
11745 return REG_P (XEXP (x, 0));
11746
11747 return false;
11748 }
11749
11750 static bool
11751 aarch64_frint_unspec_p (unsigned int u)
11752 {
11753 switch (u)
11754 {
11755 case UNSPEC_FRINTZ:
11756 case UNSPEC_FRINTP:
11757 case UNSPEC_FRINTM:
11758 case UNSPEC_FRINTA:
11759 case UNSPEC_FRINTN:
11760 case UNSPEC_FRINTX:
11761 case UNSPEC_FRINTI:
11762 return true;
11763
11764 default:
11765 return false;
11766 }
11767 }
11768
11769 /* Return true iff X is an rtx that will match an extr instruction
11770 i.e. as described in the *extr<mode>5_insn family of patterns.
11771 OP0 and OP1 will be set to the operands of the shifts involved
11772 on success and will be NULL_RTX otherwise. */
11773
11774 static bool
11775 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11776 {
11777 rtx op0, op1;
11778 scalar_int_mode mode;
11779 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11780 return false;
11781
11782 *res_op0 = NULL_RTX;
11783 *res_op1 = NULL_RTX;
11784
11785 if (GET_CODE (x) != IOR)
11786 return false;
11787
11788 op0 = XEXP (x, 0);
11789 op1 = XEXP (x, 1);
11790
11791 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11792 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11793 {
11794 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11795 if (GET_CODE (op1) == ASHIFT)
11796 std::swap (op0, op1);
11797
11798 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11799 return false;
11800
11801 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11802 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11803
11804 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11805 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11806 {
11807 *res_op0 = XEXP (op0, 0);
11808 *res_op1 = XEXP (op1, 0);
11809 return true;
11810 }
11811 }
11812
11813 return false;
11814 }
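
/* Illustrative example (editorial sketch): in DImode the expression
   (ior (ashift x (const_int 40)) (lshiftrt y (const_int 24)))
   passes the checks above because 40 + 24 == 64, so *res_op0 is set
   to x, *res_op1 to y, and the expression can be matched as an EXTR
   whose immediate is the LSHIFTRT count, 24.  */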
11815
11816 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11817 storing it in *COST. Result is true if the total cost of the operation
11818 has now been calculated. */
11819 static bool
11820 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11821 {
11822 rtx inner;
11823 rtx comparator;
11824 enum rtx_code cmpcode;
11825 const struct cpu_cost_table *extra_cost
11826 = aarch64_tune_params.insn_extra_cost;
11827
11828 if (COMPARISON_P (op0))
11829 {
11830 inner = XEXP (op0, 0);
11831 comparator = XEXP (op0, 1);
11832 cmpcode = GET_CODE (op0);
11833 }
11834 else
11835 {
11836 inner = op0;
11837 comparator = const0_rtx;
11838 cmpcode = NE;
11839 }
11840
11841 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11842 {
11843 /* Conditional branch. */
11844 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11845 return true;
11846 else
11847 {
11848 if (cmpcode == NE || cmpcode == EQ)
11849 {
11850 if (comparator == const0_rtx)
11851 {
11852 /* TBZ/TBNZ/CBZ/CBNZ. */
11853 if (GET_CODE (inner) == ZERO_EXTRACT)
11854 /* TBZ/TBNZ. */
11855 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11856 ZERO_EXTRACT, 0, speed);
11857 else
11858 /* CBZ/CBNZ. */
11859 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11860
11861 return true;
11862 }
11863 if (register_operand (inner, VOIDmode)
11864 && aarch64_imm24 (comparator, VOIDmode))
11865 {
11866 /* SUB and SUBS. */
11867 *cost += COSTS_N_INSNS (2);
11868 if (speed)
11869 *cost += extra_cost->alu.arith * 2;
11870 return true;
11871 }
11872 }
11873 else if (cmpcode == LT || cmpcode == GE)
11874 {
11875 /* TBZ/TBNZ. */
11876 if (comparator == const0_rtx)
11877 return true;
11878 }
11879 }
11880 }
11881 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11882 {
11883 /* CCMP. */
11884 if (GET_CODE (op1) == COMPARE)
11885 {
11886 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11887 if (XEXP (op1, 1) == const0_rtx)
11888 *cost += 1;
11889 if (speed)
11890 {
11891 machine_mode mode = GET_MODE (XEXP (op1, 0));
11892
11893 if (GET_MODE_CLASS (mode) == MODE_INT)
11894 *cost += extra_cost->alu.arith;
11895 else
11896 *cost += extra_cost->fp[mode == DFmode].compare;
11897 }
11898 return true;
11899 }
11900
11901 /* It's a conditional operation based on the status flags,
11902 so it must be some flavor of CSEL. */
11903
11904 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11905 if (GET_CODE (op1) == NEG
11906 || GET_CODE (op1) == NOT
11907 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11908 op1 = XEXP (op1, 0);
11909 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11910 {
11911 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11912 op1 = XEXP (op1, 0);
11913 op2 = XEXP (op2, 0);
11914 }
11915 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
11916 {
11917 inner = XEXP (op1, 0);
11918 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
11919 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
11920 op1 = XEXP (inner, 0);
11921 }
11922
11923 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11924 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11925 return true;
11926 }
11927
11928 /* We don't know what this is, so cost all operands. */
11929 return false;
11930 }
11931
11932 /* Check whether X is a bitfield operation of the form shift + extend that
11933 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11934 operand to which the bitfield operation is applied. Otherwise return
11935 NULL_RTX. */
11936
11937 static rtx
11938 aarch64_extend_bitfield_pattern_p (rtx x)
11939 {
11940 rtx_code outer_code = GET_CODE (x);
11941 machine_mode outer_mode = GET_MODE (x);
11942
11943 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11944 && outer_mode != SImode && outer_mode != DImode)
11945 return NULL_RTX;
11946
11947 rtx inner = XEXP (x, 0);
11948 rtx_code inner_code = GET_CODE (inner);
11949 machine_mode inner_mode = GET_MODE (inner);
11950 rtx op = NULL_RTX;
11951
11952 switch (inner_code)
11953 {
11954 case ASHIFT:
11955 if (CONST_INT_P (XEXP (inner, 1))
11956 && (inner_mode == QImode || inner_mode == HImode))
11957 op = XEXP (inner, 0);
11958 break;
11959 case LSHIFTRT:
11960 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11961 && (inner_mode == QImode || inner_mode == HImode))
11962 op = XEXP (inner, 0);
11963 break;
11964 case ASHIFTRT:
11965 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11966 && (inner_mode == QImode || inner_mode == HImode))
11967 op = XEXP (inner, 0);
11968 break;
11969 default:
11970 break;
11971 }
11972
11973 return op;
11974 }
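
/* Illustrative example (editorial sketch):
   (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))) matches the
   LSHIFTRT case above and returns the inner register, so the
   combination can be costed (and matched) as a single UBFX; a
   SIGN_EXTEND of an ASHIFTRT similarly corresponds to SBFX.  */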
11975
11976 /* Return true if the mask and a shift amount from an RTX of the form
11977 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11978 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11979
11980 bool
11981 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11982 rtx shft_amnt)
11983 {
11984 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11985 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11986 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11987 && (INTVAL (mask)
11988 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11989 }
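
/* Worked example (editorial sketch): for SImode, mask 0xff0 with
   shift 4 is accepted: 4 < 32, (0xff0 >> 4) + 1 == 0x100 is a power
   of two, and no mask bit lies below the shift, so (x << 4) & 0xff0
   can become a UBFIZ inserting an 8-bit field at bit 4.  A mask such
   as 0xff1 is rejected because bit 0 lies below the shift amount.  */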
11990
11991 /* Return true if the masks and a shift amount from an RTX of the form
11992 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11993 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
11994
11995 bool
11996 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11997 unsigned HOST_WIDE_INT mask1,
11998 unsigned HOST_WIDE_INT shft_amnt,
11999 unsigned HOST_WIDE_INT mask2)
12000 {
12001 unsigned HOST_WIDE_INT t;
12002
12003 /* Verify that there is no overlap in what bits are set in the two masks. */
12004 if (mask1 != ~mask2)
12005 return false;
12006
12007 /* Verify that mask2 is not all zeros or ones. */
12008 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12009 return false;
12010
12011 /* The shift amount should always be less than the mode size. */
12012 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12013
12014 /* Verify that the mask being shifted is contiguous and would be in the
12015 least significant bits after shifting by shft_amnt. */
12016 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12017 return (t == (t & -t));
12018 }
12019
12020 /* Calculate the cost of calculating X, storing it in *COST. Result
12021 is true if the total cost of the operation has now been calculated. */
12022 static bool
12023 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
12024 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12025 {
12026 rtx op0, op1, op2;
12027 const struct cpu_cost_table *extra_cost
12028 = aarch64_tune_params.insn_extra_cost;
12029 int code = GET_CODE (x);
12030 scalar_int_mode int_mode;
12031
12032 /* By default, assume that everything has equivalent cost to the
12033 cheapest instruction. Any additional costs are applied as a delta
12034 above this default. */
12035 *cost = COSTS_N_INSNS (1);
12036
12037 switch (code)
12038 {
12039 case SET:
12040 /* The cost depends entirely on the operands to SET. */
12041 *cost = 0;
12042 op0 = SET_DEST (x);
12043 op1 = SET_SRC (x);
12044
12045 switch (GET_CODE (op0))
12046 {
12047 case MEM:
12048 if (speed)
12049 {
12050 rtx address = XEXP (op0, 0);
12051 if (VECTOR_MODE_P (mode))
12052 *cost += extra_cost->ldst.storev;
12053 else if (GET_MODE_CLASS (mode) == MODE_INT)
12054 *cost += extra_cost->ldst.store;
12055 else if (mode == SFmode)
12056 *cost += extra_cost->ldst.storef;
12057 else if (mode == DFmode)
12058 *cost += extra_cost->ldst.stored;
12059
12060 *cost +=
12061 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12062 0, speed));
12063 }
12064
12065 *cost += rtx_cost (op1, mode, SET, 1, speed);
12066 return true;
12067
12068 case SUBREG:
12069 if (! REG_P (SUBREG_REG (op0)))
12070 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
12071
12072 /* Fall through. */
12073 case REG:
12074 /* The cost is one per vector-register copied. */
12075 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12076 {
12077 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12078 *cost = COSTS_N_INSNS (nregs);
12079 }
12080 /* const0_rtx is in general free, but we will use an
12081 instruction to set a register to 0. */
12082 else if (REG_P (op1) || op1 == const0_rtx)
12083 {
12084 /* The cost is 1 per register copied. */
12085 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12086 *cost = COSTS_N_INSNS (nregs);
12087 }
12088 else
12089 /* Cost is just the cost of the RHS of the set. */
12090 *cost += rtx_cost (op1, mode, SET, 1, speed);
12091 return true;
12092
12093 case ZERO_EXTRACT:
12094 case SIGN_EXTRACT:
12095 /* Bit-field insertion. Strip any redundant widening of
12096 the RHS to meet the width of the target. */
12097 if (GET_CODE (op1) == SUBREG)
12098 op1 = SUBREG_REG (op1);
12099 if ((GET_CODE (op1) == ZERO_EXTEND
12100 || GET_CODE (op1) == SIGN_EXTEND)
12101 && CONST_INT_P (XEXP (op0, 1))
12102 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12103 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
12104 op1 = XEXP (op1, 0);
12105
12106 if (CONST_INT_P (op1))
12107 {
12108 /* MOV immediate is assumed to always be cheap. */
12109 *cost = COSTS_N_INSNS (1);
12110 }
12111 else
12112 {
12113 /* BFM. */
12114 if (speed)
12115 *cost += extra_cost->alu.bfi;
12116 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
12117 }
12118
12119 return true;
12120
12121 default:
12122 /* We can't make sense of this; assume the default cost. */
12123 *cost = COSTS_N_INSNS (1);
12124 return false;
12125 }
12126 return false;
12127
12128 case CONST_INT:
12129 /* If an instruction can incorporate a constant within the
12130 instruction, the instruction's expression avoids calling
12131 rtx_cost() on the constant. If rtx_cost() is called on a
12132 constant, then it is usually because the constant must be
12133 moved into a register by one or more instructions.
12134
12135 The exception is constant 0, which can be expressed
12136 as XZR/WZR and is therefore free. The one case where this does
12137 not hold is (set (reg) (const0_rtx)), where we must cost
12138 the move. However, we can catch that when we cost the SET, so
12139 we don't need to consider that here. */
12140 if (x == const0_rtx)
12141 *cost = 0;
12142 else
12143 {
12144 /* To an approximation, building any other constant is
12145 proportionally expensive to the number of instructions
12146 required to build that constant. This is true whether we
12147 are compiling for SPEED or otherwise. */
12148 if (!is_a <scalar_int_mode> (mode, &int_mode))
12149 int_mode = word_mode;
12150 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12151 (NULL_RTX, x, false, int_mode));
12152 }
12153 return true;
12154
12155 case CONST_DOUBLE:
12156
12157 /* First determine number of instructions to do the move
12158 as an integer constant. */
12159 if (!aarch64_float_const_representable_p (x)
12160 && !aarch64_can_const_movi_rtx_p (x, mode)
12161 && aarch64_float_const_rtx_p (x))
12162 {
12163 unsigned HOST_WIDE_INT ival;
12164 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12165 gcc_assert (succeed);
12166
12167 scalar_int_mode imode = (mode == HFmode
12168 ? SImode
12169 : int_mode_for_mode (mode).require ());
12170 int ncost = aarch64_internal_mov_immediate
12171 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12172 *cost += COSTS_N_INSNS (ncost);
12173 return true;
12174 }
12175
12176 if (speed)
12177 {
12178 /* mov[df,sf]_aarch64. */
12179 if (aarch64_float_const_representable_p (x))
12180 /* FMOV (scalar immediate). */
12181 *cost += extra_cost->fp[mode == DFmode].fpconst;
12182 else if (!aarch64_float_const_zero_rtx_p (x))
12183 {
12184 /* This will be a load from memory. */
12185 if (mode == DFmode)
12186 *cost += extra_cost->ldst.loadd;
12187 else
12188 *cost += extra_cost->ldst.loadf;
12189 }
12190 else
12191 /* Otherwise this is +0.0. We get this using MOVI d0, #0
12192 or MOV v0.s[0], wzr - neither of which is modeled by the
12193 cost tables. Just use the default cost. */
12194 {
12195 }
12196 }
12197
12198 return true;
12199
12200 case MEM:
12201 if (speed)
12202 {
12203 /* For loads we want the base cost of a load, plus an
12204 approximation for the additional cost of the addressing
12205 mode. */
12206 rtx address = XEXP (x, 0);
12207 if (VECTOR_MODE_P (mode))
12208 *cost += extra_cost->ldst.loadv;
12209 else if (GET_MODE_CLASS (mode) == MODE_INT)
12210 *cost += extra_cost->ldst.load;
12211 else if (mode == SFmode)
12212 *cost += extra_cost->ldst.loadf;
12213 else if (mode == DFmode)
12214 *cost += extra_cost->ldst.loadd;
12215
12216 *cost +=
12217 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12218 0, speed));
12219 }
12220
12221 return true;
12222
12223 case NEG:
12224 op0 = XEXP (x, 0);
12225
12226 if (VECTOR_MODE_P (mode))
12227 {
12228 if (speed)
12229 {
12230 /* FNEG. */
12231 *cost += extra_cost->vect.alu;
12232 }
12233 return false;
12234 }
12235
12236 if (GET_MODE_CLASS (mode) == MODE_INT)
12237 {
12238 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12239 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12240 {
12241 /* CSETM. */
12242 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12243 return true;
12244 }
12245
12246 /* Cost this as SUB wzr, X. */
12247 op0 = CONST0_RTX (mode);
12248 op1 = XEXP (x, 0);
12249 goto cost_minus;
12250 }
12251
12252 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12253 {
12254 /* Support (neg(fma...)) as a single instruction only if
12255 sign of zeros is unimportant. This matches the decision
12256 making in aarch64.md. */
12257 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12258 {
12259 /* FNMADD. */
12260 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12261 return true;
12262 }
12263 if (GET_CODE (op0) == MULT)
12264 {
12265 /* FNMUL. */
12266 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12267 return true;
12268 }
12269 if (speed)
12270 /* FNEG. */
12271 *cost += extra_cost->fp[mode == DFmode].neg;
12272 return false;
12273 }
12274
12275 return false;
12276
12277 case CLRSB:
12278 case CLZ:
12279 if (speed)
12280 {
12281 if (VECTOR_MODE_P (mode))
12282 *cost += extra_cost->vect.alu;
12283 else
12284 *cost += extra_cost->alu.clz;
12285 }
12286
12287 return false;
12288
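/* AArch64 has no count-trailing-zeros instruction; CTZ is synthesized
   as a bit reverse (RBIT) followed by CLZ, which is why the case below
   starts from a two-instruction baseline and adds the clz and rev
   costs.  */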
12289 case CTZ:
12290 *cost = COSTS_N_INSNS (2);
12291
12292 if (speed)
12293 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12294 return false;
12295
12296 case COMPARE:
12297 op0 = XEXP (x, 0);
12298 op1 = XEXP (x, 1);
12299
12300 if (op1 == const0_rtx
12301 && GET_CODE (op0) == AND)
12302 {
12303 x = op0;
12304 mode = GET_MODE (op0);
12305 goto cost_logic;
12306 }
12307
12308 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12309 {
12310 /* TODO: A write to the CC flags possibly costs extra, this
12311 needs encoding in the cost tables. */
12312
12313 mode = GET_MODE (op0);
12314 /* ANDS. */
12315 if (GET_CODE (op0) == AND)
12316 {
12317 x = op0;
12318 goto cost_logic;
12319 }
12320
12321 if (GET_CODE (op0) == PLUS)
12322 {
12323 /* ADDS (and CMN alias). */
12324 x = op0;
12325 goto cost_plus;
12326 }
12327
12328 if (GET_CODE (op0) == MINUS)
12329 {
12330 /* SUBS. */
12331 x = op0;
12332 goto cost_minus;
12333 }
12334
12335 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12336 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12337 && CONST_INT_P (XEXP (op0, 2)))
12338 {
12339 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12340 Handle it here directly rather than going to cost_logic:
12341 we know the immediate generated for the TST is valid,
12342 so we can avoid creating an intermediate rtx for it purely
12343 for costing purposes. */
12344 if (speed)
12345 *cost += extra_cost->alu.logical;
12346
12347 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12348 ZERO_EXTRACT, 0, speed);
12349 return true;
12350 }
12351
12352 if (GET_CODE (op1) == NEG)
12353 {
12354 /* CMN. */
12355 if (speed)
12356 *cost += extra_cost->alu.arith;
12357
12358 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12359 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12360 return true;
12361 }
12362
12363 /* CMP.
12364
12365 Compare can freely swap the order of operands, and
12366 canonicalization puts the more complex operation first.
12367 But the integer MINUS logic expects the shift/extend
12368 operation in op1. */
12369 if (! (REG_P (op0)
12370 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12371 {
12372 op0 = XEXP (x, 1);
12373 op1 = XEXP (x, 0);
12374 }
12375 goto cost_minus;
12376 }
12377
12378 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12379 {
12380 /* FCMP. */
12381 if (speed)
12382 *cost += extra_cost->fp[mode == DFmode].compare;
12383
12384 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12385 {
12386 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12387 /* FCMP supports constant 0.0 for no extra cost. */
12388 return true;
12389 }
12390 return false;
12391 }
12392
12393 if (VECTOR_MODE_P (mode))
12394 {
12395 /* Vector compare. */
12396 if (speed)
12397 *cost += extra_cost->vect.alu;
12398
12399 if (aarch64_float_const_zero_rtx_p (op1))
12400 {
12401 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12402 cost. */
12403 return true;
12404 }
12405 return false;
12406 }
12407 return false;
12408
12409 case MINUS:
12410 {
12411 op0 = XEXP (x, 0);
12412 op1 = XEXP (x, 1);
12413
12414 cost_minus:
12415 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12416
12417 /* Detect valid immediates. */
12418 if ((GET_MODE_CLASS (mode) == MODE_INT
12419 || (GET_MODE_CLASS (mode) == MODE_CC
12420 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12421 && CONST_INT_P (op1)
12422 && aarch64_uimm12_shift (INTVAL (op1)))
12423 {
12424 if (speed)
12425 /* SUB(S) (immediate). */
12426 *cost += extra_cost->alu.arith;
12427 return true;
12428 }
12429
12430 /* Look for SUB (extended register). */
12431 if (is_a <scalar_int_mode> (mode)
12432 && aarch64_rtx_arith_op_extract_p (op1))
12433 {
12434 if (speed)
12435 *cost += extra_cost->alu.extend_arith;
12436
12437 op1 = aarch64_strip_extend (op1, true);
12438 *cost += rtx_cost (op1, VOIDmode,
12439 (enum rtx_code) GET_CODE (op1), 0, speed);
12440 return true;
12441 }
12442
12443 rtx new_op1 = aarch64_strip_extend (op1, false);
12444
12445 /* Cost this as an FMA-alike operation. */
12446 if ((GET_CODE (new_op1) == MULT
12447 || aarch64_shift_p (GET_CODE (new_op1)))
12448 && code != COMPARE)
12449 {
12450 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12451 (enum rtx_code) code,
12452 speed);
12453 return true;
12454 }
12455
12456 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12457
12458 if (speed)
12459 {
12460 if (VECTOR_MODE_P (mode))
12461 {
12462 /* Vector SUB. */
12463 *cost += extra_cost->vect.alu;
12464 }
12465 else if (GET_MODE_CLASS (mode) == MODE_INT)
12466 {
12467 /* SUB(S). */
12468 *cost += extra_cost->alu.arith;
12469 }
12470 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12471 {
12472 /* FSUB. */
12473 *cost += extra_cost->fp[mode == DFmode].addsub;
12474 }
12475 }
12476 return true;
12477 }
12478
12479 case PLUS:
12480 {
12481 rtx new_op0;
12482
12483 op0 = XEXP (x, 0);
12484 op1 = XEXP (x, 1);
12485
12486 cost_plus:
12487 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12488 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12489 {
12490 /* CSINC. */
12491 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12492 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12493 return true;
12494 }
12495
12496 if (GET_MODE_CLASS (mode) == MODE_INT
12497 && (aarch64_plus_immediate (op1, mode)
12498 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
12499 {
12500 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12501
12502 if (speed)
12503 /* ADD (immediate). */
12504 *cost += extra_cost->alu.arith;
12505 return true;
12506 }
12507
12508 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12509
12510 /* Look for ADD (extended register). */
12511 if (is_a <scalar_int_mode> (mode)
12512 && aarch64_rtx_arith_op_extract_p (op0))
12513 {
12514 if (speed)
12515 *cost += extra_cost->alu.extend_arith;
12516
12517 op0 = aarch64_strip_extend (op0, true);
12518 *cost += rtx_cost (op0, VOIDmode,
12519 (enum rtx_code) GET_CODE (op0), 0, speed);
12520 return true;
12521 }
12522
12523 /* Strip any extend, leave shifts behind as we will
12524 cost them through mult_cost. */
12525 new_op0 = aarch64_strip_extend (op0, false);
12526
12527 if (GET_CODE (new_op0) == MULT
12528 || aarch64_shift_p (GET_CODE (new_op0)))
12529 {
12530 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12531 speed);
12532 return true;
12533 }
12534
12535 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12536
12537 if (speed)
12538 {
12539 if (VECTOR_MODE_P (mode))
12540 {
12541 /* Vector ADD. */
12542 *cost += extra_cost->vect.alu;
12543 }
12544 else if (GET_MODE_CLASS (mode) == MODE_INT)
12545 {
12546 /* ADD. */
12547 *cost += extra_cost->alu.arith;
12548 }
12549 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12550 {
12551 /* FADD. */
12552 *cost += extra_cost->fp[mode == DFmode].addsub;
12553 }
12554 }
12555 return true;
12556 }
12557
12558 case BSWAP:
12559 *cost = COSTS_N_INSNS (1);
12560
12561 if (speed)
12562 {
12563 if (VECTOR_MODE_P (mode))
12564 *cost += extra_cost->vect.alu;
12565 else
12566 *cost += extra_cost->alu.rev;
12567 }
12568 return false;
12569
12570 case IOR:
12571 if (aarch_rev16_p (x))
12572 {
12573 *cost = COSTS_N_INSNS (1);
12574
12575 if (speed)
12576 {
12577 if (VECTOR_MODE_P (mode))
12578 *cost += extra_cost->vect.alu;
12579 else
12580 *cost += extra_cost->alu.rev;
12581 }
12582 return true;
12583 }
12584
12585 if (aarch64_extr_rtx_p (x, &op0, &op1))
12586 {
12587 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12588 *cost += rtx_cost (op1, mode, IOR, 1, speed);
12589 if (speed)
12590 *cost += extra_cost->alu.shift;
12591
12592 return true;
12593 }
12594 /* Fall through. */
12595 case XOR:
12596 case AND:
12597 cost_logic:
12598 op0 = XEXP (x, 0);
12599 op1 = XEXP (x, 1);
12600
12601 if (VECTOR_MODE_P (mode))
12602 {
12603 if (speed)
12604 *cost += extra_cost->vect.alu;
12605 return true;
12606 }
12607
12608 if (code == AND
12609 && GET_CODE (op0) == MULT
12610 && CONST_INT_P (XEXP (op0, 1))
12611 && CONST_INT_P (op1)
12612 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12613 INTVAL (op1)) != 0)
12614 {
12615 /* This is a UBFM/SBFM. */
12616 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12617 if (speed)
12618 *cost += extra_cost->alu.bfx;
12619 return true;
12620 }
12621
12622 if (is_int_mode (mode, &int_mode))
12623 {
12624 if (CONST_INT_P (op1))
12625 {
12626 /* We have a mask + shift version of a UBFIZ
12627 i.e. the *andim_ashift<mode>_bfiz pattern. */
12628 if (GET_CODE (op0) == ASHIFT
12629 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12630 XEXP (op0, 1)))
12631 {
12632 *cost += rtx_cost (XEXP (op0, 0), int_mode,
12633 (enum rtx_code) code, 0, speed);
12634 if (speed)
12635 *cost += extra_cost->alu.bfx;
12636
12637 return true;
12638 }
12639 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12640 {
12641 /* We possibly get the immediate for free, this is not
12642 modelled. */
12643 *cost += rtx_cost (op0, int_mode,
12644 (enum rtx_code) code, 0, speed);
12645 if (speed)
12646 *cost += extra_cost->alu.logical;
12647
12648 return true;
12649 }
12650 }
12651 else
12652 {
12653 rtx new_op0 = op0;
12654
12655 /* Handle ORN, EON, or BIC. */
12656 if (GET_CODE (op0) == NOT)
12657 op0 = XEXP (op0, 0);
12658
12659 new_op0 = aarch64_strip_shift (op0);
12660
12661 /* If we had a shift on op0 then this is a logical-shift-
12662 by-register/immediate operation. Otherwise, this is just
12663 a logical operation. */
12664 if (speed)
12665 {
12666 if (new_op0 != op0)
12667 {
12668 /* Shift by immediate. */
12669 if (CONST_INT_P (XEXP (op0, 1)))
12670 *cost += extra_cost->alu.log_shift;
12671 else
12672 *cost += extra_cost->alu.log_shift_reg;
12673 }
12674 else
12675 *cost += extra_cost->alu.logical;
12676 }
12677
12678 /* In both cases we want to cost both operands. */
12679 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12680 0, speed);
12681 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12682 1, speed);
12683
12684 return true;
12685 }
12686 }
12687 return false;
12688
12689 case NOT:
12690 x = XEXP (x, 0);
12691 op0 = aarch64_strip_shift (x);
12692
12693 if (VECTOR_MODE_P (mode))
12694 {
12695 /* Vector NOT. */
12696 *cost += extra_cost->vect.alu;
12697 return false;
12698 }
12699
12700 /* MVN-shifted-reg. */
12701 if (op0 != x)
12702 {
12703 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12704
12705 if (speed)
12706 *cost += extra_cost->alu.log_shift;
12707
12708 return true;
12709 }
12710 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12711 Handle the second form here taking care that 'a' in the above can
12712 be a shift. */
12713 else if (GET_CODE (op0) == XOR)
12714 {
12715 rtx newop0 = XEXP (op0, 0);
12716 rtx newop1 = XEXP (op0, 1);
12717 rtx op0_stripped = aarch64_strip_shift (newop0);
12718
12719 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12720 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12721
12722 if (speed)
12723 {
12724 if (op0_stripped != newop0)
12725 *cost += extra_cost->alu.log_shift;
12726 else
12727 *cost += extra_cost->alu.logical;
12728 }
12729
12730 return true;
12731 }
12732 /* MVN. */
12733 if (speed)
12734 *cost += extra_cost->alu.logical;
12735
12736 return false;
12737
12738 case ZERO_EXTEND:
12739
12740 op0 = XEXP (x, 0);
12741 /* If a value is written in SI mode, then zero extended to DI
12742 mode, the operation will in general be free as a write to
12743 a 'w' register implicitly zeroes the upper bits of an 'x'
12744 register. However, if this is
12745
12746 (set (reg) (zero_extend (reg)))
12747
12748 we must cost the explicit register move. */
12749 if (mode == DImode
12750 && GET_MODE (op0) == SImode
12751 && outer == SET)
12752 {
12753 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12754
12755 /* If OP_COST is non-zero, then the cost of the zero extend
12756 is effectively the cost of the inner operation. Otherwise
12757 we have a MOV instruction and we take the cost from the MOV
12758 itself. This is true independently of whether we are
12759 optimizing for space or time. */
12760 if (op_cost)
12761 *cost = op_cost;
12762
12763 return true;
12764 }
12765 else if (MEM_P (op0))
12766 {
12767 /* All loads can zero extend to any size for free. */
12768 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12769 return true;
12770 }
12771
12772 op0 = aarch64_extend_bitfield_pattern_p (x);
12773 if (op0)
12774 {
12775 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12776 if (speed)
12777 *cost += extra_cost->alu.bfx;
12778 return true;
12779 }
12780
12781 if (speed)
12782 {
12783 if (VECTOR_MODE_P (mode))
12784 {
12785 /* UMOV. */
12786 *cost += extra_cost->vect.alu;
12787 }
12788 else
12789 {
12790 /* We generate an AND instead of UXTB/UXTH. */
12791 *cost += extra_cost->alu.logical;
12792 }
12793 }
12794 return false;
12795
12796 case SIGN_EXTEND:
12797 if (MEM_P (XEXP (x, 0)))
12798 {
12799 /* LDRSH. */
12800 if (speed)
12801 {
12802 rtx address = XEXP (XEXP (x, 0), 0);
12803 *cost += extra_cost->ldst.load_sign_extend;
12804
12805 *cost +=
12806 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12807 0, speed));
12808 }
12809 return true;
12810 }
12811
12812 op0 = aarch64_extend_bitfield_pattern_p (x);
12813 if (op0)
12814 {
12815 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12816 if (speed)
12817 *cost += extra_cost->alu.bfx;
12818 return true;
12819 }
12820
12821 if (speed)
12822 {
12823 if (VECTOR_MODE_P (mode))
12824 *cost += extra_cost->vect.alu;
12825 else
12826 *cost += extra_cost->alu.extend;
12827 }
12828 return false;
12829
12830 case ASHIFT:
12831 op0 = XEXP (x, 0);
12832 op1 = XEXP (x, 1);
12833
12834 if (CONST_INT_P (op1))
12835 {
12836 if (speed)
12837 {
12838 if (VECTOR_MODE_P (mode))
12839 {
12840 /* Vector shift (immediate). */
12841 *cost += extra_cost->vect.alu;
12842 }
12843 else
12844 {
12845 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12846 aliases. */
12847 *cost += extra_cost->alu.shift;
12848 }
12849 }
12850
12851 /* We can incorporate zero/sign extend for free. */
12852 if (GET_CODE (op0) == ZERO_EXTEND
12853 || GET_CODE (op0) == SIGN_EXTEND)
12854 op0 = XEXP (op0, 0);
12855
12856 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12857 return true;
12858 }
12859 else
12860 {
12861 if (VECTOR_MODE_P (mode))
12862 {
12863 if (speed)
12864 /* Vector shift (register). */
12865 *cost += extra_cost->vect.alu;
12866 }
12867 else
12868 {
12869 if (speed)
12870 /* LSLV. */
12871 *cost += extra_cost->alu.shift_reg;
12872
12873 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12874 && CONST_INT_P (XEXP (op1, 1))
12875 && known_eq (INTVAL (XEXP (op1, 1)),
12876 GET_MODE_BITSIZE (mode) - 1))
12877 {
12878 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12879 /* We already demanded XEXP (op1, 0) to be REG_P, so
12880 don't recurse into it. */
12881 return true;
12882 }
12883 }
12884 return false; /* All arguments need to be in registers. */
12885 }
12886
12887 case ROTATE:
12888 case ROTATERT:
12889 case LSHIFTRT:
12890 case ASHIFTRT:
12891 op0 = XEXP (x, 0);
12892 op1 = XEXP (x, 1);
12893
12894 if (CONST_INT_P (op1))
12895 {
12896 /* ASR (immediate) and friends. */
12897 if (speed)
12898 {
12899 if (VECTOR_MODE_P (mode))
12900 *cost += extra_cost->vect.alu;
12901 else
12902 *cost += extra_cost->alu.shift;
12903 }
12904
12905 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12906 return true;
12907 }
12908 else
12909 {
12910 if (VECTOR_MODE_P (mode))
12911 {
12912 if (speed)
12913 /* Vector shift (register). */
12914 *cost += extra_cost->vect.alu;
12915 }
12916 else
12917 {
12918 if (speed)
12919 /* ASR (register) and friends. */
12920 *cost += extra_cost->alu.shift_reg;
12921
12922 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12923 && CONST_INT_P (XEXP (op1, 1))
12924 && known_eq (INTVAL (XEXP (op1, 1)),
12925 GET_MODE_BITSIZE (mode) - 1))
12926 {
12927 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12928 /* We already demanded XEXP (op1, 0) to be REG_P, so
12929 don't recurse into it. */
12930 return true;
12931 }
12932 }
12933 return false; /* All arguments need to be in registers. */
12934 }
12935
12936 case SYMBOL_REF:
12937
12938 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12939 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12940 {
12941 /* LDR. */
12942 if (speed)
12943 *cost += extra_cost->ldst.load;
12944 }
12945 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12946 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12947 {
12948 /* ADRP, followed by ADD. */
12949 *cost += COSTS_N_INSNS (1);
12950 if (speed)
12951 *cost += 2 * extra_cost->alu.arith;
12952 }
12953 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12954 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12955 {
12956 /* ADR. */
12957 if (speed)
12958 *cost += extra_cost->alu.arith;
12959 }
12960
12961 if (flag_pic)
12962 {
12963 /* One extra load instruction, after accessing the GOT. */
12964 *cost += COSTS_N_INSNS (1);
12965 if (speed)
12966 *cost += extra_cost->ldst.load;
12967 }
12968 return true;
12969
12970 case HIGH:
12971 case LO_SUM:
12972 /* ADRP/ADD (immediate). */
12973 if (speed)
12974 *cost += extra_cost->alu.arith;
12975 return true;
12976
12977 case ZERO_EXTRACT:
12978 case SIGN_EXTRACT:
12979 /* UBFX/SBFX. */
12980 if (speed)
12981 {
12982 if (VECTOR_MODE_P (mode))
12983 *cost += extra_cost->vect.alu;
12984 else
12985 *cost += extra_cost->alu.bfx;
12986 }
12987
12988 /* We can trust that the immediates used will be correct (there
12989 are no by-register forms), so we need only cost op0. */
12990 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12991 return true;
12992
12993 case MULT:
12994 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12995 /* aarch64_rtx_mult_cost always handles recursion to its
12996 operands. */
12997 return true;
12998
12999 case MOD:
13000 /* We can expand signed mod by power of 2 using a NEGS, two parallel
13001 ANDs and a CSNEG. Assume here that a CSNEG costs the same as an
13002 unconditional negate. This case should only ever be reached through
13003 the set_smod_pow2_cheap check in expmed.c. */
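/* As a sketch (register choices are illustrative only), x % 4 in SImode
   expands along the lines of:

     negs   w1, w0
     and    w0, w0, #3
     and    w1, w1, #3
     csneg  w0, w0, w1, mi

   hence the four-instruction baseline used below.  */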
13004 if (CONST_INT_P (XEXP (x, 1))
13005 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13006 && (mode == SImode || mode == DImode))
13007 {
13008 /* We expand to 4 instructions. Reset the baseline. */
13009 *cost = COSTS_N_INSNS (4);
13010
13011 if (speed)
13012 *cost += 2 * extra_cost->alu.logical
13013 + 2 * extra_cost->alu.arith;
13014
13015 return true;
13016 }
13017
13018 /* Fall-through. */
13019 case UMOD:
13020 if (speed)
13021 {
13022 /* Slightly prefer UMOD over SMOD. */
13023 if (VECTOR_MODE_P (mode))
13024 *cost += extra_cost->vect.alu;
13025 else if (GET_MODE_CLASS (mode) == MODE_INT)
13026 *cost += (extra_cost->mult[mode == DImode].add
13027 + extra_cost->mult[mode == DImode].idiv
13028 + (code == MOD ? 1 : 0));
13029 }
13030 return false; /* All arguments need to be in registers. */
13031
13032 case DIV:
13033 case UDIV:
13034 case SQRT:
13035 if (speed)
13036 {
13037 if (VECTOR_MODE_P (mode))
13038 *cost += extra_cost->vect.alu;
13039 else if (GET_MODE_CLASS (mode) == MODE_INT)
13040 /* There is no integer SQRT, so only DIV and UDIV can get
13041 here. */
13042 *cost += (extra_cost->mult[mode == DImode].idiv
13043 /* Slightly prefer UDIV over SDIV. */
13044 + (code == DIV ? 1 : 0));
13045 else
13046 *cost += extra_cost->fp[mode == DFmode].div;
13047 }
13048 return false; /* All arguments need to be in registers. */
13049
13050 case IF_THEN_ELSE:
13051 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13052 XEXP (x, 2), cost, speed);
13053
13054 case EQ:
13055 case NE:
13056 case GT:
13057 case GTU:
13058 case LT:
13059 case LTU:
13060 case GE:
13061 case GEU:
13062 case LE:
13063 case LEU:
13064
13065 return false; /* All arguments must be in registers. */
13066
13067 case FMA:
13068 op0 = XEXP (x, 0);
13069 op1 = XEXP (x, 1);
13070 op2 = XEXP (x, 2);
13071
13072 if (speed)
13073 {
13074 if (VECTOR_MODE_P (mode))
13075 *cost += extra_cost->vect.alu;
13076 else
13077 *cost += extra_cost->fp[mode == DFmode].fma;
13078 }
13079
13080 /* FMSUB, FNMADD, and FNMSUB are free. */
13081 if (GET_CODE (op0) == NEG)
13082 op0 = XEXP (op0, 0);
13083
13084 if (GET_CODE (op2) == NEG)
13085 op2 = XEXP (op2, 0);
13086
13087 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13088 and the by-element operand as operand 0. */
13089 if (GET_CODE (op1) == NEG)
13090 op1 = XEXP (op1, 0);
13091
13092 /* Catch vector-by-element operations. The by-element operand can
13093 either be (vec_duplicate (vec_select (x))) or just
13094 (vec_select (x)), depending on whether we are multiplying by
13095 a vector or a scalar.
13096
13097 Canonicalization is not very good in these cases: FMA4 will put the
13098 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
13099 if (GET_CODE (op0) == VEC_DUPLICATE)
13100 op0 = XEXP (op0, 0);
13101 else if (GET_CODE (op1) == VEC_DUPLICATE)
13102 op1 = XEXP (op1, 0);
13103
13104 if (GET_CODE (op0) == VEC_SELECT)
13105 op0 = XEXP (op0, 0);
13106 else if (GET_CODE (op1) == VEC_SELECT)
13107 op1 = XEXP (op1, 0);
13108
13109 /* If the remaining parameters are not registers,
13110 get the cost to put them into registers. */
13111 *cost += rtx_cost (op0, mode, FMA, 0, speed);
13112 *cost += rtx_cost (op1, mode, FMA, 1, speed);
13113 *cost += rtx_cost (op2, mode, FMA, 2, speed);
13114 return true;
13115
13116 case FLOAT:
13117 case UNSIGNED_FLOAT:
13118 if (speed)
13119 *cost += extra_cost->fp[mode == DFmode].fromint;
13120 return false;
13121
13122 case FLOAT_EXTEND:
13123 if (speed)
13124 {
13125 if (VECTOR_MODE_P (mode))
13126 {
13127 /* Vector widening conversion. */
13128 *cost += extra_cost->vect.alu;
13129 }
13130 else
13131 *cost += extra_cost->fp[mode == DFmode].widen;
13132 }
13133 return false;
13134
13135 case FLOAT_TRUNCATE:
13136 if (speed)
13137 {
13138 if (VECTOR_MODE_P (mode))
13139 {
13140 /* Vector conversion. */
13141 *cost += extra_cost->vect.alu;
13142 }
13143 else
13144 *cost += extra_cost->fp[mode == DFmode].narrow;
13145 }
13146 return false;
13147
13148 case FIX:
13149 case UNSIGNED_FIX:
13150 x = XEXP (x, 0);
13151 /* Strip the rounding part. They will all be implemented
13152 by the fcvt* family of instructions anyway. */
13153 if (GET_CODE (x) == UNSPEC)
13154 {
13155 unsigned int uns_code = XINT (x, 1);
13156
13157 if (uns_code == UNSPEC_FRINTA
13158 || uns_code == UNSPEC_FRINTM
13159 || uns_code == UNSPEC_FRINTN
13160 || uns_code == UNSPEC_FRINTP
13161 || uns_code == UNSPEC_FRINTZ)
13162 x = XVECEXP (x, 0, 0);
13163 }
13164
13165 if (speed)
13166 {
13167 if (VECTOR_MODE_P (mode))
13168 *cost += extra_cost->vect.alu;
13169 else
13170 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13171 }
13172
13173 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13174 fixed-point fcvt. */
13175 if (GET_CODE (x) == MULT
13176 && ((VECTOR_MODE_P (mode)
13177 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13178 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13179 {
13180 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13181 0, speed);
13182 return true;
13183 }
13184
13185 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13186 return true;
13187
13188 case ABS:
13189 if (VECTOR_MODE_P (mode))
13190 {
13191 /* ABS (vector). */
13192 if (speed)
13193 *cost += extra_cost->vect.alu;
13194 }
13195 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13196 {
13197 op0 = XEXP (x, 0);
13198
13199 /* FABD, which is analogous to FADD. */
13200 if (GET_CODE (op0) == MINUS)
13201 {
13202 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13203 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13204 if (speed)
13205 *cost += extra_cost->fp[mode == DFmode].addsub;
13206
13207 return true;
13208 }
13209 /* Simple FABS is analogous to FNEG. */
13210 if (speed)
13211 *cost += extra_cost->fp[mode == DFmode].neg;
13212 }
13213 else
13214 {
13215 /* Integer ABS will either be split to
13216 two arithmetic instructions, or will be an ABS
13217 (scalar), which we don't model. */
13218 *cost = COSTS_N_INSNS (2);
13219 if (speed)
13220 *cost += 2 * extra_cost->alu.arith;
13221 }
13222 return false;
13223
13224 case SMAX:
13225 case SMIN:
13226 if (speed)
13227 {
13228 if (VECTOR_MODE_P (mode))
13229 *cost += extra_cost->vect.alu;
13230 else
13231 {
13232 /* FMAXNM/FMINNM/FMAX/FMIN.
13233 TODO: This may not be accurate for all implementations, but
13234 we do not model this in the cost tables. */
13235 *cost += extra_cost->fp[mode == DFmode].addsub;
13236 }
13237 }
13238 return false;
13239
13240 case UNSPEC:
13241 /* The floating point round to integer frint* instructions. */
13242 if (aarch64_frint_unspec_p (XINT (x, 1)))
13243 {
13244 if (speed)
13245 *cost += extra_cost->fp[mode == DFmode].roundint;
13246
13247 return false;
13248 }
13249
13250 if (XINT (x, 1) == UNSPEC_RBIT)
13251 {
13252 if (speed)
13253 *cost += extra_cost->alu.rev;
13254
13255 return false;
13256 }
13257 break;
13258
13259 case TRUNCATE:
13260
13261 /* Decompose <su>muldi3_highpart. */
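/* The overall RTL shape being matched is:

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI))
                  (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))

   where both extensions must be ZERO_EXTEND or both SIGN_EXTEND.  */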
13262 if (/* (truncate:DI */
13263 mode == DImode
13264 /* (lshiftrt:TI */
13265 && GET_MODE (XEXP (x, 0)) == TImode
13266 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13267 /* (mult:TI */
13268 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13269 /* (ANY_EXTEND:TI (reg:DI))
13270 (ANY_EXTEND:TI (reg:DI))) */
13271 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13272 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13273 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13274 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13275 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13276 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13277 /* (const_int 64) */
13278 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13279 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13280 {
13281 /* UMULH/SMULH. */
13282 if (speed)
13283 *cost += extra_cost->mult[mode == DImode].extend;
13284 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13285 mode, MULT, 0, speed);
13286 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13287 mode, MULT, 1, speed);
13288 return true;
13289 }
13290
13291 /* Fall through. */
13292 default:
13293 break;
13294 }
13295
13296 if (dump_file
13297 && flag_aarch64_verbose_cost)
13298 fprintf (dump_file,
13299 "\nFailed to cost RTX. Assuming default cost.\n");
13300
13301 return true;
13302 }
13303
13304 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
13305 calculated for X. This cost is stored in *COST. Returns true
13306 if the total cost of X was calculated. */
13307 static bool
13308 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13309 int param, int *cost, bool speed)
13310 {
13311 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13312
13313 if (dump_file
13314 && flag_aarch64_verbose_cost)
13315 {
13316 print_rtl_single (dump_file, x);
13317 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13318 speed ? "Hot" : "Cold",
13319 *cost, result ? "final" : "partial");
13320 }
13321
13322 return result;
13323 }
13324
13325 static int
13326 aarch64_register_move_cost (machine_mode mode,
13327 reg_class_t from_i, reg_class_t to_i)
13328 {
13329 enum reg_class from = (enum reg_class) from_i;
13330 enum reg_class to = (enum reg_class) to_i;
13331 const struct cpu_regmove_cost *regmove_cost
13332 = aarch64_tune_params.regmove_cost;
13333
13334 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13335 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13336 || to == STUB_REGS)
13337 to = GENERAL_REGS;
13338
13339 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13340 || from == STUB_REGS)
13341 from = GENERAL_REGS;
13342
13343 /* Make RDFFR very expensive. In particular, if we know that the FFR
13344 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13345 as a way of obtaining a PTRUE. */
13346 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13347 && hard_reg_set_subset_p (reg_class_contents[from_i],
13348 reg_class_contents[FFR_REGS]))
13349 return 80;
13350
13351 /* The cost of moving between GPRs and the stack register is the same as GP2GP. */
13352 if ((from == GENERAL_REGS && to == STACK_REG)
13353 || (to == GENERAL_REGS && from == STACK_REG))
13354 return regmove_cost->GP2GP;
13355
13356 /* To/From the stack register, we move via the gprs. */
13357 if (to == STACK_REG || from == STACK_REG)
13358 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13359 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13360
13361 if (known_eq (GET_MODE_SIZE (mode), 16))
13362 {
13363 /* 128-bit operations on general registers require 2 instructions. */
13364 if (from == GENERAL_REGS && to == GENERAL_REGS)
13365 return regmove_cost->GP2GP * 2;
13366 else if (from == GENERAL_REGS)
13367 return regmove_cost->GP2FP * 2;
13368 else if (to == GENERAL_REGS)
13369 return regmove_cost->FP2GP * 2;
13370
13371 /* When AdvSIMD instructions are disabled it is not possible to move
13372 a 128-bit value directly between Q registers. This is handled in
13373 secondary reload. A general register is used as a scratch to move
13374 the upper DI value and the lower DI value is moved directly,
13375 hence the cost is the sum of three moves. */
13376 if (! TARGET_SIMD)
13377 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13378
13379 return regmove_cost->FP2FP;
13380 }
13381
13382 if (from == GENERAL_REGS && to == GENERAL_REGS)
13383 return regmove_cost->GP2GP;
13384 else if (from == GENERAL_REGS)
13385 return regmove_cost->GP2FP;
13386 else if (to == GENERAL_REGS)
13387 return regmove_cost->FP2GP;
13388
13389 return regmove_cost->FP2FP;
13390 }
13391
13392 static int
13393 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13394 reg_class_t rclass ATTRIBUTE_UNUSED,
13395 bool in ATTRIBUTE_UNUSED)
13396 {
13397 return aarch64_tune_params.memmov_cost;
13398 }
13399
13400 /* Implement TARGET_INIT_BUILTINS. */
13401 static void
13402 aarch64_init_builtins ()
13403 {
13404 aarch64_general_init_builtins ();
13405 aarch64_sve::init_builtins ();
13406 }
13407
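/* The built-in hooks below all decode DECL_MD_FUNCTION_CODE in the same
   way: "code & AARCH64_BUILTIN_CLASS" selects the class of built-in
   (general or SVE) and "code >> AARCH64_BUILTIN_SHIFT" recovers the
   class-specific subcode that is passed on to the corresponding
   aarch64_general_* or aarch64_sve:: handler.  */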
13408 /* Implement TARGET_FOLD_BUILTIN. */
13409 static tree
13410 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13411 {
13412 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13413 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13414 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13415 switch (code & AARCH64_BUILTIN_CLASS)
13416 {
13417 case AARCH64_BUILTIN_GENERAL:
13418 return aarch64_general_fold_builtin (subcode, type, nargs, args);
13419
13420 case AARCH64_BUILTIN_SVE:
13421 return NULL_TREE;
13422 }
13423 gcc_unreachable ();
13424 }
13425
13426 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13427 static bool
13428 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13429 {
13430 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13431 tree fndecl = gimple_call_fndecl (stmt);
13432 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13433 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13434 gimple *new_stmt = NULL;
13435 switch (code & AARCH64_BUILTIN_CLASS)
13436 {
13437 case AARCH64_BUILTIN_GENERAL:
13438 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13439 break;
13440
13441 case AARCH64_BUILTIN_SVE:
13442 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13443 break;
13444 }
13445
13446 if (!new_stmt)
13447 return false;
13448
13449 gsi_replace (gsi, new_stmt, true);
13450 return true;
13451 }
13452
13453 /* Implement TARGET_EXPAND_BUILTIN. */
13454 static rtx
13455 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13456 {
13457 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13458 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13459 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13460 switch (code & AARCH64_BUILTIN_CLASS)
13461 {
13462 case AARCH64_BUILTIN_GENERAL:
13463 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13464
13465 case AARCH64_BUILTIN_SVE:
13466 return aarch64_sve::expand_builtin (subcode, exp, target);
13467 }
13468 gcc_unreachable ();
13469 }
13470
13471 /* Implement TARGET_BUILTIN_DECL. */
13472 static tree
13473 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13474 {
13475 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13476 switch (code & AARCH64_BUILTIN_CLASS)
13477 {
13478 case AARCH64_BUILTIN_GENERAL:
13479 return aarch64_general_builtin_decl (subcode, initialize_p);
13480
13481 case AARCH64_BUILTIN_SVE:
13482 return aarch64_sve::builtin_decl (subcode, initialize_p);
13483 }
13484 gcc_unreachable ();
13485 }
13486
13487 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13488 to optimize 1.0/sqrt. */
13489
13490 static bool
13491 use_rsqrt_p (machine_mode mode)
13492 {
13493 return (!flag_trapping_math
13494 && flag_unsafe_math_optimizations
13495 && ((aarch64_tune_params.approx_modes->recip_sqrt
13496 & AARCH64_APPROX_MODE (mode))
13497 || flag_mrecip_low_precision_sqrt));
13498 }
13499
13500 /* Function to decide when to use the approximate reciprocal square root
13501 builtin. */
13502
13503 static tree
13504 aarch64_builtin_reciprocal (tree fndecl)
13505 {
13506 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13507
13508 if (!use_rsqrt_p (mode))
13509 return NULL_TREE;
13510 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13511 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13512 switch (code & AARCH64_BUILTIN_CLASS)
13513 {
13514 case AARCH64_BUILTIN_GENERAL:
13515 return aarch64_general_builtin_rsqrt (subcode);
13516
13517 case AARCH64_BUILTIN_SVE:
13518 return NULL_TREE;
13519 }
13520 gcc_unreachable ();
13521 }
13522
13523 /* Emit code to perform the floating-point operation:
13524
13525 DST = SRC1 * SRC2
13526
13527 where all three operands are already known to be registers.
13528 If the operation is an SVE one, PTRUE is a suitable all-true
13529 predicate. */
13530
13531 static void
13532 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13533 {
13534 if (ptrue)
13535 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13536 dst, ptrue, src1, src2,
13537 gen_int_mode (SVE_RELAXED_GP, SImode)));
13538 else
13539 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13540 }
13541
13542 /* Emit instruction sequence to compute either the approximate square root
13543 or its approximate reciprocal, depending on the flag RECP, and return
13544 whether the sequence was emitted or not. */
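/* In outline, the code below performs a Newton-Raphson iteration for the
   reciprocal square root: FRSQRTE supplies an initial estimate x of
   1/sqrt(d), and each FRSQRTS step computes (3 - d * x * x) / 2, giving
   the recurrence x' = x * (3 - d * x * x) / 2.  For the non-reciprocal
   case the refined estimate is finally multiplied by the source, since
   sqrt(d) = d * (1/sqrt(d)).  */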
13545
13546 bool
13547 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13548 {
13549 machine_mode mode = GET_MODE (dst);
13550
13551 if (GET_MODE_INNER (mode) == HFmode)
13552 {
13553 gcc_assert (!recp);
13554 return false;
13555 }
13556
13557 if (!recp)
13558 {
13559 if (!(flag_mlow_precision_sqrt
13560 || (aarch64_tune_params.approx_modes->sqrt
13561 & AARCH64_APPROX_MODE (mode))))
13562 return false;
13563
13564 if (!flag_finite_math_only
13565 || flag_trapping_math
13566 || !flag_unsafe_math_optimizations
13567 || optimize_function_for_size_p (cfun))
13568 return false;
13569 }
13570 else
13571 /* Caller assumes we cannot fail. */
13572 gcc_assert (use_rsqrt_p (mode));
13573
13574 rtx pg = NULL_RTX;
13575 if (aarch64_sve_mode_p (mode))
13576 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13577 machine_mode mmsk = (VECTOR_MODE_P (mode)
13578 ? related_int_vector_mode (mode).require ()
13579 : int_mode_for_mode (mode).require ());
13580 rtx xmsk = NULL_RTX;
13581 if (!recp)
13582 {
13583 /* When calculating the approximate square root, compare the
13584 argument with 0.0 and create a mask. */
13585 rtx zero = CONST0_RTX (mode);
13586 if (pg)
13587 {
13588 xmsk = gen_reg_rtx (GET_MODE (pg));
13589 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13590 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13591 xmsk, pg, hint, src, zero));
13592 }
13593 else
13594 {
13595 xmsk = gen_reg_rtx (mmsk);
13596 emit_insn (gen_rtx_SET (xmsk,
13597 gen_rtx_NEG (mmsk,
13598 gen_rtx_EQ (mmsk, src, zero))));
13599 }
13600 }
13601
13602 /* Estimate the approximate reciprocal square root. */
13603 rtx xdst = gen_reg_rtx (mode);
13604 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13605
13606 /* Iterate over the series twice for SF and thrice for DF. */
13607 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13608
13609 /* Optionally perform one fewer iteration of the series, trading some
13610 accuracy for faster performance. */
13611 if ((recp && flag_mrecip_low_precision_sqrt)
13612 || (!recp && flag_mlow_precision_sqrt))
13613 iterations--;
13614
13615 /* Iterate over the series to calculate the approximate reciprocal square
13616 root. */
13617 rtx x1 = gen_reg_rtx (mode);
13618 while (iterations--)
13619 {
13620 rtx x2 = gen_reg_rtx (mode);
13621 aarch64_emit_mult (x2, pg, xdst, xdst);
13622
13623 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13624
13625 if (iterations > 0)
13626 aarch64_emit_mult (xdst, pg, xdst, x1);
13627 }
13628
13629 if (!recp)
13630 {
13631 if (pg)
13632 /* Multiply nonzero source values by the corresponding intermediate
13633 result elements, so that the final calculation is the approximate
13634 square root rather than its reciprocal. Select a zero result for
13635 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13636 otherwise. */
13637 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13638 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13639 else
13640 {
13641 /* Qualify the approximate reciprocal square root when the
13642 argument is 0.0 by squashing the intermediate result to 0.0. */
13643 rtx xtmp = gen_reg_rtx (mmsk);
13644 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13645 gen_rtx_SUBREG (mmsk, xdst, 0)));
13646 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13647
13648 /* Calculate the approximate square root. */
13649 aarch64_emit_mult (xdst, pg, xdst, src);
13650 }
13651 }
13652
13653 /* Finalize the approximation. */
13654 aarch64_emit_mult (dst, pg, xdst, x1);
13655
13656 return true;
13657 }
13658
13659 /* Emit the instruction sequence to compute the approximation for the division
13660 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
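/* In outline, the code below performs a Newton-Raphson iteration for the
   reciprocal: FRECPE supplies an initial estimate x of 1/DEN, and each
   FRECPS step computes 2 - DEN * x, giving the recurrence
   x' = x * (2 - DEN * x).  The refined reciprocal is then multiplied by
   NUM (unless NUM is 1.0) to form the quotient.  */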
13661
13662 bool
13663 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13664 {
13665 machine_mode mode = GET_MODE (quo);
13666
13667 if (GET_MODE_INNER (mode) == HFmode)
13668 return false;
13669
13670 bool use_approx_division_p = (flag_mlow_precision_div
13671 || (aarch64_tune_params.approx_modes->division
13672 & AARCH64_APPROX_MODE (mode)));
13673
13674 if (!flag_finite_math_only
13675 || flag_trapping_math
13676 || !flag_unsafe_math_optimizations
13677 || optimize_function_for_size_p (cfun)
13678 || !use_approx_division_p)
13679 return false;
13680
13681 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13682 return false;
13683
13684 rtx pg = NULL_RTX;
13685 if (aarch64_sve_mode_p (mode))
13686 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13687
13688 /* Estimate the approximate reciprocal. */
13689 rtx xrcp = gen_reg_rtx (mode);
13690 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13691
13692 /* Iterate over the series twice for SF and thrice for DF. */
13693 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13694
13695 /* Optionally perform fewer iterations of the series, trading some
13696 accuracy for faster performance. The default is 2 for DF and 1 for SF. */
13697 if (flag_mlow_precision_div)
13698 iterations = (GET_MODE_INNER (mode) == DFmode
13699 ? aarch64_double_recp_precision
13700 : aarch64_float_recp_precision);
13701
13702 /* Iterate over the series to calculate the approximate reciprocal. */
13703 rtx xtmp = gen_reg_rtx (mode);
13704 while (iterations--)
13705 {
13706 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13707
13708 if (iterations > 0)
13709 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13710 }
13711
13712 if (num != CONST1_RTX (mode))
13713 {
13714 /* As the approximate reciprocal of DEN is already calculated, only
13715 calculate the approximate division when NUM is not 1.0. */
13716 rtx xnum = force_reg (mode, num);
13717 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13718 }
13719
13720 /* Finalize the approximation. */
13721 aarch64_emit_mult (quo, pg, xrcp, xtmp);
13722 return true;
13723 }
13724
13725 /* Return the number of instructions that can be issued per cycle. */
13726 static int
13727 aarch64_sched_issue_rate (void)
13728 {
13729 return aarch64_tune_params.issue_rate;
13730 }
13731
13732 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13733 static int
13734 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13735 {
13736 if (DEBUG_INSN_P (insn))
13737 return more;
13738
13739 rtx_code code = GET_CODE (PATTERN (insn));
13740 if (code == USE || code == CLOBBER)
13741 return more;
13742
13743 if (get_attr_type (insn) == TYPE_NO_INSN)
13744 return more;
13745
13746 return more - 1;
13747 }
13748
13749 static int
13750 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13751 {
13752 int issue_rate = aarch64_sched_issue_rate ();
13753
13754 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13755 }
13756
13757
13758 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13759 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13760 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13761
13762 static int
13763 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13764 int ready_index)
13765 {
13766 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13767 }
13768
13769
13770 /* Vectorizer cost model target hooks. */
13771
13772 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13773 static int
13774 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13775 tree vectype,
13776 int misalign ATTRIBUTE_UNUSED)
13777 {
13778 unsigned elements;
13779 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13780 bool fp = false;
13781
13782 if (vectype != NULL)
13783 fp = FLOAT_TYPE_P (vectype);
13784
13785 const simd_vec_cost *simd_costs;
13786 if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
13787 && costs->sve != NULL)
13788 simd_costs = costs->sve;
13789 else
13790 simd_costs = costs->advsimd;
13791
13792 switch (type_of_cost)
13793 {
13794 case scalar_stmt:
13795 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13796
13797 case scalar_load:
13798 return costs->scalar_load_cost;
13799
13800 case scalar_store:
13801 return costs->scalar_store_cost;
13802
13803 case vector_stmt:
13804 return fp ? simd_costs->fp_stmt_cost
13805 : simd_costs->int_stmt_cost;
13806
13807 case vector_load:
13808 return simd_costs->align_load_cost;
13809
13810 case vector_store:
13811 return simd_costs->store_cost;
13812
13813 case vec_to_scalar:
13814 return simd_costs->vec_to_scalar_cost;
13815
13816 case scalar_to_vec:
13817 return simd_costs->scalar_to_vec_cost;
13818
13819 case unaligned_load:
13820 case vector_gather_load:
13821 return simd_costs->unalign_load_cost;
13822
13823 case unaligned_store:
13824 case vector_scatter_store:
13825 return simd_costs->unalign_store_cost;
13826
13827 case cond_branch_taken:
13828 return costs->cond_taken_branch_cost;
13829
13830 case cond_branch_not_taken:
13831 return costs->cond_not_taken_branch_cost;
13832
13833 case vec_perm:
13834 return simd_costs->permute_cost;
13835
13836 case vec_promote_demote:
13837 return fp ? simd_costs->fp_stmt_cost
13838 : simd_costs->int_stmt_cost;
13839
13840 case vec_construct:
13841 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
13842 return elements / 2 + 1;
13843
13844 default:
13845 gcc_unreachable ();
13846 }
13847 }
13848
13849 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13850 vectors would produce a series of LDP or STP operations. KIND is the
13851 kind of statement that STMT_INFO represents. */
13852 static bool
13853 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13854 stmt_vec_info stmt_info)
13855 {
13856 switch (kind)
13857 {
13858 case vector_load:
13859 case vector_store:
13860 case unaligned_load:
13861 case unaligned_store:
13862 break;
13863
13864 default:
13865 return false;
13866 }
13867
13868 if (aarch64_tune_params.extra_tuning_flags
13869 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13870 return false;
13871
13872 return is_gimple_assign (stmt_info->stmt);
13873 }
13874
13875 /* Return true if STMT_INFO extends the result of a load. */
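/* For example (illustrative gimple):

     _1 = *short_ptr_2;
     x_3 = (int) _1;

   Here the conversion statement extends the result of a load, so callers
   such as aarch64_sve_adjust_stmt_cost below can assume that the
   extension folds into an extending load form.  */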
13876 static bool
13877 aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
13878 {
13879 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13880 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13881 return false;
13882
13883 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13884 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13885 tree rhs_type = TREE_TYPE (rhs);
13886 if (!INTEGRAL_TYPE_P (lhs_type)
13887 || !INTEGRAL_TYPE_P (rhs_type)
13888 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13889 return false;
13890
13891 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
13892 return (def_stmt_info
13893 && STMT_VINFO_DATA_REF (def_stmt_info)
13894 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13895 }
13896
13897 /* Return true if STMT_INFO is an integer truncation. */
13898 static bool
13899 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13900 {
13901 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13902 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13903 return false;
13904
13905 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13906 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13907 return (INTEGRAL_TYPE_P (lhs_type)
13908 && INTEGRAL_TYPE_P (rhs_type)
13909 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13910 }
13911
13912 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13913 for STMT_INFO, which has cost kind KIND and which when vectorized would
13914 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
13915 targets. */
13916 static unsigned int
13917 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
13918 stmt_vec_info stmt_info, tree vectype,
13919 unsigned int stmt_cost)
13920 {
13921 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13922 vector register size or number of units. Integer promotions of this
13923 type therefore map to SXT[BHW] or UXT[BHW].
13924
13925 Most loads have extending forms that can do the sign or zero extension
13926 on the fly. Optimistically assume that a load followed by an extension
13927 will fold to this form during combine, and that the extension therefore
13928 comes for free. */
13929 if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
13930 stmt_cost = 0;
13931
13932 /* For similar reasons, vector_stmt integer truncations are no-ops,
13933 because we can just ignore the unused upper bits of the source. */
13934 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13935 stmt_cost = 0;
13936
13937 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13938 but there are no equivalent instructions for SVE. This means that
13939 (all other things being equal) 128-bit SVE needs twice as many load
13940 and store instructions as Advanced SIMD in order to process vector pairs.
13941
13942 Also, scalar code can often use LDP and STP to access pairs of values,
13943 so it is too simplistic to say that one SVE load or store replaces
13944 VF scalar loads and stores.
13945
13946 Ideally we would account for this in the scalar and Advanced SIMD
13947 costs by making suitable load/store pairs as cheap as a single
13948 load/store. However, that would be a very invasive change and in
13949 practice it tends to stress other parts of the cost model too much.
13950 E.g. stores of scalar constants currently count just a store,
13951 whereas stores of vector constants count a store and a vec_init.
13952 This is an artificial distinction for AArch64, where stores of
13953 nonzero scalar constants need the same kind of register invariant
13954 as vector stores.
13955
13956 An alternative would be to double the cost of any SVE loads and stores
13957 that could be paired in Advanced SIMD (and possibly also paired in
13958 scalar code). But this tends to stress other parts of the cost model
13959 in the same way. It also means that we can fall back to Advanced SIMD
13960 even if full-loop predication would have been useful.
13961
13962 Here we go for a more conservative version: double the costs of SVE
13963 loads and stores if one iteration of the scalar loop processes enough
13964 elements for it to use a whole number of Advanced SIMD LDP or STP
13965 instructions. This makes it very likely that the VF would be 1 for
13966 Advanced SIMD, and so no epilogue should be needed. */
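/* As a worked example (sizes illustrative): a grouped access of four
   DImode elements per scalar iteration gives count * elt_bits
   == 4 * 64 == 256 bits, a whole number of 128-bit Advanced SIMD
   register pairs, so, provided Q-register LDP/STP is not disabled by
   the tuning flags, the SVE load/store cost is doubled.  */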
13967 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13968 {
13969 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13970 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13971 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13972 if (multiple_p (count * elt_bits, 256)
13973 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13974 stmt_cost *= 2;
13975 }
13976
13977 return stmt_cost;
13978 }
13979
13980 /* Implement targetm.vectorize.add_stmt_cost. */
13981 static unsigned
13982 aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
13983 enum vect_cost_for_stmt kind,
13984 struct _stmt_vec_info *stmt_info, tree vectype,
13985 int misalign, enum vect_cost_model_location where)
13986 {
13987 unsigned *cost = (unsigned *) data;
13988 unsigned retval = 0;
13989
13990 if (flag_vect_cost_model)
13991 {
13992 int stmt_cost =
13993 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13994
13995 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13996 stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
13997 vectype, stmt_cost);
13998
13999 /* Statements in an inner loop relative to the loop being
14000 vectorized are weighted more heavily. The value here is
14001 arbitrary and could potentially be improved with analysis. */
14002 if (where == vect_body && stmt_info
14003 && stmt_in_inner_loop_p (vinfo, stmt_info))
14004 count *= 50; /* FIXME */
14005
14006 retval = (unsigned) (count * stmt_cost);
14007 cost[where] += retval;
14008 }
14009
14010 return retval;
14011 }
14012
14013 static void initialize_aarch64_code_model (struct gcc_options *);
14014
14015 /* Parse the TO_PARSE string and put the architecture struct that it
14016 selects into RES and the architectural features into ISA_FLAGS.
14017 Return an aarch64_parse_opt_result describing the parse result.
14018 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
14019 When the TO_PARSE string contains an invalid extension,
14020 a copy of the string is created and stored in INVALID_EXTENSION. */
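/* Illustrative example: for a TO_PARSE of "armv8.2-a+sve" the architecture
part is "armv8.2-a" and the remaining "+sve" is handed on to
aarch64_parse_extension. */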
14021
14022 static enum aarch64_parse_opt_result
14023 aarch64_parse_arch (const char *to_parse, const struct processor **res,
14024 uint64_t *isa_flags, std::string *invalid_extension)
14025 {
14026 const char *ext;
14027 const struct processor *arch;
14028 size_t len;
14029
14030 ext = strchr (to_parse, '+');
14031
14032 if (ext != NULL)
14033 len = ext - to_parse;
14034 else
14035 len = strlen (to_parse);
14036
14037 if (len == 0)
14038 return AARCH64_PARSE_MISSING_ARG;
14039
14040
14041 /* Loop through the list of supported ARCHes to find a match. */
14042 for (arch = all_architectures; arch->name != NULL; arch++)
14043 {
14044 if (strlen (arch->name) == len
14045 && strncmp (arch->name, to_parse, len) == 0)
14046 {
14047 uint64_t isa_temp = arch->flags;
14048
14049 if (ext != NULL)
14050 {
14051 /* TO_PARSE string contains at least one extension. */
14052 enum aarch64_parse_opt_result ext_res
14053 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
14054
14055 if (ext_res != AARCH64_PARSE_OK)
14056 return ext_res;
14057 }
14058 /* Extension parsing was successful. Confirm the result
14059 arch and ISA flags. */
14060 *res = arch;
14061 *isa_flags = isa_temp;
14062 return AARCH64_PARSE_OK;
14063 }
14064 }
14065
14066 /* ARCH name not found in list. */
14067 return AARCH64_PARSE_INVALID_ARG;
14068 }
14069
14070 /* Parse the TO_PARSE string and put the result tuning in RES and the
14071 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
14072 describing the parse result. If there is an error parsing, RES and
14073 ISA_FLAGS are left unchanged.
14074 When the TO_PARSE string contains an invalid extension,
14075 a copy of the string is created and stored in INVALID_EXTENSION. */
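/* Illustrative example: a TO_PARSE of "cortex-a57+nofp" names the Cortex-A57
core with the floating-point extension disabled. */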
14076
14077 static enum aarch64_parse_opt_result
14078 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
14079 uint64_t *isa_flags, std::string *invalid_extension)
14080 {
14081 const char *ext;
14082 const struct processor *cpu;
14083 size_t len;
14084
14085 ext = strchr (to_parse, '+');
14086
14087 if (ext != NULL)
14088 len = ext - to_parse;
14089 else
14090 len = strlen (to_parse);
14091
14092 if (len == 0)
14093 return AARCH64_PARSE_MISSING_ARG;
14094
14095
14096 /* Loop through the list of supported CPUs to find a match. */
14097 for (cpu = all_cores; cpu->name != NULL; cpu++)
14098 {
14099 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
14100 {
14101 uint64_t isa_temp = cpu->flags;
14102
14103
14104 if (ext != NULL)
14105 {
14106 /* TO_PARSE string contains at least one extension. */
14107 enum aarch64_parse_opt_result ext_res
14108 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
14109
14110 if (ext_res != AARCH64_PARSE_OK)
14111 return ext_res;
14112 }
14113 /* Extension parsing was successful. Confirm the result
14114 cpu and ISA flags. */
14115 *res = cpu;
14116 *isa_flags = isa_temp;
14117 return AARCH64_PARSE_OK;
14118 }
14119 }
14120
14121 /* CPU name not found in list. */
14122 return AARCH64_PARSE_INVALID_ARG;
14123 }
14124
14125 /* Parse the TO_PARSE string and put the cpu it selects into RES.
14126 Return an aarch64_parse_opt_result describing the parse result.
14127 If parsing fails, RES is left unchanged. */
14128
14129 static enum aarch64_parse_opt_result
14130 aarch64_parse_tune (const char *to_parse, const struct processor **res)
14131 {
14132 const struct processor *cpu;
14133
14134 /* Loop through the list of supported CPUs to find a match. */
14135 for (cpu = all_cores; cpu->name != NULL; cpu++)
14136 {
14137 if (strcmp (cpu->name, to_parse) == 0)
14138 {
14139 *res = cpu;
14140 return AARCH64_PARSE_OK;
14141 }
14142 }
14143
14144 /* CPU name not found in list. */
14145 return AARCH64_PARSE_INVALID_ARG;
14146 }
14147
14148 /* Parse TOKEN, which has length LENGTH, to see if it is an option
14149 described in FLAG. If it is, return the index bit for that fusion type.
14150 If not, error (printing OPTION_NAME) and return zero. */
14151
14152 static unsigned int
14153 aarch64_parse_one_option_token (const char *token,
14154 size_t length,
14155 const struct aarch64_flag_desc *flag,
14156 const char *option_name)
14157 {
14158 for (; flag->name != NULL; flag++)
14159 {
14160 if (length == strlen (flag->name)
14161 && !strncmp (flag->name, token, length))
14162 return flag->flag;
14163 }
14164
14165 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
14166 return 0;
14167 }
14168
14169 /* Parse OPTION which is a comma-separated list of flags to enable.
14170 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14171 default state we inherit from the CPU tuning structures. OPTION_NAME
14172 gives the top-level option we are parsing in the -moverride string,
14173 for use in error messages. */
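/* The OPTION string is a '.'-separated list of flag names, for example
"adrp+add.cmp+branch" (illustrative; the valid names come from the FLAGS
table, e.g. aarch64_fusible_pairs or aarch64_tuning_flags). */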
14174
14175 static unsigned int
14176 aarch64_parse_boolean_options (const char *option,
14177 const struct aarch64_flag_desc *flags,
14178 unsigned int initial_state,
14179 const char *option_name)
14180 {
14181 const char separator = '.';
14182 const char* specs = option;
14183 const char* ntoken = option;
14184 unsigned int found_flags = initial_state;
14185
14186 while ((ntoken = strchr (specs, separator)))
14187 {
14188 size_t token_length = ntoken - specs;
14189 unsigned token_ops = aarch64_parse_one_option_token (specs,
14190 token_length,
14191 flags,
14192 option_name);
14193 /* If we find "none" (or, for simplicity's sake, an error) anywhere
14194 in the token stream, reset the supported operations. So:
14195
14196 adrp+add.cmp+branch.none.adrp+add
14197
14198 would have the result of turning on only adrp+add fusion. */
14199 if (!token_ops)
14200 found_flags = 0;
14201
14202 found_flags |= token_ops;
14203 specs = ++ntoken;
14204 }
14205
14206 /* If the string ended with the separator, it is ill-formed. */
14207 if (!(*specs))
14208 {
14209 error ("%s string ill-formed", option_name);
14210 return 0;
14211 }
14212
14213 /* We still have one more token to parse. */
14214 size_t token_length = strlen (specs);
14215 unsigned token_ops = aarch64_parse_one_option_token (specs,
14216 token_length,
14217 flags,
14218 option_name);
14219 if (!token_ops)
14220 found_flags = 0;
14221
14222 found_flags |= token_ops;
14223 return found_flags;
14224 }
14225
14226 /* Support for overriding instruction fusion. */
14227
14228 static void
14229 aarch64_parse_fuse_string (const char *fuse_string,
14230 struct tune_params *tune)
14231 {
14232 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14233 aarch64_fusible_pairs,
14234 tune->fusible_ops,
14235 "fuse=");
14236 }
14237
14238 /* Support for overriding other tuning flags. */
14239
14240 static void
14241 aarch64_parse_tune_string (const char *tune_string,
14242 struct tune_params *tune)
14243 {
14244 tune->extra_tuning_flags
14245 = aarch64_parse_boolean_options (tune_string,
14246 aarch64_tuning_flags,
14247 tune->extra_tuning_flags,
14248 "tune=");
14249 }
14250
14251 /* Parse the sve_width tuning moverride string in TUNE_STRING.
14252 Accept the valid SVE vector widths allowed by
14253 aarch64_sve_vector_bits_enum and use it to override sve_width
14254 in TUNE. */
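/* For example, -moverride=sve_width=256 sets TUNE->sve_width to SVE_256
(an illustrative value; any of 128, 256, 512, 1024 or 2048 is accepted). */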
14255
14256 static void
14257 aarch64_parse_sve_width_string (const char *tune_string,
14258 struct tune_params *tune)
14259 {
14260 int width = -1;
14261
14262 int n = sscanf (tune_string, "%d", &width);
14263 if (n == EOF)
14264 {
14265 error ("invalid format for sve_width");
14266 return;
14267 }
14268 switch (width)
14269 {
14270 case SVE_128:
14271 case SVE_256:
14272 case SVE_512:
14273 case SVE_1024:
14274 case SVE_2048:
14275 break;
14276 default:
14277 error ("invalid sve_width value: %d", width);
14278 }
14279 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14280 }
14281
14282 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
14283 we understand. If it is, extract the option string and hand it off to
14284 the appropriate function. */
14285
14286 void
14287 aarch64_parse_one_override_token (const char* token,
14288 size_t length,
14289 struct tune_params *tune)
14290 {
14291 const struct aarch64_tuning_override_function *fn
14292 = aarch64_tuning_override_functions;
14293
14294 const char *option_part = strchr (token, '=');
14295 if (!option_part)
14296 {
14297 error ("tuning string missing in option (%s)", token);
14298 return;
14299 }
14300
14301 /* Get the length of the option name. */
14302 length = option_part - token;
14303 /* Skip the '=' to get to the option string. */
14304 option_part++;
14305
14306 for (; fn->name != NULL; fn++)
14307 {
14308 if (!strncmp (fn->name, token, length))
14309 {
14310 fn->parse_override (option_part, tune);
14311 return;
14312 }
14313 }
14314
14315 error ("unknown tuning option (%s)", token);
14316 return;
14317 }
14318
14319 /* Validate and clamp the TLS size according to the code model in OPTS. */
14320
14321 static void
14322 initialize_aarch64_tls_size (struct gcc_options *opts)
14323 {
14324 if (aarch64_tls_size == 0)
14325 aarch64_tls_size = 24;
14326
14327 switch (opts->x_aarch64_cmodel_var)
14328 {
14329 case AARCH64_CMODEL_TINY:
14330 /* The default and the maximum TLS size allowed under tiny are both 1M, which
14331 needs two instructions to address, so we clamp the size to 24 bits. */
14332 if (aarch64_tls_size > 24)
14333 aarch64_tls_size = 24;
14334 break;
14335 case AARCH64_CMODEL_SMALL:
14336 /* The maximum TLS size allowed under small is 4G. */
14337 if (aarch64_tls_size > 32)
14338 aarch64_tls_size = 32;
14339 break;
14340 case AARCH64_CMODEL_LARGE:
14341 /* The maximum TLS size allowed under large is 16E.
14342 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
14343 if (aarch64_tls_size > 48)
14344 aarch64_tls_size = 48;
14345 break;
14346 default:
14347 gcc_unreachable ();
14348 }
14349
14350 return;
14351 }
14352
14353 /* Parse STRING looking for options in the format:
14354 string :: option:string
14355 option :: name=substring
14356 name :: {a-z}
14357 substring :: defined by option. */
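/* For example, "fuse=adrp+add:sve_width=256" contains two options separated
by ':', each of which is handed to its own parser (the values shown are
purely illustrative). */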
14358
14359 static void
14360 aarch64_parse_override_string (const char* input_string,
14361 struct tune_params* tune)
14362 {
14363 const char separator = ':';
14364 size_t string_length = strlen (input_string) + 1;
14365 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14366 char *string = string_root;
14367 strncpy (string, input_string, string_length);
14368 string[string_length - 1] = '\0';
14369
14370 char* ntoken = string;
14371
14372 while ((ntoken = strchr (string, separator)))
14373 {
14374 size_t token_length = ntoken - string;
14375 /* Make this substring look like a string. */
14376 *ntoken = '\0';
14377 aarch64_parse_one_override_token (string, token_length, tune);
14378 string = ++ntoken;
14379 }
14380
14381 /* One last option to parse. */
14382 aarch64_parse_one_override_token (string, strlen (string), tune);
14383 free (string_root);
14384 }
14385
14386
14387 static void
14388 aarch64_override_options_after_change_1 (struct gcc_options *opts)
14389 {
14390 if (accepted_branch_protection_string)
14391 {
14392 opts->x_aarch64_branch_protection_string
14393 = xstrdup (accepted_branch_protection_string);
14394 }
14395
14396 /* PR 70044: We have to be careful about being called multiple times for the
14397 same function. This means all changes should be repeatable. */
14398
14399 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14400 Disable the frame pointer flag so the mid-end will not use a frame
14401 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14402 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14403 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14404 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14405 if (opts->x_flag_omit_frame_pointer == 0)
14406 opts->x_flag_omit_frame_pointer = 2;
14407
14408 /* If not optimizing for size, set the default
14409 alignment to what the target wants. */
14410 if (!opts->x_optimize_size)
14411 {
14412 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14413 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14414 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14415 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14416 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14417 opts->x_str_align_functions = aarch64_tune_params.function_align;
14418 }
14419
14420 /* We default to no pc-relative literal loads. */
14421
14422 aarch64_pcrelative_literal_loads = false;
14423
14424 /* If -mpc-relative-literal-loads is set on the command line, this
14425 implies that the user asked for PC relative literal loads. */
14426 if (opts->x_pcrelative_literal_loads == 1)
14427 aarch64_pcrelative_literal_loads = true;
14428
14429 /* In the tiny memory model it makes no sense to disallow PC relative
14430 literal pool loads. */
14431 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14432 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14433 aarch64_pcrelative_literal_loads = true;
14434
14435 /* When enabling the lower precision Newton series for the square root, also
14436 enable it for the reciprocal square root, since the latter is an
14437 intermediary step for the former. */
14438 if (flag_mlow_precision_sqrt)
14439 flag_mrecip_low_precision_sqrt = true;
14440 }
14441
14442 /* 'Unpack' the internal tuning structs and update the options
14443 in OPTS. The caller must have set up selected_tune and selected_arch
14444 as all the other target-specific codegen decisions are
14445 derived from them. */
14446
14447 void
14448 aarch64_override_options_internal (struct gcc_options *opts)
14449 {
14450 aarch64_tune_flags = selected_tune->flags;
14451 aarch64_tune = selected_tune->sched_core;
14452 /* Make a copy of the tuning parameters attached to the core, which
14453 we may later overwrite. */
14454 aarch64_tune_params = *(selected_tune->tune);
14455 aarch64_architecture_version = selected_arch->architecture_version;
14456
14457 if (opts->x_aarch64_override_tune_string)
14458 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14459 &aarch64_tune_params);
14460
14461 /* This target defaults to strict volatile bitfields. */
14462 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14463 opts->x_flag_strict_volatile_bitfields = 1;
14464
14465 if (aarch64_stack_protector_guard == SSP_GLOBAL
14466 && opts->x_aarch64_stack_protector_guard_offset_str)
14467 {
14468 error ("incompatible options %<-mstack-protector-guard=global%> and "
14469 "%<-mstack-protector-guard-offset=%s%>",
14470 aarch64_stack_protector_guard_offset_str);
14471 }
14472
14473 if (aarch64_stack_protector_guard == SSP_SYSREG
14474 && !(opts->x_aarch64_stack_protector_guard_offset_str
14475 && opts->x_aarch64_stack_protector_guard_reg_str))
14476 {
14477 error ("both %<-mstack-protector-guard-offset%> and "
14478 "%<-mstack-protector-guard-reg%> must be used "
14479 "with %<-mstack-protector-guard=sysreg%>");
14480 }
14481
14482 if (opts->x_aarch64_stack_protector_guard_reg_str)
14483 {
14484 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14485 error ("specify a system register with a small string length");
14486 }
14487
14488 if (opts->x_aarch64_stack_protector_guard_offset_str)
14489 {
14490 char *end;
14491 const char *str = aarch64_stack_protector_guard_offset_str;
14492 errno = 0;
14493 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14494 if (!*str || *end || errno)
14495 error ("%qs is not a valid offset in %qs", str,
14496 "-mstack-protector-guard-offset=");
14497 aarch64_stack_protector_guard_offset = offs;
14498 }
14499
14500 initialize_aarch64_code_model (opts);
14501 initialize_aarch64_tls_size (opts);
14502
14503 int queue_depth = 0;
14504 switch (aarch64_tune_params.autoprefetcher_model)
14505 {
14506 case tune_params::AUTOPREFETCHER_OFF:
14507 queue_depth = -1;
14508 break;
14509 case tune_params::AUTOPREFETCHER_WEAK:
14510 queue_depth = 0;
14511 break;
14512 case tune_params::AUTOPREFETCHER_STRONG:
14513 queue_depth = max_insn_queue_index + 1;
14514 break;
14515 default:
14516 gcc_unreachable ();
14517 }
14518
14519 /* We don't mind passing in global_options_set here as we don't use
14520 the *options_set structs anyway. */
14521 SET_OPTION_IF_UNSET (opts, &global_options_set,
14522 param_sched_autopref_queue_depth, queue_depth);
14523
14524 /* If only Advanced SIMD is to be used for autovectorization, disable the
14525 SVE vs. Advanced SIMD cost comparison. */
14526 if (aarch64_autovec_preference == 1)
14527 SET_OPTION_IF_UNSET (opts, &global_options_set,
14528 aarch64_sve_compare_costs, 0);
14529
14530 /* Set up parameters to be used in the prefetching algorithm. Do not
14531 override the defaults unless we are tuning for a core we have
14532 researched values for. */
14533 if (aarch64_tune_params.prefetch->num_slots > 0)
14534 SET_OPTION_IF_UNSET (opts, &global_options_set,
14535 param_simultaneous_prefetches,
14536 aarch64_tune_params.prefetch->num_slots);
14537 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14538 SET_OPTION_IF_UNSET (opts, &global_options_set,
14539 param_l1_cache_size,
14540 aarch64_tune_params.prefetch->l1_cache_size);
14541 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14542 SET_OPTION_IF_UNSET (opts, &global_options_set,
14543 param_l1_cache_line_size,
14544 aarch64_tune_params.prefetch->l1_cache_line_size);
14545 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14546 SET_OPTION_IF_UNSET (opts, &global_options_set,
14547 param_l2_cache_size,
14548 aarch64_tune_params.prefetch->l2_cache_size);
14549 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14550 SET_OPTION_IF_UNSET (opts, &global_options_set,
14551 param_prefetch_dynamic_strides, 0);
14552 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14553 SET_OPTION_IF_UNSET (opts, &global_options_set,
14554 param_prefetch_minimum_stride,
14555 aarch64_tune_params.prefetch->minimum_stride);
14556
14557 /* Use the alternative scheduling-pressure algorithm by default. */
14558 SET_OPTION_IF_UNSET (opts, &global_options_set,
14559 param_sched_pressure_algorithm,
14560 SCHED_PRESSURE_MODEL);
14561
14562 /* Validate the guard size. */
14563 int guard_size = param_stack_clash_protection_guard_size;
14564
14565 if (guard_size != 12 && guard_size != 16)
14566 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14567 "size. Given value %d (%llu KB) is out of range",
14568 guard_size, (1ULL << guard_size) / 1024ULL);
14569
14570 /* Enforce that the probing interval is the same as the guard size so the
14571 mid-end does the right thing. */
14572 SET_OPTION_IF_UNSET (opts, &global_options_set,
14573 param_stack_clash_protection_probe_interval,
14574 guard_size);
14575
14576 /* The maybe_set calls won't update the value if the user has explicitly set
14577 one, which means we need to validate that the probing interval and guard size
14578 are equal. */
14579 int probe_interval
14580 = param_stack_clash_protection_probe_interval;
14581 if (guard_size != probe_interval)
14582 error ("stack clash guard size %<%d%> must be equal to probing interval "
14583 "%<%d%>", guard_size, probe_interval);
14584
14585 /* Enable software prefetching at the specified optimization level for
14586 CPUs that have prefetch tuning. Lower the optimization level
14587 threshold by 1 when profiling is enabled. */
14588 if (opts->x_flag_prefetch_loop_arrays < 0
14589 && !opts->x_optimize_size
14590 && aarch64_tune_params.prefetch->default_opt_level >= 0
14591 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14592 opts->x_flag_prefetch_loop_arrays = 1;
14593
14594 if (opts->x_aarch64_arch_string == NULL)
14595 opts->x_aarch64_arch_string = selected_arch->name;
14596 if (opts->x_aarch64_cpu_string == NULL)
14597 opts->x_aarch64_cpu_string = selected_cpu->name;
14598 if (opts->x_aarch64_tune_string == NULL)
14599 opts->x_aarch64_tune_string = selected_tune->name;
14600
14601 aarch64_override_options_after_change_1 (opts);
14602 }
14603
14604 /* Print a hint with a suggestion for a core or architecture name that
14605 most closely resembles what the user passed in STR. ARCH is true if
14606 the user is asking for an architecture name. ARCH is false if the user
14607 is asking for a core name. */
14608
14609 static void
14610 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14611 {
14612 auto_vec<const char *> candidates;
14613 const struct processor *entry = arch ? all_architectures : all_cores;
14614 for (; entry->name != NULL; entry++)
14615 candidates.safe_push (entry->name);
14616
14617 #ifdef HAVE_LOCAL_CPU_DETECT
14618 /* Also add "native" as a possible value. */
14619 if (arch)
14620 candidates.safe_push ("native");
14621 #endif
14622
14623 char *s;
14624 const char *hint = candidates_list_and_hint (str, s, candidates);
14625 if (hint)
14626 inform (input_location, "valid arguments are: %s;"
14627 " did you mean %qs?", s, hint);
14628 else
14629 inform (input_location, "valid arguments are: %s", s);
14630
14631 XDELETEVEC (s);
14632 }
14633
14634 /* Print a hint with a suggestion for a core name that most closely resembles
14635 what the user passed in STR. */
14636
14637 inline static void
14638 aarch64_print_hint_for_core (const char *str)
14639 {
14640 aarch64_print_hint_for_core_or_arch (str, false);
14641 }
14642
14643 /* Print a hint with a suggestion for an architecture name that most closely
14644 resembles what the user passed in STR. */
14645
14646 inline static void
14647 aarch64_print_hint_for_arch (const char *str)
14648 {
14649 aarch64_print_hint_for_core_or_arch (str, true);
14650 }
14651
14652
14653 /* Print a hint with a suggestion for an extension name
14654 that most closely resembles what the user passed in STR. */
14655
14656 void
14657 aarch64_print_hint_for_extensions (const std::string &str)
14658 {
14659 auto_vec<const char *> candidates;
14660 aarch64_get_all_extension_candidates (&candidates);
14661 char *s;
14662 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14663 if (hint)
14664 inform (input_location, "valid arguments are: %s;"
14665 " did you mean %qs?", s, hint);
14666 else
14667 inform (input_location, "valid arguments are: %s", s);
14668
14669 XDELETEVEC (s);
14670 }
14671
14672 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14673 specified in STR and report errors as appropriate. Put the results, if
14674 they are valid, in RES and ISA_FLAGS. Return whether the option is
14675 valid. */
14676
14677 static bool
14678 aarch64_validate_mcpu (const char *str, const struct processor **res,
14679 uint64_t *isa_flags)
14680 {
14681 std::string invalid_extension;
14682 enum aarch64_parse_opt_result parse_res
14683 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14684
14685 if (parse_res == AARCH64_PARSE_OK)
14686 return true;
14687
14688 switch (parse_res)
14689 {
14690 case AARCH64_PARSE_MISSING_ARG:
14691 error ("missing cpu name in %<-mcpu=%s%>", str);
14692 break;
14693 case AARCH64_PARSE_INVALID_ARG:
14694 error ("unknown value %qs for %<-mcpu%>", str);
14695 aarch64_print_hint_for_core (str);
14696 break;
14697 case AARCH64_PARSE_INVALID_FEATURE:
14698 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14699 invalid_extension.c_str (), str);
14700 aarch64_print_hint_for_extensions (invalid_extension);
14701 break;
14702 default:
14703 gcc_unreachable ();
14704 }
14705
14706 return false;
14707 }
14708
14709 /* Straight line speculation indicators. */
14710 enum aarch64_sls_hardening_type
14711 {
14712 SLS_NONE = 0,
14713 SLS_RETBR = 1,
14714 SLS_BLR = 2,
14715 SLS_ALL = 3,
14716 };
14717 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14718
14719 /* Return whether we should mitigate Straight Line Speculation for the RET
14720 and BR instructions. */
14721 bool
14722 aarch64_harden_sls_retbr_p (void)
14723 {
14724 return aarch64_sls_hardening & SLS_RETBR;
14725 }
14726
14727 /* Return whether we should mitigate Straight Line Speculation for the BLR
14728 instruction. */
14729 bool
14730 aarch64_harden_sls_blr_p (void)
14731 {
14732 return aarch64_sls_hardening & SLS_BLR;
14733 }
14734
14735 /* For now we only allow setting these options globally; in the future we may
14736 allow setting them per function. */
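/* The accepted values are "none", "all", or a comma-separated list of
"retbr" and "blr", e.g. -mharden-sls=retbr,blr. */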
14737 static void
14738 aarch64_validate_sls_mitigation (const char *const_str)
14739 {
14740 char *token_save = NULL;
14741 char *str = NULL;
14742
14743 if (strcmp (const_str, "none") == 0)
14744 {
14745 aarch64_sls_hardening = SLS_NONE;
14746 return;
14747 }
14748 if (strcmp (const_str, "all") == 0)
14749 {
14750 aarch64_sls_hardening = SLS_ALL;
14751 return;
14752 }
14753
14754 char *str_root = xstrdup (const_str);
14755 str = strtok_r (str_root, ",", &token_save);
14756 if (!str)
14757 error ("invalid argument given to %<-mharden-sls=%>");
14758
14759 int temp = SLS_NONE;
14760 while (str)
14761 {
14762 if (strcmp (str, "blr") == 0)
14763 temp |= SLS_BLR;
14764 else if (strcmp (str, "retbr") == 0)
14765 temp |= SLS_RETBR;
14766 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14767 {
14768 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14769 break;
14770 }
14771 else
14772 {
14773 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14774 break;
14775 }
14776 str = strtok_r (NULL, ",", &token_save);
14777 }
14778 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14779 free (str_root);
14780 }
14781
14782 /* Parses CONST_STR for branch protection features specified in
14783 aarch64_branch_protect_types, and sets any global variables required. Returns
14784 the parsing result and assigns LAST_STR to the last processed token from
14785 CONST_STR so that it can be used for error reporting. */
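/* For example, a CONST_STR of "pac-ret+leaf" selects the "pac-ret" type
together with its "leaf" subtype (illustrative values; the valid names are
defined in aarch64_branch_protect_types). */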
14786
14787 static enum
14788 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
14789 char** last_str)
14790 {
14791 char *str_root = xstrdup (const_str);
14792 char* token_save = NULL;
14793 char *str = strtok_r (str_root, "+", &token_save);
14794 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14795 if (!str)
14796 res = AARCH64_PARSE_MISSING_ARG;
14797 else
14798 {
14799 char *next_str = strtok_r (NULL, "+", &token_save);
14800 /* Reset the branch protection features to their defaults. */
14801 aarch64_handle_no_branch_protection (NULL, NULL);
14802
14803 while (str && res == AARCH64_PARSE_OK)
14804 {
14805 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14806 bool found = false;
14807 /* Search for this type. */
14808 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14809 {
14810 if (strcmp (str, type->name) == 0)
14811 {
14812 found = true;
14813 res = type->handler (str, next_str);
14814 str = next_str;
14815 next_str = strtok_r (NULL, "+", &token_save);
14816 }
14817 else
14818 type++;
14819 }
14820 if (found && res == AARCH64_PARSE_OK)
14821 {
14822 bool found_subtype = true;
14823 /* Loop through each token until we find one that isn't a
14824 subtype. */
14825 while (found_subtype)
14826 {
14827 found_subtype = false;
14828 const aarch64_branch_protect_type *subtype = type->subtypes;
14829 /* Search for the subtype. */
14830 while (str && subtype && subtype->name && !found_subtype
14831 && res == AARCH64_PARSE_OK)
14832 {
14833 if (strcmp (str, subtype->name) == 0)
14834 {
14835 found_subtype = true;
14836 res = subtype->handler (str, next_str);
14837 str = next_str;
14838 next_str = strtok_r (NULL, "+", &token_save);
14839 }
14840 else
14841 subtype++;
14842 }
14843 }
14844 }
14845 else if (!found)
14846 res = AARCH64_PARSE_INVALID_ARG;
14847 }
14848 }
14849 /* Copy the last processed token into the argument to pass it back.
14850 Used by option and attribute validation to print the offending token. */
14851 if (last_str)
14852 {
14853 if (str) strcpy (*last_str, str);
14854 else *last_str = NULL;
14855 }
14856 if (res == AARCH64_PARSE_OK)
14857 {
14858 /* If needed, allocate the accepted string, then copy in const_str.
14859 Used by aarch64_override_options_after_change_1. */
14860 if (!accepted_branch_protection_string)
14861 accepted_branch_protection_string = (char *) xmalloc (
14862 BRANCH_PROTECT_STR_MAX
14863 + 1);
14864 strncpy (accepted_branch_protection_string, const_str,
14865 BRANCH_PROTECT_STR_MAX + 1);
14866 /* Forcibly null-terminate. */
14867 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14868 }
14869 return res;
14870 }
14871
14872 static bool
14873 aarch64_validate_mbranch_protection (const char *const_str)
14874 {
14875 char *str = (char *) xmalloc (strlen (const_str) + 1);
14876 enum aarch64_parse_opt_result res =
14877 aarch64_parse_branch_protection (const_str, &str);
14878 if (res == AARCH64_PARSE_INVALID_ARG)
14879 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14880 else if (res == AARCH64_PARSE_MISSING_ARG)
14881 error ("missing argument for %<-mbranch-protection=%>");
14882 free (str);
14883 return res == AARCH64_PARSE_OK;
14884 }
14885
14886 /* Validate a command-line -march option. Parse the arch and extensions
14887 (if any) specified in STR and report errors as appropriate. Put the
14888 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14889 option is valid. */
14890
14891 static bool
14892 aarch64_validate_march (const char *str, const struct processor **res,
14893 uint64_t *isa_flags)
14894 {
14895 std::string invalid_extension;
14896 enum aarch64_parse_opt_result parse_res
14897 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14898
14899 if (parse_res == AARCH64_PARSE_OK)
14900 return true;
14901
14902 switch (parse_res)
14903 {
14904 case AARCH64_PARSE_MISSING_ARG:
14905 error ("missing arch name in %<-march=%s%>", str);
14906 break;
14907 case AARCH64_PARSE_INVALID_ARG:
14908 error ("unknown value %qs for %<-march%>", str);
14909 aarch64_print_hint_for_arch (str);
14910 break;
14911 case AARCH64_PARSE_INVALID_FEATURE:
14912 error ("invalid feature modifier %qs in %<-march=%s%>",
14913 invalid_extension.c_str (), str);
14914 aarch64_print_hint_for_extensions (invalid_extension);
14915 break;
14916 default:
14917 gcc_unreachable ();
14918 }
14919
14920 return false;
14921 }
14922
14923 /* Validate a command-line -mtune option. Parse the cpu
14924 specified in STR and report errors as appropriate. Put the
14925 result, if it is valid, in RES. Return whether the option is
14926 valid. */
14927
14928 static bool
14929 aarch64_validate_mtune (const char *str, const struct processor **res)
14930 {
14931 enum aarch64_parse_opt_result parse_res
14932 = aarch64_parse_tune (str, res);
14933
14934 if (parse_res == AARCH64_PARSE_OK)
14935 return true;
14936
14937 switch (parse_res)
14938 {
14939 case AARCH64_PARSE_MISSING_ARG:
14940 error ("missing cpu name in %<-mtune=%s%>", str);
14941 break;
14942 case AARCH64_PARSE_INVALID_ARG:
14943 error ("unknown value %qs for %<-mtune%>", str);
14944 aarch64_print_hint_for_core (str);
14945 break;
14946 default:
14947 gcc_unreachable ();
14948 }
14949 return false;
14950 }
14951
14952 /* Return the CPU corresponding to the enum CPU.
14953 If it doesn't specify a CPU, return the default. */
14954
14955 static const struct processor *
14956 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14957 {
14958 if (cpu != aarch64_none)
14959 return &all_cores[cpu];
14960
14961 /* The & 0x3f is to extract the bottom 6 bits that encode the
14962 default cpu as selected by the --with-cpu GCC configure option
14963 in config.gcc.
14964 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14965 flags mechanism should be reworked to make it more sane. */
14966 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14967 }
14968
14969 /* Return the architecture corresponding to the enum ARCH.
14970 If it doesn't specify a valid architecture, return the default. */
14971
14972 static const struct processor *
14973 aarch64_get_arch (enum aarch64_arch arch)
14974 {
14975 if (arch != aarch64_no_arch)
14976 return &all_architectures[arch];
14977
14978 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14979
14980 return &all_architectures[cpu->arch];
14981 }
14982
14983 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
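/* For the fixed-length cases below this is simply bits / 64; for example
-msve-vector-bits=256 gives a VG of 4. */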
14984
14985 static poly_uint16
14986 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14987 {
14988 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14989 on big-endian targets, so we would need to forbid subregs that convert
14990 from one to the other. By default a reinterpret sequence would then
14991 involve a store to memory in one mode and a load back in the other.
14992 Even if we optimize that sequence using reverse instructions,
14993 it would still be a significant potential overhead.
14994
14995 For now, it seems better to generate length-agnostic code for that
14996 case instead. */
14997 if (value == SVE_SCALABLE
14998 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14999 return poly_uint16 (2, 2);
15000 else
15001 return (int) value / 64;
15002 }
15003
15004 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
15005 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
15006 tuning structs. In particular it must set selected_tune and
15007 aarch64_isa_flags that define the available ISA features and tuning
15008 decisions. It must also set selected_arch as this will be used to
15009 output the .arch asm tags for each function. */
15010
15011 static void
15012 aarch64_override_options (void)
15013 {
15014 uint64_t cpu_isa = 0;
15015 uint64_t arch_isa = 0;
15016 aarch64_isa_flags = 0;
15017
15018 bool valid_cpu = true;
15019 bool valid_tune = true;
15020 bool valid_arch = true;
15021
15022 selected_cpu = NULL;
15023 selected_arch = NULL;
15024 selected_tune = NULL;
15025
15026 if (aarch64_harden_sls_string)
15027 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
15028
15029 if (aarch64_branch_protection_string)
15030 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
15031
15032 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
15033 If either of -march or -mtune is given, they override their
15034 respective component of -mcpu. */
15035 if (aarch64_cpu_string)
15036 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
15037 &cpu_isa);
15038
15039 if (aarch64_arch_string)
15040 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
15041 &arch_isa);
15042
15043 if (aarch64_tune_string)
15044 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
15045
15046 #ifdef SUBTARGET_OVERRIDE_OPTIONS
15047 SUBTARGET_OVERRIDE_OPTIONS;
15048 #endif
15049
15050 /* If the user did not specify a processor, choose the default
15051 one for them. This will be the CPU set during configuration using
15052 --with-cpu, otherwise it is "generic". */
15053 if (!selected_cpu)
15054 {
15055 if (selected_arch)
15056 {
15057 selected_cpu = &all_cores[selected_arch->ident];
15058 aarch64_isa_flags = arch_isa;
15059 explicit_arch = selected_arch->arch;
15060 }
15061 else
15062 {
15063 /* Get default configure-time CPU. */
15064 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
15065 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
15066 }
15067
15068 if (selected_tune)
15069 explicit_tune_core = selected_tune->ident;
15070 }
15071 /* If both -mcpu and -march are specified check that they are architecturally
15072 compatible, warn if they're not and prefer the -march ISA flags. */
15073 else if (selected_arch)
15074 {
15075 if (selected_arch->arch != selected_cpu->arch)
15076 {
15077 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
15078 aarch64_cpu_string,
15079 aarch64_arch_string);
15080 }
15081 aarch64_isa_flags = arch_isa;
15082 explicit_arch = selected_arch->arch;
15083 explicit_tune_core = selected_tune ? selected_tune->ident
15084 : selected_cpu->ident;
15085 }
15086 else
15087 {
15088 /* -mcpu but no -march. */
15089 aarch64_isa_flags = cpu_isa;
15090 explicit_tune_core = selected_tune ? selected_tune->ident
15091 : selected_cpu->ident;
15092 gcc_assert (selected_cpu);
15093 selected_arch = &all_architectures[selected_cpu->arch];
15094 explicit_arch = selected_arch->arch;
15095 }
15096
15097 /* Set the arch as well, since we will need it when outputting
15098 the .arch directive in assembly. */
15099 if (!selected_arch)
15100 {
15101 gcc_assert (selected_cpu);
15102 selected_arch = &all_architectures[selected_cpu->arch];
15103 }
15104
15105 if (!selected_tune)
15106 selected_tune = selected_cpu;
15107
15108 if (aarch64_enable_bti == 2)
15109 {
15110 #ifdef TARGET_ENABLE_BTI
15111 aarch64_enable_bti = 1;
15112 #else
15113 aarch64_enable_bti = 0;
15114 #endif
15115 }
15116
15117 /* Return address signing is currently not supported for ILP32 targets. For
15118 LP64 targets use the configured option in the absence of a command-line
15119 option for -mbranch-protection. */
15120 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
15121 {
15122 #ifdef TARGET_ENABLE_PAC_RET
15123 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
15124 #else
15125 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
15126 #endif
15127 }
15128
15129 #ifndef HAVE_AS_MABI_OPTION
15130 /* The compiler may have been configured with 2.23.* binutils, which does
15131 not have support for ILP32. */
15132 if (TARGET_ILP32)
15133 error ("assembler does not support %<-mabi=ilp32%>");
15134 #endif
15135
15136 /* Convert -msve-vector-bits to a VG count. */
15137 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
15138
15139 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
15140 sorry ("return address signing is only supported for %<-mabi=lp64%>");
15141
15142 /* Make sure we properly set up the explicit options. */
15143 if ((aarch64_cpu_string && valid_cpu)
15144 || (aarch64_tune_string && valid_tune))
15145 gcc_assert (explicit_tune_core != aarch64_none);
15146
15147 if ((aarch64_cpu_string && valid_cpu)
15148 || (aarch64_arch_string && valid_arch))
15149 gcc_assert (explicit_arch != aarch64_no_arch);
15150
15151 /* The pass to insert speculation tracking runs before
15152 shrink-wrapping and the latter does not know how to update the
15153 tracking status. So disable it in this case. */
15154 if (aarch64_track_speculation)
15155 flag_shrink_wrap = 0;
15156
15157 aarch64_override_options_internal (&global_options);
15158
15159 /* Save these options as the default ones in case we push and pop them later
15160 while processing functions with potential target attributes. */
15161 target_option_default_node = target_option_current_node
15162 = build_target_option_node (&global_options, &global_options_set);
15163 }
15164
15165 /* Implement targetm.override_options_after_change. */
15166
15167 static void
15168 aarch64_override_options_after_change (void)
15169 {
15170 aarch64_override_options_after_change_1 (&global_options);
15171 }
15172
15173 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
15174 static char *
15175 aarch64_offload_options (void)
15176 {
15177 if (TARGET_ILP32)
15178 return xstrdup ("-foffload-abi=ilp32");
15179 else
15180 return xstrdup ("-foffload-abi=lp64");
15181 }
15182
15183 static struct machine_function *
15184 aarch64_init_machine_status (void)
15185 {
15186 struct machine_function *machine;
15187 machine = ggc_cleared_alloc<machine_function> ();
15188 return machine;
15189 }
15190
15191 void
15192 aarch64_init_expanders (void)
15193 {
15194 init_machine_status = aarch64_init_machine_status;
15195 }
15196
15197 /* Select the code model to use based on the command-line options in OPTS. */
15198 static void
15199 initialize_aarch64_code_model (struct gcc_options *opts)
15200 {
15201 aarch64_cmodel = opts->x_aarch64_cmodel_var;
15202 switch (opts->x_aarch64_cmodel_var)
15203 {
15204 case AARCH64_CMODEL_TINY:
15205 if (opts->x_flag_pic)
15206 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15207 break;
15208 case AARCH64_CMODEL_SMALL:
15209 if (opts->x_flag_pic)
15210 {
15211 #ifdef HAVE_AS_SMALL_PIC_RELOCS
15212 aarch64_cmodel = (flag_pic == 2
15213 ? AARCH64_CMODEL_SMALL_PIC
15214 : AARCH64_CMODEL_SMALL_SPIC);
15215 #else
15216 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
15217 #endif
15218 }
15219 break;
15220 case AARCH64_CMODEL_LARGE:
15221 if (opts->x_flag_pic)
15222 sorry ("code model %qs with %<-f%s%>", "large",
15223 opts->x_flag_pic > 1 ? "PIC" : "pic");
15224 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15225 sorry ("code model %qs not supported in ilp32 mode", "large");
15226 break;
15227 case AARCH64_CMODEL_TINY_PIC:
15228 case AARCH64_CMODEL_SMALL_PIC:
15229 case AARCH64_CMODEL_SMALL_SPIC:
15230 gcc_unreachable ();
15231 }
15232 }
15233
15234 /* Implement TARGET_OPTION_SAVE. */
15235
15236 static void
15237 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
15238 struct gcc_options */* opts_set */)
15239 {
15240 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
15241 ptr->x_aarch64_branch_protection_string
15242 = opts->x_aarch64_branch_protection_string;
15243 }
15244
15245 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
15246 using the information saved in PTR. */
15247
15248 static void
15249 aarch64_option_restore (struct gcc_options *opts,
15250 struct gcc_options */* opts_set */,
15251 struct cl_target_option *ptr)
15252 {
15253 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15254 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15255 opts->x_explicit_arch = ptr->x_explicit_arch;
15256 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15257 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
15258 opts->x_aarch64_branch_protection_string
15259 = ptr->x_aarch64_branch_protection_string;
15260 if (opts->x_aarch64_branch_protection_string)
15261 {
15262 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15263 NULL);
15264 }
15265
15266 aarch64_override_options_internal (opts);
15267 }
15268
15269 /* Implement TARGET_OPTION_PRINT. */
15270
15271 static void
15272 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15273 {
15274 const struct processor *cpu
15275 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15276 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
15277 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
15278 std::string extension
15279 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
15280
15281 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
15282 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15283 arch->name, extension.c_str ());
15284 }
15285
15286 static GTY(()) tree aarch64_previous_fndecl;
15287
15288 void
15289 aarch64_reset_previous_fndecl (void)
15290 {
15291 aarch64_previous_fndecl = NULL;
15292 }
15293
15294 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15295 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15296 make sure optab availability predicates are recomputed when necessary. */
15297
15298 void
15299 aarch64_save_restore_target_globals (tree new_tree)
15300 {
15301 if (TREE_TARGET_GLOBALS (new_tree))
15302 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15303 else if (new_tree == target_option_default_node)
15304 restore_target_globals (&default_target_globals);
15305 else
15306 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15307 }
15308
15309 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
15310 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15311 of the function, if such exists. This function may be called multiple
15312 times on a single function so use aarch64_previous_fndecl to avoid
15313 setting up identical state. */
15314
15315 static void
15316 aarch64_set_current_function (tree fndecl)
15317 {
15318 if (!fndecl || fndecl == aarch64_previous_fndecl)
15319 return;
15320
15321 tree old_tree = (aarch64_previous_fndecl
15322 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15323 : NULL_TREE);
15324
15325 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15326
15327 /* If current function has no attributes but the previous one did,
15328 use the default node. */
15329 if (!new_tree && old_tree)
15330 new_tree = target_option_default_node;
15331
15332 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
15333 the default have been handled by aarch64_save_restore_target_globals from
15334 aarch64_pragma_target_parse. */
15335 if (old_tree == new_tree)
15336 return;
15337
15338 aarch64_previous_fndecl = fndecl;
15339
15340 /* First set the target options. */
15341 cl_target_option_restore (&global_options, &global_options_set,
15342 TREE_TARGET_OPTION (new_tree));
15343
15344 aarch64_save_restore_target_globals (new_tree);
15345 }
15346
15347 /* Enum describing the various ways we can handle attributes.
15348 In many cases we can reuse the generic option handling machinery. */
15349
15350 enum aarch64_attr_opt_type
15351 {
15352 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
15353 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
15354 aarch64_attr_enum, /* Attribute sets an enum variable. */
15355 aarch64_attr_custom /* Attribute requires a custom handling function. */
15356 };
15357
15358 /* All the information needed to handle a target attribute.
15359 NAME is the name of the attribute.
15360 ATTR_TYPE specifies the type of behavior of the attribute as described
15361 in the definition of enum aarch64_attr_opt_type.
15362 ALLOW_NEG is true if the attribute supports a "no-" form.
15363 HANDLER is the function that takes the attribute string as an argument.
15364 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
15365 OPT_NUM is the enum specifying the option that the attribute modifies.
15366 This is needed for attributes that mirror the behavior of a command-line
15367 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15368 aarch64_attr_enum. */
15369
15370 struct aarch64_attribute_info
15371 {
15372 const char *name;
15373 enum aarch64_attr_opt_type attr_type;
15374 bool allow_neg;
15375 bool (*handler) (const char *);
15376 enum opt_code opt_num;
15377 };
15378
15379 /* Handle the ARCH_STR argument to the arch= target attribute. */
15380
15381 static bool
15382 aarch64_handle_attr_arch (const char *str)
15383 {
15384 const struct processor *tmp_arch = NULL;
15385 std::string invalid_extension;
15386 enum aarch64_parse_opt_result parse_res
15387 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
15388
15389 if (parse_res == AARCH64_PARSE_OK)
15390 {
15391 gcc_assert (tmp_arch);
15392 selected_arch = tmp_arch;
15393 explicit_arch = selected_arch->arch;
15394 return true;
15395 }
15396
15397 switch (parse_res)
15398 {
15399 case AARCH64_PARSE_MISSING_ARG:
15400 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
15401 break;
15402 case AARCH64_PARSE_INVALID_ARG:
15403 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
15404 aarch64_print_hint_for_arch (str);
15405 break;
15406 case AARCH64_PARSE_INVALID_FEATURE:
15407 error ("invalid feature modifier %s of value (\"%s\") in "
15408 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15409 aarch64_print_hint_for_extensions (invalid_extension);
15410 break;
15411 default:
15412 gcc_unreachable ();
15413 }
15414
15415 return false;
15416 }
15417
15418 /* Handle the argument CPU_STR to the cpu= target attribute. */
15419
15420 static bool
15421 aarch64_handle_attr_cpu (const char *str)
15422 {
15423 const struct processor *tmp_cpu = NULL;
15424 std::string invalid_extension;
15425 enum aarch64_parse_opt_result parse_res
15426 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
15427
15428 if (parse_res == AARCH64_PARSE_OK)
15429 {
15430 gcc_assert (tmp_cpu);
15431 selected_tune = tmp_cpu;
15432 explicit_tune_core = selected_tune->ident;
15433
15434 selected_arch = &all_architectures[tmp_cpu->arch];
15435 explicit_arch = selected_arch->arch;
15436 return true;
15437 }
15438
15439 switch (parse_res)
15440 {
15441 case AARCH64_PARSE_MISSING_ARG:
15442 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
15443 break;
15444 case AARCH64_PARSE_INVALID_ARG:
15445 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
15446 aarch64_print_hint_for_core (str);
15447 break;
15448 case AARCH64_PARSE_INVALID_FEATURE:
15449 error ("invalid feature modifier %s of value (\"%s\") in "
15450 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15451 aarch64_print_hint_for_extensions (invalid_extension);
15452 break;
15453 default:
15454 gcc_unreachable ();
15455 }
15456
15457 return false;
15458 }
15459
15460 /* Handle the argument STR to the branch-protection= attribute. */
15461
15462 static bool
15463 aarch64_handle_attr_branch_protection (const char* str)
15464 {
15465 char *err_str = (char *) xmalloc (strlen (str) + 1);
15466 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15467 &err_str);
15468 bool success = false;
15469 switch (res)
15470 {
15471 case AARCH64_PARSE_MISSING_ARG:
15472 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15473 " attribute");
15474 break;
15475 case AARCH64_PARSE_INVALID_ARG:
15476 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15477 "=\")%> pragma or attribute", err_str);
15478 break;
15479 case AARCH64_PARSE_OK:
15480 success = true;
15481 /* Fall through. */
15482 case AARCH64_PARSE_INVALID_FEATURE:
15483 break;
15484 default:
15485 gcc_unreachable ();
15486 }
15487 free (err_str);
15488 return success;
15489 }
15490
15491 /* Handle the argument STR to the tune= target attribute. */
15492
15493 static bool
15494 aarch64_handle_attr_tune (const char *str)
15495 {
15496 const struct processor *tmp_tune = NULL;
15497 enum aarch64_parse_opt_result parse_res
15498 = aarch64_parse_tune (str, &tmp_tune);
15499
15500 if (parse_res == AARCH64_PARSE_OK)
15501 {
15502 gcc_assert (tmp_tune);
15503 selected_tune = tmp_tune;
15504 explicit_tune_core = selected_tune->ident;
15505 return true;
15506 }
15507
15508 switch (parse_res)
15509 {
15510 case AARCH64_PARSE_INVALID_ARG:
15511 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15512 aarch64_print_hint_for_core (str);
15513 break;
15514 default:
15515 gcc_unreachable ();
15516 }
15517
15518 return false;
15519 }
15520
15521 /* Parse an architecture extensions target attribute string specified in STR.
15522 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15523 if successful. Update aarch64_isa_flags to reflect the ISA features
15524 modified. */
15525
15526 static bool
15527 aarch64_handle_attr_isa_flags (char *str)
15528 {
15529 enum aarch64_parse_opt_result parse_res;
15530 uint64_t isa_flags = aarch64_isa_flags;
15531
15532 /* We allow "+nothing" in the beginning to clear out all architectural
15533 features if the user wants to handpick specific features. */
15534 if (strncmp ("+nothing", str, 8) == 0)
15535 {
15536 isa_flags = 0;
15537 str += 8;
15538 }
15539
15540 std::string invalid_extension;
15541 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15542
15543 if (parse_res == AARCH64_PARSE_OK)
15544 {
15545 aarch64_isa_flags = isa_flags;
15546 return true;
15547 }
15548
15549 switch (parse_res)
15550 {
15551 case AARCH64_PARSE_MISSING_ARG:
15552 error ("missing value in %<target()%> pragma or attribute");
15553 break;
15554
15555 case AARCH64_PARSE_INVALID_FEATURE:
15556 error ("invalid feature modifier %s of value (\"%s\") in "
15557 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15558 break;
15559
15560 default:
15561 gcc_unreachable ();
15562 }
15563
15564 return false;
15565 }
15566
15567 /* The target attributes that we support. On top of these we also support just
15568 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15569 handled explicitly in aarch64_process_one_target_attr. */
15570
15571 static const struct aarch64_attribute_info aarch64_attributes[] =
15572 {
15573 { "general-regs-only", aarch64_attr_mask, false, NULL,
15574 OPT_mgeneral_regs_only },
15575 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15576 OPT_mfix_cortex_a53_835769 },
15577 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15578 OPT_mfix_cortex_a53_843419 },
15579 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15580 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15581 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15582 OPT_momit_leaf_frame_pointer },
15583 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15584 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15585 OPT_march_ },
15586 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15587 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15588 OPT_mtune_ },
15589 { "branch-protection", aarch64_attr_custom, false,
15590 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15591 { "sign-return-address", aarch64_attr_enum, false, NULL,
15592 OPT_msign_return_address_ },
15593 { "outline-atomics", aarch64_attr_bool, true, NULL,
15594 OPT_moutline_atomics},
15595 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15596 };
15597
15598 /* Parse ARG_STR, which contains the definition of one target attribute.
15599 Report appropriate errors if any, and return true if the attribute is valid. */
15600
15601 static bool
15602 aarch64_process_one_target_attr (char *arg_str)
15603 {
15604 bool invert = false;
15605
15606 size_t len = strlen (arg_str);
15607
15608 if (len == 0)
15609 {
15610 error ("malformed %<target()%> pragma or attribute");
15611 return false;
15612 }
15613
15614 char *str_to_check = (char *) alloca (len + 1);
15615 strcpy (str_to_check, arg_str);
15616
15617 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15618 It is easier to detect and handle it explicitly here rather than going
15619 through the machinery for the rest of the target attributes in this
15620 function. */
15621 if (*str_to_check == '+')
15622 return aarch64_handle_attr_isa_flags (str_to_check);
15623
15624 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15625 {
15626 invert = true;
15627 str_to_check += 3;
15628 }
15629 char *arg = strchr (str_to_check, '=');
15630
15631 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15632 and point ARG to "foo". */
15633 if (arg)
15634 {
15635 *arg = '\0';
15636 arg++;
15637 }
15638 const struct aarch64_attribute_info *p_attr;
15639 bool found = false;
15640 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15641 {
15642 /* If the names don't match up, or the user has given an argument
15643 to an attribute that doesn't accept one, or didn't give an argument
15644 to an attribute that expects one, fail to match. */
15645 if (strcmp (str_to_check, p_attr->name) != 0)
15646 continue;
15647
15648 found = true;
15649 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15650 || p_attr->attr_type == aarch64_attr_enum;
15651
15652 if (attr_need_arg_p ^ (arg != NULL))
15653 {
15654 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15655 return false;
15656 }
15657
15658 /* If the name matches but the attribute does not allow "no-" versions
15659 then we can't match. */
15660 if (invert && !p_attr->allow_neg)
15661 {
15662 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15663 return false;
15664 }
15665
15666 switch (p_attr->attr_type)
15667 {
15668 /* Has a custom handler registered.
15669 For example, cpu=, arch=, tune=. */
15670 case aarch64_attr_custom:
15671 gcc_assert (p_attr->handler);
15672 if (!p_attr->handler (arg))
15673 return false;
15674 break;
15675
15676 /* Either set or unset a boolean option. */
15677 case aarch64_attr_bool:
15678 {
15679 struct cl_decoded_option decoded;
15680
15681 generate_option (p_attr->opt_num, NULL, !invert,
15682 CL_TARGET, &decoded);
15683 aarch64_handle_option (&global_options, &global_options_set,
15684 &decoded, input_location);
15685 break;
15686 }
15687 /* Set or unset a bit in the target_flags. aarch64_handle_option
15688 should know what mask to apply given the option number. */
15689 case aarch64_attr_mask:
15690 {
15691 struct cl_decoded_option decoded;
15692 /* We only need to specify the option number.
15693 aarch64_handle_option will know which mask to apply. */
15694 decoded.opt_index = p_attr->opt_num;
15695 decoded.value = !invert;
15696 aarch64_handle_option (&global_options, &global_options_set,
15697 &decoded, input_location);
15698 break;
15699 }
15700 /* Use the option setting machinery to set an option to an enum. */
15701 case aarch64_attr_enum:
15702 {
15703 gcc_assert (arg);
15704 bool valid;
15705 int value;
15706 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15707 &value, CL_TARGET);
15708 if (valid)
15709 {
15710 set_option (&global_options, NULL, p_attr->opt_num, value,
15711 NULL, DK_UNSPECIFIED, input_location,
15712 global_dc);
15713 }
15714 else
15715 {
15716 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15717 }
15718 break;
15719 }
15720 default:
15721 gcc_unreachable ();
15722 }
15723 }
15724
15725   /* If we reached this point we have either found an attribute and
15726      validated it or matched none at all.  If we matched an attribute but its
15727      arguments were malformed we will have returned false already. */
15728 return found;
15729 }
15730
15731 /* Count how many times the character C appears in
15732 NULL-terminated string STR. */
15733
15734 static unsigned int
15735 num_occurences_in_str (char c, char *str)
15736 {
15737 unsigned int res = 0;
15738 while (*str != '\0')
15739 {
15740 if (*str == c)
15741 res++;
15742
15743 str++;
15744 }
15745
15746 return res;
15747 }
15748
15749 /* Parse the tree in ARGS that contains the target attribute information
15750 and update the global target options space. */
15751
15752 bool
15753 aarch64_process_target_attr (tree args)
15754 {
15755 if (TREE_CODE (args) == TREE_LIST)
15756 {
15757 do
15758 {
15759 tree head = TREE_VALUE (args);
15760 if (head)
15761 {
15762 if (!aarch64_process_target_attr (head))
15763 return false;
15764 }
15765 args = TREE_CHAIN (args);
15766 } while (args);
15767
15768 return true;
15769 }
15770
15771 if (TREE_CODE (args) != STRING_CST)
15772 {
15773 error ("attribute %<target%> argument not a string");
15774 return false;
15775 }
15776
15777 size_t len = strlen (TREE_STRING_POINTER (args));
15778 char *str_to_check = (char *) alloca (len + 1);
15779 strcpy (str_to_check, TREE_STRING_POINTER (args));
15780
15781 if (len == 0)
15782 {
15783 error ("malformed %<target()%> pragma or attribute");
15784 return false;
15785 }
15786
15787   /* Used to catch empty strings between commas, i.e.
15788      attribute ((target ("attr1,,attr2"))). */
15789 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15790
15791 /* Handle multiple target attributes separated by ','. */
15792 char *token = strtok_r (str_to_check, ",", &str_to_check);
15793
15794 unsigned int num_attrs = 0;
15795 while (token)
15796 {
15797 num_attrs++;
15798 if (!aarch64_process_one_target_attr (token))
15799 {
15800 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15801 return false;
15802 }
15803
15804 token = strtok_r (NULL, ",", &str_to_check);
15805 }
15806
15807 if (num_attrs != num_commas + 1)
15808 {
15809 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15810 return false;
15811 }
15812
15813 return true;
15814 }
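
/* A sketch of the accepted syntax (illustrative, not a definitive grammar):
   the string is split on ',' and each token is parsed independently, so

     #pragma GCC target ("arch=armv8-a+simd,no-omit-leaf-frame-pointer")

   processes two attributes, whereas "attr1,,attr2" is rejected because the
   number of parsed tokens (2) does not match the number of commas plus
   one (3).  */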
15815
15816 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15817 process attribute ((target ("..."))). */
15818
15819 static bool
15820 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15821 {
15822 struct cl_target_option cur_target;
15823 bool ret;
15824 tree old_optimize;
15825 tree new_target, new_optimize;
15826 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15827
15828 /* If what we're processing is the current pragma string then the
15829 target option node is already stored in target_option_current_node
15830 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15831 having to re-parse the string. This is especially useful to keep
15832 arm_neon.h compile times down since that header contains a lot
15833 of intrinsics enclosed in pragmas. */
15834 if (!existing_target && args == current_target_pragma)
15835 {
15836 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15837 return true;
15838 }
15839 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15840
15841 old_optimize
15842 = build_optimization_node (&global_options, &global_options_set);
15843 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15844
15845 /* If the function changed the optimization levels as well as setting
15846 target options, start with the optimizations specified. */
15847 if (func_optimize && func_optimize != old_optimize)
15848 cl_optimization_restore (&global_options, &global_options_set,
15849 TREE_OPTIMIZATION (func_optimize));
15850
15851 /* Save the current target options to restore at the end. */
15852 cl_target_option_save (&cur_target, &global_options, &global_options_set);
15853
15854 /* If fndecl already has some target attributes applied to it, unpack
15855 them so that we add this attribute on top of them, rather than
15856 overwriting them. */
15857 if (existing_target)
15858 {
15859 struct cl_target_option *existing_options
15860 = TREE_TARGET_OPTION (existing_target);
15861
15862 if (existing_options)
15863 cl_target_option_restore (&global_options, &global_options_set,
15864 existing_options);
15865 }
15866 else
15867 cl_target_option_restore (&global_options, &global_options_set,
15868 TREE_TARGET_OPTION (target_option_current_node));
15869
15870 ret = aarch64_process_target_attr (args);
15871
15872 /* Set up any additional state. */
15873 if (ret)
15874 {
15875 aarch64_override_options_internal (&global_options);
15876 /* Initialize SIMD builtins if we haven't already.
15877 Set current_target_pragma to NULL for the duration so that
15878 the builtin initialization code doesn't try to tag the functions
15879 being built with the attributes specified by any current pragma, thus
15880 going into an infinite recursion. */
15881 if (TARGET_SIMD)
15882 {
15883 tree saved_current_target_pragma = current_target_pragma;
15884 current_target_pragma = NULL;
15885 aarch64_init_simd_builtins ();
15886 current_target_pragma = saved_current_target_pragma;
15887 }
15888 new_target = build_target_option_node (&global_options,
15889 &global_options_set);
15890 }
15891 else
15892 new_target = NULL;
15893
15894 new_optimize = build_optimization_node (&global_options,
15895 &global_options_set);
15896
15897 if (fndecl && ret)
15898 {
15899 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15900
15901 if (old_optimize != new_optimize)
15902 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15903 }
15904
15905 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
15906
15907 if (old_optimize != new_optimize)
15908 cl_optimization_restore (&global_options, &global_options_set,
15909 TREE_OPTIMIZATION (old_optimize));
15910 return ret;
15911 }
15912
15913 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15914 tri-bool options (yes, no, don't care) and the default value is
15915 DEF, determine whether to reject inlining. */
15916
15917 static bool
15918 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15919 int dont_care, int def)
15920 {
15921 /* If the callee doesn't care, always allow inlining. */
15922 if (callee == dont_care)
15923 return true;
15924
15925 /* If the caller doesn't care, always allow inlining. */
15926 if (caller == dont_care)
15927 return true;
15928
15929 /* Otherwise, allow inlining if either the callee and caller values
15930 agree, or if the callee is using the default value. */
15931 return (callee == caller || callee == def);
15932 }
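
/* Worked example (illustrative only): the callers below pass DONT_CARE == 2.
   With DEF == 0, CALLER == 0 and CALLEE == 1 rejects inlining, since the
   callee neither agrees with the caller nor uses the default, whereas
   CALLEE == 2 ("don't care") is always accepted.  */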
15933
15934 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15935 to inline CALLEE into CALLER based on target-specific info.
15936 Make sure that the caller and callee have compatible architectural
15937 features. Then go through the other possible target attributes
15938 and see if they can block inlining. Try not to reject always_inline
15939 callees unless they are incompatible architecturally. */
15940
15941 static bool
15942 aarch64_can_inline_p (tree caller, tree callee)
15943 {
15944 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15945 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15946
15947 struct cl_target_option *caller_opts
15948 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15949 : target_option_default_node);
15950
15951 struct cl_target_option *callee_opts
15952 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15953 : target_option_default_node);
15954
15955 /* Callee's ISA flags should be a subset of the caller's. */
15956 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15957 != callee_opts->x_aarch64_isa_flags)
15958 return false;
15959
15960 /* Allow non-strict aligned functions inlining into strict
15961 aligned ones. */
15962 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15963 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15964 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15965 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15966 return false;
15967
15968 bool always_inline = lookup_attribute ("always_inline",
15969 DECL_ATTRIBUTES (callee));
15970
15971 /* If the architectural features match up and the callee is always_inline
15972 then the other attributes don't matter. */
15973 if (always_inline)
15974 return true;
15975
15976 if (caller_opts->x_aarch64_cmodel_var
15977 != callee_opts->x_aarch64_cmodel_var)
15978 return false;
15979
15980 if (caller_opts->x_aarch64_tls_dialect
15981 != callee_opts->x_aarch64_tls_dialect)
15982 return false;
15983
15984 /* Honour explicit requests to workaround errata. */
15985 if (!aarch64_tribools_ok_for_inlining_p (
15986 caller_opts->x_aarch64_fix_a53_err835769,
15987 callee_opts->x_aarch64_fix_a53_err835769,
15988 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15989 return false;
15990
15991 if (!aarch64_tribools_ok_for_inlining_p (
15992 caller_opts->x_aarch64_fix_a53_err843419,
15993 callee_opts->x_aarch64_fix_a53_err843419,
15994 2, TARGET_FIX_ERR_A53_843419))
15995 return false;
15996
15997 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15998      caller and callee and they don't match up, reject inlining. */
15999 if (!aarch64_tribools_ok_for_inlining_p (
16000 caller_opts->x_flag_omit_leaf_frame_pointer,
16001 callee_opts->x_flag_omit_leaf_frame_pointer,
16002 2, 1))
16003 return false;
16004
16005 /* If the callee has specific tuning overrides, respect them. */
16006 if (callee_opts->x_aarch64_override_tune_string != NULL
16007 && caller_opts->x_aarch64_override_tune_string == NULL)
16008 return false;
16009
16010 /* If the user specified tuning override strings for the
16011 caller and callee and they don't match up, reject inlining.
16012 We just do a string compare here, we don't analyze the meaning
16013 of the string, as it would be too costly for little gain. */
16014 if (callee_opts->x_aarch64_override_tune_string
16015 && caller_opts->x_aarch64_override_tune_string
16016 && (strcmp (callee_opts->x_aarch64_override_tune_string,
16017 caller_opts->x_aarch64_override_tune_string) != 0))
16018 return false;
16019
16020 return true;
16021 }
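
/* For illustration (an assumed example, not part of this file's logic):
   the ISA-subset rule above means that

     __attribute__ ((target ("+simd"))) int callee (void);
     __attribute__ ((target ("+nosimd"))) int caller (void);

   blocks inlining of CALLEE into CALLER, because the callee requires SIMD
   while the caller does not provide it, even if the callee is marked
   always_inline.  */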
16022
16023 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
16024    hasn't been initialized already. */
16025
16026 unsigned int
16027 aarch64_tlsdesc_abi_id ()
16028 {
16029 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
16030 if (!tlsdesc_abi.initialized_p ())
16031 {
16032 HARD_REG_SET full_reg_clobbers;
16033 CLEAR_HARD_REG_SET (full_reg_clobbers);
16034 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
16035 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
16036 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
16037 SET_HARD_REG_BIT (full_reg_clobbers, regno);
16038 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
16039 }
16040 return tlsdesc_abi.id ();
16041 }
16042
16043 /* Return true if SYMBOL_REF X binds locally. */
16044
16045 static bool
16046 aarch64_symbol_binds_local_p (const_rtx x)
16047 {
16048 return (SYMBOL_REF_DECL (x)
16049 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
16050 : SYMBOL_REF_LOCAL_P (x));
16051 }
16052
16053 /* Return true if SYMBOL_REF X is thread-local. */
16054 static bool
16055 aarch64_tls_symbol_p (rtx x)
16056 {
16057 if (! TARGET_HAVE_TLS)
16058 return false;
16059
16060 x = strip_salt (x);
16061 if (!SYMBOL_REF_P (x))
16062 return false;
16063
16064 return SYMBOL_REF_TLS_MODEL (x) != 0;
16065 }
16066
16067 /* Classify a TLS symbol into one of the TLS kinds. */
16068 enum aarch64_symbol_type
16069 aarch64_classify_tls_symbol (rtx x)
16070 {
16071 enum tls_model tls_kind = tls_symbolic_operand_type (x);
16072
16073 switch (tls_kind)
16074 {
16075 case TLS_MODEL_GLOBAL_DYNAMIC:
16076 case TLS_MODEL_LOCAL_DYNAMIC:
16077 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
16078
16079 case TLS_MODEL_INITIAL_EXEC:
16080 switch (aarch64_cmodel)
16081 {
16082 case AARCH64_CMODEL_TINY:
16083 case AARCH64_CMODEL_TINY_PIC:
16084 return SYMBOL_TINY_TLSIE;
16085 default:
16086 return SYMBOL_SMALL_TLSIE;
16087 }
16088
16089 case TLS_MODEL_LOCAL_EXEC:
16090 if (aarch64_tls_size == 12)
16091 return SYMBOL_TLSLE12;
16092 else if (aarch64_tls_size == 24)
16093 return SYMBOL_TLSLE24;
16094 else if (aarch64_tls_size == 32)
16095 return SYMBOL_TLSLE32;
16096 else if (aarch64_tls_size == 48)
16097 return SYMBOL_TLSLE48;
16098 else
16099 gcc_unreachable ();
16100
16101 case TLS_MODEL_EMULATED:
16102 case TLS_MODEL_NONE:
16103 return SYMBOL_FORCE_TO_MEM;
16104
16105 default:
16106 gcc_unreachable ();
16107 }
16108 }
16109
16110 /* Return the correct method for accessing X + OFFSET, where X is either
16111 a SYMBOL_REF or LABEL_REF. */
16112
16113 enum aarch64_symbol_type
16114 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
16115 {
16116 x = strip_salt (x);
16117
16118 if (LABEL_REF_P (x))
16119 {
16120 switch (aarch64_cmodel)
16121 {
16122 case AARCH64_CMODEL_LARGE:
16123 return SYMBOL_FORCE_TO_MEM;
16124
16125 case AARCH64_CMODEL_TINY_PIC:
16126 case AARCH64_CMODEL_TINY:
16127 return SYMBOL_TINY_ABSOLUTE;
16128
16129 case AARCH64_CMODEL_SMALL_SPIC:
16130 case AARCH64_CMODEL_SMALL_PIC:
16131 case AARCH64_CMODEL_SMALL:
16132 return SYMBOL_SMALL_ABSOLUTE;
16133
16134 default:
16135 gcc_unreachable ();
16136 }
16137 }
16138
16139 if (SYMBOL_REF_P (x))
16140 {
16141 if (aarch64_tls_symbol_p (x))
16142 return aarch64_classify_tls_symbol (x);
16143
16144 switch (aarch64_cmodel)
16145 {
16146 case AARCH64_CMODEL_TINY:
16147 /* When we retrieve symbol + offset address, we have to make sure
16148 the offset does not cause overflow of the final address. But
16149      we have no way of knowing the address of the symbol at compile time,
16150      so we can't accurately say if the distance between the PC and
16151      symbol + offset is outside the addressable range of +/-1MB in the
16152 TINY code model. So we limit the maximum offset to +/-64KB and
16153 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
16154 If offset_within_block_p is true we allow larger offsets.
16155 Furthermore force to memory if the symbol is a weak reference to
16156 something that doesn't resolve to a symbol in this module. */
16157
16158 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
16159 return SYMBOL_FORCE_TO_MEM;
16160 if (!(IN_RANGE (offset, -0x10000, 0x10000)
16161 || offset_within_block_p (x, offset)))
16162 return SYMBOL_FORCE_TO_MEM;
16163
16164 return SYMBOL_TINY_ABSOLUTE;
16165
16166 case AARCH64_CMODEL_SMALL:
16167 /* Same reasoning as the tiny code model, but the offset cap here is
16168 1MB, allowing +/-3.9GB for the offset to the symbol. */
16169
16170 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
16171 return SYMBOL_FORCE_TO_MEM;
16172 if (!(IN_RANGE (offset, -0x100000, 0x100000)
16173 || offset_within_block_p (x, offset)))
16174 return SYMBOL_FORCE_TO_MEM;
16175
16176 return SYMBOL_SMALL_ABSOLUTE;
16177
16178 case AARCH64_CMODEL_TINY_PIC:
16179 if (!aarch64_symbol_binds_local_p (x))
16180 return SYMBOL_TINY_GOT;
16181 return SYMBOL_TINY_ABSOLUTE;
16182
16183 case AARCH64_CMODEL_SMALL_SPIC:
16184 case AARCH64_CMODEL_SMALL_PIC:
16185 if (!aarch64_symbol_binds_local_p (x))
16186 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16187 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
16188 return SYMBOL_SMALL_ABSOLUTE;
16189
16190 case AARCH64_CMODEL_LARGE:
16191 /* This is alright even in PIC code as the constant
16192 pool reference is always PC relative and within
16193 the same translation unit. */
16194 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
16195 return SYMBOL_SMALL_ABSOLUTE;
16196 else
16197 return SYMBOL_FORCE_TO_MEM;
16198
16199 default:
16200 gcc_unreachable ();
16201 }
16202 }
16203
16204 /* By default push everything into the constant pool. */
16205 return SYMBOL_FORCE_TO_MEM;
16206 }
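
/* For example (illustrative, assuming the symbol binds locally): under the
   tiny code model, "sym + 0x8000" stays SYMBOL_TINY_ABSOLUTE because the
   offset is within +/-64KB, whereas "sym + 0x20000" is forced to memory
   unless offset_within_block_p shows the address still lies within the
   symbol's own block.  */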
16207
16208 bool
16209 aarch64_constant_address_p (rtx x)
16210 {
16211 return (CONSTANT_P (x) && memory_address_p (DImode, x));
16212 }
16213
16214 bool
16215 aarch64_legitimate_pic_operand_p (rtx x)
16216 {
16217 poly_int64 offset;
16218 x = strip_offset_and_salt (x, &offset);
16219 if (SYMBOL_REF_P (x))
16220 return false;
16221
16222 return true;
16223 }
16224
16225 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
16226 that should be rematerialized rather than spilled. */
16227
16228 static bool
16229 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
16230 {
16231 /* Support CSE and rematerialization of common constants. */
16232 if (CONST_INT_P (x)
16233 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
16234 || GET_CODE (x) == CONST_VECTOR)
16235 return true;
16236
16237 /* Do not allow vector struct mode constants for Advanced SIMD.
16238 We could support 0 and -1 easily, but they need support in
16239 aarch64-simd.md. */
16240 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16241 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16242 return false;
16243
16244 /* Only accept variable-length vector constants if they can be
16245 handled directly.
16246
16247 ??? It would be possible to handle rematerialization of other
16248 constants via secondary reloads. */
16249 if (vec_flags & VEC_ANY_SVE)
16250 return aarch64_simd_valid_immediate (x, NULL);
16251
16252 if (GET_CODE (x) == HIGH)
16253 x = XEXP (x, 0);
16254
16255 /* Accept polynomial constants that can be calculated by using the
16256 destination of a move as the sole temporary. Constants that
16257 require a second temporary cannot be rematerialized (they can't be
16258 forced to memory and also aren't legitimate constants). */
16259 poly_int64 offset;
16260 if (poly_int_rtx_p (x, &offset))
16261 return aarch64_offset_temporaries (false, offset) <= 1;
16262
16263 /* If an offset is being added to something else, we need to allow the
16264 base to be moved into the destination register, meaning that there
16265 are no free temporaries for the offset. */
16266 x = strip_offset_and_salt (x, &offset);
16267 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16268 return false;
16269
16270 /* Do not allow const (plus (anchor_symbol, const_int)). */
16271 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16272 return false;
16273
16274 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
16275 so spilling them is better than rematerialization. */
16276 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16277 return true;
16278
16279 /* Label references are always constant. */
16280 if (LABEL_REF_P (x))
16281 return true;
16282
16283 return false;
16284 }
16285
16286 rtx
16287 aarch64_load_tp (rtx target)
16288 {
16289 if (!target
16290 || GET_MODE (target) != Pmode
16291 || !register_operand (target, Pmode))
16292 target = gen_reg_rtx (Pmode);
16293
16294 /* Can return in any reg. */
16295 emit_insn (gen_aarch64_load_tp_hard (target));
16296 return target;
16297 }
16298
16299 /* On AAPCS systems, this is the "struct __va_list". */
16300 static GTY(()) tree va_list_type;
16301
16302 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16303 Return the type to use as __builtin_va_list.
16304
16305 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16306
16307 struct __va_list
16308 {
16309 void *__stack;
16310 void *__gr_top;
16311 void *__vr_top;
16312 int __gr_offs;
16313 int __vr_offs;
16314 }; */
16315
16316 static tree
16317 aarch64_build_builtin_va_list (void)
16318 {
16319 tree va_list_name;
16320 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16321
16322 /* Create the type. */
16323 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16324 /* Give it the required name. */
16325 va_list_name = build_decl (BUILTINS_LOCATION,
16326 TYPE_DECL,
16327 get_identifier ("__va_list"),
16328 va_list_type);
16329 DECL_ARTIFICIAL (va_list_name) = 1;
16330 TYPE_NAME (va_list_type) = va_list_name;
16331 TYPE_STUB_DECL (va_list_type) = va_list_name;
16332
16333 /* Create the fields. */
16334 f_stack = build_decl (BUILTINS_LOCATION,
16335 FIELD_DECL, get_identifier ("__stack"),
16336 ptr_type_node);
16337 f_grtop = build_decl (BUILTINS_LOCATION,
16338 FIELD_DECL, get_identifier ("__gr_top"),
16339 ptr_type_node);
16340 f_vrtop = build_decl (BUILTINS_LOCATION,
16341 FIELD_DECL, get_identifier ("__vr_top"),
16342 ptr_type_node);
16343 f_groff = build_decl (BUILTINS_LOCATION,
16344 FIELD_DECL, get_identifier ("__gr_offs"),
16345 integer_type_node);
16346 f_vroff = build_decl (BUILTINS_LOCATION,
16347 FIELD_DECL, get_identifier ("__vr_offs"),
16348 integer_type_node);
16349
16350   /* Tell the tree-stdarg pass about our internal offset fields.
16351      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
16352      purposes, to identify whether the code is updating the va_list internal
16353      offset fields in an irregular way. */
16354 va_list_gpr_counter_field = f_groff;
16355 va_list_fpr_counter_field = f_vroff;
16356
16357 DECL_ARTIFICIAL (f_stack) = 1;
16358 DECL_ARTIFICIAL (f_grtop) = 1;
16359 DECL_ARTIFICIAL (f_vrtop) = 1;
16360 DECL_ARTIFICIAL (f_groff) = 1;
16361 DECL_ARTIFICIAL (f_vroff) = 1;
16362
16363 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16364 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16365 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16366 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16367 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16368
16369 TYPE_FIELDS (va_list_type) = f_stack;
16370 DECL_CHAIN (f_stack) = f_grtop;
16371 DECL_CHAIN (f_grtop) = f_vrtop;
16372 DECL_CHAIN (f_vrtop) = f_groff;
16373 DECL_CHAIN (f_groff) = f_vroff;
16374
16375 /* Compute its layout. */
16376 layout_type (va_list_type);
16377
16378 return va_list_type;
16379 }
16380
16381 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
16382 static void
16383 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16384 {
16385 const CUMULATIVE_ARGS *cum;
16386 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16387 tree stack, grtop, vrtop, groff, vroff;
16388 tree t;
16389 int gr_save_area_size = cfun->va_list_gpr_size;
16390 int vr_save_area_size = cfun->va_list_fpr_size;
16391 int vr_offset;
16392
16393 cum = &crtl->args.info;
16394 if (cfun->va_list_gpr_size)
16395 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16396 cfun->va_list_gpr_size);
16397 if (cfun->va_list_fpr_size)
16398 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16399 * UNITS_PER_VREG, cfun->va_list_fpr_size);
16400
16401 if (!TARGET_FLOAT)
16402 {
16403 gcc_assert (cum->aapcs_nvrn == 0);
16404 vr_save_area_size = 0;
16405 }
16406
16407 f_stack = TYPE_FIELDS (va_list_type_node);
16408 f_grtop = DECL_CHAIN (f_stack);
16409 f_vrtop = DECL_CHAIN (f_grtop);
16410 f_groff = DECL_CHAIN (f_vrtop);
16411 f_vroff = DECL_CHAIN (f_groff);
16412
16413 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16414 NULL_TREE);
16415 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16416 NULL_TREE);
16417 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16418 NULL_TREE);
16419 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16420 NULL_TREE);
16421 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16422 NULL_TREE);
16423
16424 /* Emit code to initialize STACK, which points to the next varargs stack
16425 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
16426 by named arguments. STACK is 8-byte aligned. */
16427 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16428 if (cum->aapcs_stack_size > 0)
16429 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16430 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16431 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16432
16433 /* Emit code to initialize GRTOP, the top of the GR save area.
16434 virtual_incoming_args_rtx should have been 16 byte aligned. */
16435 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16436 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16437 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16438
16439 /* Emit code to initialize VRTOP, the top of the VR save area.
16440 This address is gr_save_area_bytes below GRTOP, rounded
16441 down to the next 16-byte boundary. */
16442 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
16443 vr_offset = ROUND_UP (gr_save_area_size,
16444 STACK_BOUNDARY / BITS_PER_UNIT);
16445
16446 if (vr_offset)
16447 t = fold_build_pointer_plus_hwi (t, -vr_offset);
16448 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16449 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16450
16451 /* Emit code to initialize GROFF, the offset from GRTOP of the
16452 next GPR argument. */
16453 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16454 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16455 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16456
16457   /* Likewise emit code to initialize VROFF, the offset from VRTOP
16458 of the next VR argument. */
16459 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16460 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16461 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16462 }
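
/* A concrete sketch (assumed values, for illustration only): for

     int f (int n, ...);

   with the default va_list_gpr/fpr_size limits and FP registers available,
   va_start leaves __stack pointing at the first stack-passed vararg,
   __gr_top at the top of the GR save area, __vr_top 16-byte aligned below
   it, and, with seven GRs and eight VRs still unused, sets
   __gr_offs = -56 and __vr_offs = -128.  */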
16463
16464 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
16465
16466 static tree
16467 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16468 gimple_seq *post_p ATTRIBUTE_UNUSED)
16469 {
16470 tree addr;
16471 bool indirect_p;
16472 bool is_ha; /* is HFA or HVA. */
16473 bool dw_align; /* double-word align. */
16474 machine_mode ag_mode = VOIDmode;
16475 int nregs;
16476 machine_mode mode;
16477
16478 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16479 tree stack, f_top, f_off, off, arg, roundup, on_stack;
16480 HOST_WIDE_INT size, rsize, adjust, align;
16481 tree t, u, cond1, cond2;
16482
16483 indirect_p = pass_va_arg_by_reference (type);
16484 if (indirect_p)
16485 type = build_pointer_type (type);
16486
16487 mode = TYPE_MODE (type);
16488
16489 f_stack = TYPE_FIELDS (va_list_type_node);
16490 f_grtop = DECL_CHAIN (f_stack);
16491 f_vrtop = DECL_CHAIN (f_grtop);
16492 f_groff = DECL_CHAIN (f_vrtop);
16493 f_vroff = DECL_CHAIN (f_groff);
16494
16495 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16496 f_stack, NULL_TREE);
16497 size = int_size_in_bytes (type);
16498
16499 bool abi_break;
16500 align
16501 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
16502
16503 dw_align = false;
16504 adjust = 0;
16505 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16506 &is_ha, false))
16507 {
16508 /* No frontends can create types with variable-sized modes, so we
16509 shouldn't be asked to pass or return them. */
16510 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16511
16512 /* TYPE passed in fp/simd registers. */
16513 if (!TARGET_FLOAT)
16514 aarch64_err_no_fpadvsimd (mode);
16515
16516 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16517 unshare_expr (valist), f_vrtop, NULL_TREE);
16518 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16519 unshare_expr (valist), f_vroff, NULL_TREE);
16520
16521 rsize = nregs * UNITS_PER_VREG;
16522
16523 if (is_ha)
16524 {
16525 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16526 adjust = UNITS_PER_VREG - ag_size;
16527 }
16528 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16529 && size < UNITS_PER_VREG)
16530 {
16531 adjust = UNITS_PER_VREG - size;
16532 }
16533 }
16534 else
16535 {
16536 /* TYPE passed in general registers. */
16537 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16538 unshare_expr (valist), f_grtop, NULL_TREE);
16539 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16540 unshare_expr (valist), f_groff, NULL_TREE);
16541 rsize = ROUND_UP (size, UNITS_PER_WORD);
16542 nregs = rsize / UNITS_PER_WORD;
16543
16544 if (align > 8)
16545 {
16546 if (abi_break && warn_psabi)
16547 inform (input_location, "parameter passing for argument of type "
16548 "%qT changed in GCC 9.1", type);
16549 dw_align = true;
16550 }
16551
16552 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16553 && size < UNITS_PER_WORD)
16554 {
16555 adjust = UNITS_PER_WORD - size;
16556 }
16557 }
16558
16559 /* Get a local temporary for the field value. */
16560 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16561
16562 /* Emit code to branch if off >= 0. */
16563 t = build2 (GE_EXPR, boolean_type_node, off,
16564 build_int_cst (TREE_TYPE (off), 0));
16565 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16566
16567 if (dw_align)
16568 {
16569 /* Emit: offs = (offs + 15) & -16. */
16570 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16571 build_int_cst (TREE_TYPE (off), 15));
16572 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16573 build_int_cst (TREE_TYPE (off), -16));
16574 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16575 }
16576 else
16577 roundup = NULL;
16578
16579 /* Update ap.__[g|v]r_offs */
16580 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16581 build_int_cst (TREE_TYPE (off), rsize));
16582 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16583
16584 /* String up. */
16585 if (roundup)
16586 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16587
16588 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16589 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16590 build_int_cst (TREE_TYPE (f_off), 0));
16591 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16592
16593 /* String up: make sure the assignment happens before the use. */
16594 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16595 COND_EXPR_ELSE (cond1) = t;
16596
16597   /* Prepare the trees handling the argument that is passed on the stack;
16598      the top-level node will be stored in ON_STACK. */
16599 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16600 if (align > 8)
16601 {
16602 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
16603 t = fold_build_pointer_plus_hwi (arg, 15);
16604 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16605 build_int_cst (TREE_TYPE (t), -16));
16606 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16607 }
16608 else
16609 roundup = NULL;
16610 /* Advance ap.__stack */
16611 t = fold_build_pointer_plus_hwi (arg, size + 7);
16612 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16613 build_int_cst (TREE_TYPE (t), -8));
16614 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16615 /* String up roundup and advance. */
16616 if (roundup)
16617 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16618 /* String up with arg */
16619 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16620 /* Big-endianness related address adjustment. */
16621 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16622 && size < UNITS_PER_WORD)
16623 {
16624 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16625 size_int (UNITS_PER_WORD - size));
16626 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16627 }
16628
16629 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16630 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16631
16632 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16633 t = off;
16634 if (adjust)
16635 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16636 build_int_cst (TREE_TYPE (off), adjust));
16637
16638 t = fold_convert (sizetype, t);
16639 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16640
16641 if (is_ha)
16642 {
16643 /* type ha; // treat as "struct {ftype field[n];}"
16644 ... [computing offs]
16645 for (i = 0; i <nregs; ++i, offs += 16)
16646 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16647 return ha; */
16648 int i;
16649 tree tmp_ha, field_t, field_ptr_t;
16650
16651 /* Declare a local variable. */
16652 tmp_ha = create_tmp_var_raw (type, "ha");
16653 gimple_add_tmp_var (tmp_ha);
16654
16655 /* Establish the base type. */
16656 switch (ag_mode)
16657 {
16658 case E_SFmode:
16659 field_t = float_type_node;
16660 field_ptr_t = float_ptr_type_node;
16661 break;
16662 case E_DFmode:
16663 field_t = double_type_node;
16664 field_ptr_t = double_ptr_type_node;
16665 break;
16666 case E_TFmode:
16667 field_t = long_double_type_node;
16668 field_ptr_t = long_double_ptr_type_node;
16669 break;
16670 case E_HFmode:
16671 field_t = aarch64_fp16_type_node;
16672 field_ptr_t = aarch64_fp16_ptr_type_node;
16673 break;
16674 case E_BFmode:
16675 field_t = aarch64_bf16_type_node;
16676 field_ptr_t = aarch64_bf16_ptr_type_node;
16677 break;
16678 case E_V2SImode:
16679 case E_V4SImode:
16680 {
16681 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16682 field_t = build_vector_type_for_mode (innertype, ag_mode);
16683 field_ptr_t = build_pointer_type (field_t);
16684 }
16685 break;
16686 default:
16687 gcc_assert (0);
16688 }
16689
16690       /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area);  */
16691 TREE_ADDRESSABLE (tmp_ha) = 1;
16692 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16693 addr = t;
16694 t = fold_convert (field_ptr_t, addr);
16695 t = build2 (MODIFY_EXPR, field_t,
16696 build1 (INDIRECT_REF, field_t, tmp_ha),
16697 build1 (INDIRECT_REF, field_t, t));
16698
16699 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16700 for (i = 1; i < nregs; ++i)
16701 {
16702 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16703 u = fold_convert (field_ptr_t, addr);
16704 u = build2 (MODIFY_EXPR, field_t,
16705 build2 (MEM_REF, field_t, tmp_ha,
16706 build_int_cst (field_ptr_t,
16707 (i *
16708 int_size_in_bytes (field_t)))),
16709 build1 (INDIRECT_REF, field_t, u));
16710 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16711 }
16712
16713 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16714 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16715 }
16716
16717 COND_EXPR_ELSE (cond2) = t;
16718 addr = fold_convert (build_pointer_type (type), cond1);
16719 addr = build_va_arg_indirect_ref (addr);
16720
16721 if (indirect_p)
16722 addr = build_va_arg_indirect_ref (addr);
16723
16724 return addr;
16725 }
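
/* Roughly (an illustrative sketch, not the exact trees built above), for
   va_arg (ap, double) the expansion behaves like:

     if (ap.__vr_offs >= 0)
       addr = next 8-byte-aligned stack slot;
     else
       {
         ap.__vr_offs += 16;
         if (ap.__vr_offs > 0)
           addr = next 8-byte-aligned stack slot;
         else
           addr = ap.__vr_top + old __vr_offs (+ big-endian adjustment);
       }
     result = *(double *) addr;  */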
16726
16727 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
16728
16729 static void
16730 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16731 const function_arg_info &arg,
16732 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16733 {
16734 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16735 CUMULATIVE_ARGS local_cum;
16736 int gr_saved = cfun->va_list_gpr_size;
16737 int vr_saved = cfun->va_list_fpr_size;
16738
16739 /* The caller has advanced CUM up to, but not beyond, the last named
16740 argument. Advance a local copy of CUM past the last "real" named
16741 argument, to find out how many registers are left over. */
16742 local_cum = *cum;
16743   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
16744
16745   /* Find out how many registers we need to save.
16746      Honor the tree-stdarg analysis results. */
16747 if (cfun->va_list_gpr_size)
16748 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16749 cfun->va_list_gpr_size / UNITS_PER_WORD);
16750 if (cfun->va_list_fpr_size)
16751 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16752 cfun->va_list_fpr_size / UNITS_PER_VREG);
16753
16754 if (!TARGET_FLOAT)
16755 {
16756 gcc_assert (local_cum.aapcs_nvrn == 0);
16757 vr_saved = 0;
16758 }
16759
16760 if (!no_rtl)
16761 {
16762 if (gr_saved > 0)
16763 {
16764 rtx ptr, mem;
16765
16766 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16767 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16768 - gr_saved * UNITS_PER_WORD);
16769 mem = gen_frame_mem (BLKmode, ptr);
16770 set_mem_alias_set (mem, get_varargs_alias_set ());
16771
16772 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16773 mem, gr_saved);
16774 }
16775 if (vr_saved > 0)
16776 {
16777 /* We can't use move_block_from_reg, because it will use
16778 the wrong mode, storing D regs only. */
16779 machine_mode mode = TImode;
16780 int off, i, vr_start;
16781
16782 /* Set OFF to the offset from virtual_incoming_args_rtx of
16783 the first vector register. The VR save area lies below
16784 the GR one, and is aligned to 16 bytes. */
16785 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16786 STACK_BOUNDARY / BITS_PER_UNIT);
16787 off -= vr_saved * UNITS_PER_VREG;
16788
16789 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16790 for (i = 0; i < vr_saved; ++i)
16791 {
16792 rtx ptr, mem;
16793
16794 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16795 mem = gen_frame_mem (mode, ptr);
16796 set_mem_alias_set (mem, get_varargs_alias_set ());
16797 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16798 off += UNITS_PER_VREG;
16799 }
16800 }
16801 }
16802
16803 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16804 any complication of having crtl->args.pretend_args_size changed. */
16805 cfun->machine->frame.saved_varargs_size
16806 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16807 STACK_BOUNDARY / BITS_PER_UNIT)
16808 + vr_saved * UNITS_PER_VREG);
16809 }
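
/* Layout sketch (illustrative, assuming the default va_list_gpr/fpr_size
   limits and FP registers available): for

     void g (int a, double b, ...);

   one GR (x0) and one VR (v0) are consumed by the named arguments, so the
   remaining seven x-registers (56 bytes) are dumped immediately below the
   incoming-argument pointer and the remaining seven q-registers (112 bytes)
   below that, after rounding to a 16-byte boundary; saved_varargs_size is
   therefore ROUND_UP (56, 16) + 112 = 176 bytes.  */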
16810
16811 static void
16812 aarch64_conditional_register_usage (void)
16813 {
16814 int i;
16815 if (!TARGET_FLOAT)
16816 {
16817 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16818 {
16819 fixed_regs[i] = 1;
16820 call_used_regs[i] = 1;
16821 }
16822 }
16823 if (!TARGET_SVE)
16824 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16825 {
16826 fixed_regs[i] = 1;
16827 call_used_regs[i] = 1;
16828 }
16829
16830 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16831 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16832 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16833
16834 /* When tracking speculation, we need a couple of call-clobbered registers
16835 to track the speculation state. It would be nice to just use
16836 IP0 and IP1, but currently there are numerous places that just
16837      assume these registers are free for other uses (e.g. pointer
16838 authentication). */
16839 if (aarch64_track_speculation)
16840 {
16841 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16842 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16843 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16844 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16845 }
16846 }
16847
16848 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16849
16850 bool
16851 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16852 {
16853 /* For records we're passed a FIELD_DECL, for arrays we're passed
16854 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16855 const_tree type = TREE_TYPE (field_or_array);
16856
16857 /* Assign BLKmode to anything that contains multiple SVE predicates.
16858 For structures, the "multiple" case is indicated by MODE being
16859 VOIDmode. */
16860 unsigned int num_zr, num_pr;
16861 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16862 {
16863 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16864 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16865 TYPE_SIZE (type));
16866 return mode == VOIDmode;
16867 }
16868
16869 return default_member_type_forces_blk (field_or_array, mode);
16870 }
16871
16872 /* Bitmasks that indicate whether earlier versions of GCC would have
16873 taken a different path through the ABI logic. This should result in
16874 a -Wpsabi warning if the earlier path led to a different ABI decision.
16875
16876 WARN_PSABI_EMPTY_CXX17_BASE
16877 Indicates that the type includes an artificial empty C++17 base field
16878 that, prior to GCC 10.1, would prevent the type from being treated as
16879 a HFA or HVA. See PR94383 for details.
16880
16881 WARN_PSABI_NO_UNIQUE_ADDRESS
16882 Indicates that the type includes an empty [[no_unique_address]] field
16883 that, prior to GCC 10.1, would prevent the type from being treated as
16884 a HFA or HVA. */
16885 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16886 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
16887
16888 /* Walk down the type tree of TYPE counting consecutive base elements.
16889 If *MODEP is VOIDmode, then set it to the first valid floating point
16890 type. If a non-floating point type is found, or if a floating point
16891 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16892 otherwise return the count in the sub-tree.
16893
16894 The WARN_PSABI_FLAGS argument allows the caller to check whether this
16895 function has changed its behavior relative to earlier versions of GCC.
16896 Normally the argument should be nonnull and point to a zero-initialized
16897 variable. The function then records whether the ABI decision might
16898 be affected by a known fix to the ABI logic, setting the associated
16899 WARN_PSABI_* bits if so.
16900
16901 When the argument is instead a null pointer, the function tries to
16902 simulate the behavior of GCC before all such ABI fixes were made.
16903 This is useful to check whether the function returns something
16904 different after the ABI fixes. */
16905 static int
16906 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
16907 unsigned int *warn_psabi_flags)
16908 {
16909 machine_mode mode;
16910 HOST_WIDE_INT size;
16911
16912 if (aarch64_sve::builtin_type_p (type))
16913 return -1;
16914
16915 switch (TREE_CODE (type))
16916 {
16917 case REAL_TYPE:
16918 mode = TYPE_MODE (type);
16919 if (mode != DFmode && mode != SFmode
16920 && mode != TFmode && mode != HFmode)
16921 return -1;
16922
16923 if (*modep == VOIDmode)
16924 *modep = mode;
16925
16926 if (*modep == mode)
16927 return 1;
16928
16929 break;
16930
16931 case COMPLEX_TYPE:
16932 mode = TYPE_MODE (TREE_TYPE (type));
16933 if (mode != DFmode && mode != SFmode
16934 && mode != TFmode && mode != HFmode)
16935 return -1;
16936
16937 if (*modep == VOIDmode)
16938 *modep = mode;
16939
16940 if (*modep == mode)
16941 return 2;
16942
16943 break;
16944
16945 case VECTOR_TYPE:
16946 /* Use V2SImode and V4SImode as representatives of all 64-bit
16947 and 128-bit vector types. */
16948 size = int_size_in_bytes (type);
16949 switch (size)
16950 {
16951 case 8:
16952 mode = V2SImode;
16953 break;
16954 case 16:
16955 mode = V4SImode;
16956 break;
16957 default:
16958 return -1;
16959 }
16960
16961 if (*modep == VOIDmode)
16962 *modep = mode;
16963
16964 /* Vector modes are considered to be opaque: two vectors are
16965 equivalent for the purposes of being homogeneous aggregates
16966 if they are the same size. */
16967 if (*modep == mode)
16968 return 1;
16969
16970 break;
16971
16972 case ARRAY_TYPE:
16973 {
16974 int count;
16975 tree index = TYPE_DOMAIN (type);
16976
16977 /* Can't handle incomplete types nor sizes that are not
16978 fixed. */
16979 if (!COMPLETE_TYPE_P (type)
16980 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16981 return -1;
16982
16983 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
16984 warn_psabi_flags);
16985 if (count == -1
16986 || !index
16987 || !TYPE_MAX_VALUE (index)
16988 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16989 || !TYPE_MIN_VALUE (index)
16990 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16991 || count < 0)
16992 return -1;
16993
16994 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16995 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16996
16997 /* There must be no padding. */
16998 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16999 count * GET_MODE_BITSIZE (*modep)))
17000 return -1;
17001
17002 return count;
17003 }
17004
17005 case RECORD_TYPE:
17006 {
17007 int count = 0;
17008 int sub_count;
17009 tree field;
17010
17011 /* Can't handle incomplete types nor sizes that are not
17012 fixed. */
17013 if (!COMPLETE_TYPE_P (type)
17014 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17015 return -1;
17016
17017 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17018 {
17019 if (TREE_CODE (field) != FIELD_DECL)
17020 continue;
17021
17022 if (DECL_FIELD_ABI_IGNORED (field))
17023 {
17024 /* See whether this is something that earlier versions of
17025 GCC failed to ignore. */
17026 unsigned int flag;
17027 if (lookup_attribute ("no_unique_address",
17028 DECL_ATTRIBUTES (field)))
17029 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
17030 else if (cxx17_empty_base_field_p (field))
17031 flag = WARN_PSABI_EMPTY_CXX17_BASE;
17032 else
17033 /* No compatibility problem. */
17034 continue;
17035
17036 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
17037 if (warn_psabi_flags)
17038 {
17039 *warn_psabi_flags |= flag;
17040 continue;
17041 }
17042 }
17043
17044 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
17045 warn_psabi_flags);
17046 if (sub_count < 0)
17047 return -1;
17048 count += sub_count;
17049 }
17050
17051 /* There must be no padding. */
17052 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17053 count * GET_MODE_BITSIZE (*modep)))
17054 return -1;
17055
17056 return count;
17057 }
17058
17059 case UNION_TYPE:
17060 case QUAL_UNION_TYPE:
17061 {
17062 /* These aren't very interesting except in a degenerate case. */
17063 int count = 0;
17064 int sub_count;
17065 tree field;
17066
17067 /* Can't handle incomplete types nor sizes that are not
17068 fixed. */
17069 if (!COMPLETE_TYPE_P (type)
17070 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17071 return -1;
17072
17073 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17074 {
17075 if (TREE_CODE (field) != FIELD_DECL)
17076 continue;
17077
17078 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
17079 warn_psabi_flags);
17080 if (sub_count < 0)
17081 return -1;
17082 count = count > sub_count ? count : sub_count;
17083 }
17084
17085 /* There must be no padding. */
17086 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17087 count * GET_MODE_BITSIZE (*modep)))
17088 return -1;
17089
17090 return count;
17091 }
17092
17093 default:
17094 break;
17095 }
17096
17097 return -1;
17098 }
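
/* Worked examples (illustrative): for

     struct hfa { float x, y, z; };

   the walk above finds three consecutive SFmode elements and returns 3, so
   the struct qualifies as an HFA; adding an "int w" member makes the walk
   return -1.  Similarly,

     struct hva { int32x4_t a; int32x4_t b; };

   counts two V4SImode elements and qualifies as an HVA.  */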
17099
17100 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
17101 type as described in AAPCS64 \S 4.1.2.
17102
17103 See the comment above aarch64_composite_type_p for the notes on MODE. */
17104
17105 static bool
17106 aarch64_short_vector_p (const_tree type,
17107 machine_mode mode)
17108 {
17109 poly_int64 size = -1;
17110
17111 if (type && TREE_CODE (type) == VECTOR_TYPE)
17112 {
17113 if (aarch64_sve::builtin_type_p (type))
17114 return false;
17115 size = int_size_in_bytes (type);
17116 }
17117 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
17118 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
17119 {
17120 /* Rely only on the type, not the mode, when processing SVE types. */
17121 if (type && aarch64_some_values_include_pst_objects_p (type))
17122 /* Leave later code to report an error if SVE is disabled. */
17123 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
17124 else
17125 size = GET_MODE_SIZE (mode);
17126 }
17127 if (known_eq (size, 8) || known_eq (size, 16))
17128 {
17129 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
17130 they are being treated as scalable AAPCS64 types. */
17131 gcc_assert (!aarch64_sve_mode_p (mode));
17132 return true;
17133 }
17134 return false;
17135 }
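
/* For example (illustrative): an 8-byte uint8x8_t or a 16-byte float32x4_t
   is a short vector in the AAPCS64 sense, whereas an SVE svfloat32_t is
   rejected above regardless of its size.  */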
17136
17137 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
17138 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
17139 array types. The C99 floating-point complex types are also considered
17140 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
17141 types, which are GCC extensions and out of the scope of AAPCS64, are
17142 treated as composite types here as well.
17143
17144 Note that MODE itself is not sufficient in determining whether a type
17145 is such a composite type or not. This is because
17146 stor-layout.c:compute_record_mode may have already changed the MODE
17147 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
17148 structure with only one field may have its MODE set to the mode of the
17149 field. Also an integer mode whose size matches the size of the
17150 RECORD_TYPE type may be used to substitute the original mode
17151 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
17152 solely relied on. */
17153
17154 static bool
17155 aarch64_composite_type_p (const_tree type,
17156 machine_mode mode)
17157 {
17158 if (aarch64_short_vector_p (type, mode))
17159 return false;
17160
17161 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
17162 return true;
17163
17164 if (mode == BLKmode
17165 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
17166 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
17167 return true;
17168
17169 return false;
17170 }
17171
17172 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
17173 shall be passed or returned in simd/fp register(s) (providing these
17174 parameter passing registers are available).
17175
17176 Upon successful return, *COUNT returns the number of needed registers,
17177 *BASE_MODE returns the mode of the individual register and when IS_HAF
17178 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
17179 floating-point aggregate or a homogeneous short-vector aggregate.
17180
17181 SILENT_P is true if the function should refrain from reporting any
17182 diagnostics. This should only be used if the caller is certain that
17183 any ABI decisions would eventually come through this function with
17184 SILENT_P set to false. */
17185
17186 static bool
17187 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
17188 const_tree type,
17189 machine_mode *base_mode,
17190 int *count,
17191 bool *is_ha,
17192 bool silent_p)
17193 {
17194 if (is_ha != NULL) *is_ha = false;
17195
17196 machine_mode new_mode = VOIDmode;
17197 bool composite_p = aarch64_composite_type_p (type, mode);
17198
17199 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17200 || aarch64_short_vector_p (type, mode))
17201 {
17202 *count = 1;
17203 new_mode = mode;
17204 }
17205 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17206 {
17207 if (is_ha != NULL) *is_ha = true;
17208 *count = 2;
17209 new_mode = GET_MODE_INNER (mode);
17210 }
17211 else if (type && composite_p)
17212 {
17213 unsigned int warn_psabi_flags = 0;
17214 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17215 &warn_psabi_flags);
17216 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17217 {
17218 static unsigned last_reported_type_uid;
17219 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17220 int alt;
17221 if (!silent_p
17222 && warn_psabi
17223 && warn_psabi_flags
17224 && uid != last_reported_type_uid
17225 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17226 != ag_count))
17227 {
17228 const char *url
17229 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
17230 gcc_assert (alt == -1);
17231 last_reported_type_uid = uid;
17232 /* Use TYPE_MAIN_VARIANT to strip any redundant const
17233 qualification. */
17234 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17235 inform (input_location, "parameter passing for argument of "
17236 "type %qT with %<[[no_unique_address]]%> members "
17237 "changed %{in GCC 10.1%}",
17238 TYPE_MAIN_VARIANT (type), url);
17239 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17240 inform (input_location, "parameter passing for argument of "
17241 "type %qT when C++17 is enabled changed to match "
17242 "C++14 %{in GCC 10.1%}",
17243 TYPE_MAIN_VARIANT (type), url);
17244 }
17245
17246 if (is_ha != NULL) *is_ha = true;
17247 *count = ag_count;
17248 }
17249 else
17250 return false;
17251 }
17252 else
17253 return false;
17254
17255 gcc_assert (!aarch64_sve_mode_p (new_mode));
17256 *base_mode = new_mode;
17257 return true;
17258 }
17259
17260 /* Implement TARGET_STRUCT_VALUE_RTX. */
17261
17262 static rtx
17263 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17264 int incoming ATTRIBUTE_UNUSED)
17265 {
17266 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17267 }
17268
17269 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
17270 static bool
17271 aarch64_vector_mode_supported_p (machine_mode mode)
17272 {
17273 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17274 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
17275 }
17276
17277 /* Return the full-width SVE vector mode for element mode MODE, if one
17278 exists. */
17279 opt_machine_mode
17280 aarch64_full_sve_mode (scalar_mode mode)
17281 {
17282 switch (mode)
17283 {
17284 case E_DFmode:
17285 return VNx2DFmode;
17286 case E_SFmode:
17287 return VNx4SFmode;
17288 case E_HFmode:
17289 return VNx8HFmode;
17290 case E_BFmode:
17291 return VNx8BFmode;
17292 case E_DImode:
17293 return VNx2DImode;
17294 case E_SImode:
17295 return VNx4SImode;
17296 case E_HImode:
17297 return VNx8HImode;
17298 case E_QImode:
17299 return VNx16QImode;
17300 default:
17301 return opt_machine_mode ();
17302 }
17303 }
17304
17305 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17306 if it exists. */
17307 opt_machine_mode
17308 aarch64_vq_mode (scalar_mode mode)
17309 {
17310 switch (mode)
17311 {
17312 case E_DFmode:
17313 return V2DFmode;
17314 case E_SFmode:
17315 return V4SFmode;
17316 case E_HFmode:
17317 return V8HFmode;
17318 case E_BFmode:
17319 return V8BFmode;
17320 case E_SImode:
17321 return V4SImode;
17322 case E_HImode:
17323 return V8HImode;
17324 case E_QImode:
17325 return V16QImode;
17326 case E_DImode:
17327 return V2DImode;
17328 default:
17329 return opt_machine_mode ();
17330 }
17331 }
17332
17333 /* Return the appropriate SIMD container mode
17334    for MODE within a vector of WIDTH bits. */
17335 static machine_mode
17336 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
17337 {
17338 if (TARGET_SVE
17339 && maybe_ne (width, 128)
17340 && known_eq (width, BITS_PER_SVE_VECTOR))
17341 return aarch64_full_sve_mode (mode).else_mode (word_mode);
17342
17343 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
17344 if (TARGET_SIMD)
17345 {
17346 if (known_eq (width, 128))
17347 return aarch64_vq_mode (mode).else_mode (word_mode);
17348 else
17349 switch (mode)
17350 {
17351 case E_SFmode:
17352 return V2SFmode;
17353 case E_HFmode:
17354 return V4HFmode;
17355 case E_BFmode:
17356 return V4BFmode;
17357 case E_SImode:
17358 return V2SImode;
17359 case E_HImode:
17360 return V4HImode;
17361 case E_QImode:
17362 return V8QImode;
17363 default:
17364 break;
17365 }
17366 }
17367 return word_mode;
17368 }
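
/* Examples (illustrative): with TARGET_SIMD enabled, SFmode in a 128-bit
   container gives V4SFmode and in a 64-bit container gives V2SFmode; with
   TARGET_SVE and WIDTH equal to BITS_PER_SVE_VECTOR (and not known to be
   128), SFmode gives VNx4SFmode instead.  */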
17369
17370 static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
17371
17372 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
17373 and return whether the SVE mode should be preferred over the
17374 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
17375 static bool
17376 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
17377 {
17378 /* Take into account the aarch64-autovec-preference param if non-zero. */
17379 bool only_asimd_p = aarch64_autovec_preference == 1;
17380 bool only_sve_p = aarch64_autovec_preference == 2;
17381
17382 if (only_asimd_p)
17383 return false;
17384 if (only_sve_p)
17385 return true;
17386
17387 /* The preference in case of a tie in costs. */
17388 bool prefer_asimd = aarch64_autovec_preference == 3;
17389 bool prefer_sve = aarch64_autovec_preference == 4;
17390
17391 aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width;
17392
17393 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
17394 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
17395 /* If the CPU information does not have an SVE width registered, use the
17396 generic poly_int comparison that prefers SVE. If a preference is
17397 explicitly requested, avoid this path. */
17398 if (tune_width == SVE_SCALABLE
17399 && !prefer_asimd
17400 && !prefer_sve)
17401 return maybe_gt (nunits_sve, nunits_asimd);
17402
17403 /* Otherwise estimate the runtime width of the modes involved. */
17404 HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve);
17405 HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd);
17406
17407 /* Preferring SVE means picking it first unless the Advanced SIMD mode
17408 is clearly wider. */
17409 if (prefer_sve)
17410 return est_sve >= est_asimd;
17411 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
17412 is clearly wider. */
17413 if (prefer_asimd)
17414 return est_sve > est_asimd;
17415
17416 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
17417 return est_sve > est_asimd;
17418 }
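/* For example, when comparing VNx16QImode against V16QImode with no
   explicit preference (aarch64_autovec_preference == 0):

     - with an unknown tuning width (SVE_SCALABLE) we use maybe_gt on the
       poly_int element counts, which prefers the SVE mode;
     - with a tuned SVE width of 128 bits both estimates are 16 elements,
       so the tie goes to Advanced SIMD;
     - with a tuned SVE width of 256 bits the SVE estimate (32 elements)
       exceeds the Advanced SIMD one (16), so SVE is preferred.  */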
17419
17420 /* Return 128-bit container as the preferred SIMD mode for MODE. */
17421 static machine_mode
17422 aarch64_preferred_simd_mode (scalar_mode mode)
17423 {
17424 /* Take into account explicit auto-vectorization ISA preferences through
17425 aarch64_cmp_autovec_modes. */
17426 poly_int64 bits
17427 = (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
17428 ? BITS_PER_SVE_VECTOR : 128;
17429 return aarch64_simd_container_mode (mode, bits);
17430 }
17431
17432 /* Return a list of possible vector sizes for the vectorizer
17433 to iterate over. */
17434 static unsigned int
17435 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
17436 {
17437 static const machine_mode sve_modes[] = {
17438 /* Try using full vectors for all element types. */
17439 VNx16QImode,
17440
17441 /* Try using 16-bit containers for 8-bit elements and full vectors
17442 for wider elements. */
17443 VNx8QImode,
17444
17445 /* Try using 32-bit containers for 8-bit and 16-bit elements and
17446 full vectors for wider elements. */
17447 VNx4QImode,
17448
17449 /* Try using 64-bit containers for all element types. */
17450 VNx2QImode
17451 };
17452
17453 static const machine_mode advsimd_modes[] = {
17454 /* Try using 128-bit vectors for all element types. */
17455 V16QImode,
17456
17457 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17458 for wider elements. */
17459 V8QImode,
17460
17461 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17462 for wider elements.
17463
17464 TODO: We could support a limited form of V4QImode too, so that
17465 we use 32-bit vectors for 8-bit elements. */
17466 V4HImode,
17467
17468 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17469 for 64-bit elements.
17470
17471 TODO: We could similarly support limited forms of V2QImode and V2HImode
17472 for this case. */
17473 V2SImode
17474 };
17475
17476 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17477 This is because:
17478
17479 - If we can't use N-byte Advanced SIMD vectors then the placement
17480 doesn't matter; we'll just continue as though the Advanced SIMD
17481 entry didn't exist.
17482
17483 - If an SVE main loop with N bytes ends up being cheaper than an
17484 Advanced SIMD main loop with N bytes then by default we'll replace
17485 the Advanced SIMD version with the SVE one.
17486
17487 - If an Advanced SIMD main loop with N bytes ends up being cheaper
17488 than an SVE main loop with N bytes then by default we'll try to
17489 use the SVE loop to vectorize the epilogue instead. */
17490
17491 bool only_asimd_p = aarch64_autovec_preference == 1;
17492 bool only_sve_p = aarch64_autovec_preference == 2;
17493
17494 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
17495 unsigned int advsimd_i = 0;
17496
17497 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
17498 {
17499 if (sve_i < ARRAY_SIZE (sve_modes)
17500 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
17501 advsimd_modes[advsimd_i]))
17502 modes->safe_push (sve_modes[sve_i++]);
17503 else
17504 modes->safe_push (advsimd_modes[advsimd_i++]);
17505 }
17506 while (sve_i < ARRAY_SIZE (sve_modes))
17507 modes->safe_push (sve_modes[sve_i++]);
17508
17509 unsigned int flags = 0;
17510 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17511 can compare SVE against Advanced SIMD and so that we can compare
17512 multiple SVE vectorization approaches against each other. There's
17513 not really any point doing this for Advanced SIMD only, since the
17514 first mode that works should always be the best. */
17515 if (TARGET_SVE && aarch64_sve_compare_costs)
17516 flags |= VECT_COMPARE_COSTS;
17517 return flags;
17518 }
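/* For example, with generic tuning (sve_width == SVE_SCALABLE), no explicit
   aarch64-autovec-preference setting and TARGET_SVE enabled, the comparison
   above prefers every SVE entry, so the list comes out as:

     VNx16QI, VNx8QI, VNx4QI, VNx2QI, V16QI, V8QI, V4HI, V2SI

   whereas with tuning that fixes the SVE width at 128 bits each Advanced
   SIMD entry is pushed ahead of the equal-sized SVE one.  */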
17519
17520 /* Implement TARGET_MANGLE_TYPE. */
17521
17522 static const char *
17523 aarch64_mangle_type (const_tree type)
17524 {
17525 /* The AArch64 ABI documents say that "__va_list" has to be
17526 mangled as if it is in the "std" namespace. */
17527 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17528 return "St9__va_list";
17529
17530 /* Half-precision floating point types. */
17531 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
17532 {
17533 if (TYPE_MODE (type) == BFmode)
17534 return "u6__bf16";
17535 else
17536 return "Dh";
17537 }
17538
17539 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
17540 builtin types. */
17541 if (TYPE_NAME (type) != NULL)
17542 {
17543 const char *res;
17544 if ((res = aarch64_general_mangle_builtin_type (type))
17545 || (res = aarch64_sve::mangle_builtin_type (type)))
17546 return res;
17547 }
17548
17549 /* Use the default mangling. */
17550 return NULL;
17551 }
17552
17553 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
17554
17555 static bool
17556 aarch64_verify_type_context (location_t loc, type_context_kind context,
17557 const_tree type, bool silent_p)
17558 {
17559 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17560 }
17561
17562 /* Find the first rtx_insn before insn that will generate an assembly
17563 instruction. */
17564
17565 static rtx_insn *
17566 aarch64_prev_real_insn (rtx_insn *insn)
17567 {
17568 if (!insn)
17569 return NULL;
17570
17571 do
17572 {
17573 insn = prev_real_insn (insn);
17574 }
17575 while (insn && recog_memoized (insn) < 0);
17576
17577 return insn;
17578 }
17579
17580 static bool
17581 is_madd_op (enum attr_type t1)
17582 {
17583 unsigned int i;
17584 /* A number of these may be AArch32 only. */
17585 enum attr_type mlatypes[] = {
17586 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17587 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17588 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17589 };
17590
17591 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17592 {
17593 if (t1 == mlatypes[i])
17594 return true;
17595 }
17596
17597 return false;
17598 }
17599
17600 /* Check if there is a register dependency between a load and the insn
17601 for which we hold recog_data. */
17602
17603 static bool
17604 dep_between_memop_and_curr (rtx memop)
17605 {
17606 rtx load_reg;
17607 int opno;
17608
17609 gcc_assert (GET_CODE (memop) == SET);
17610
17611 if (!REG_P (SET_DEST (memop)))
17612 return false;
17613
17614 load_reg = SET_DEST (memop);
17615 for (opno = 1; opno < recog_data.n_operands; opno++)
17616 {
17617 rtx operand = recog_data.operand[opno];
17618 if (REG_P (operand)
17619 && reg_overlap_mentioned_p (load_reg, operand))
17620 return true;
17621
17622 }
17623 return false;
17624 }
17625
17626
17627 /* When working around the Cortex-A53 erratum 835769,
17628 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17629 instruction and has a preceding memory instruction such that a NOP
17630 should be inserted between them. */
17631
17632 bool
17633 aarch64_madd_needs_nop (rtx_insn* insn)
17634 {
17635 enum attr_type attr_type;
17636 rtx_insn *prev;
17637 rtx body;
17638
17639 if (!TARGET_FIX_ERR_A53_835769)
17640 return false;
17641
17642 if (!INSN_P (insn) || recog_memoized (insn) < 0)
17643 return false;
17644
17645 attr_type = get_attr_type (insn);
17646 if (!is_madd_op (attr_type))
17647 return false;
17648
17649 prev = aarch64_prev_real_insn (insn);
17650 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17651 Restore recog state to INSN to avoid state corruption. */
17652 extract_constrain_insn_cached (insn);
17653
17654 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17655 return false;
17656
17657 body = single_set (prev);
17658
17659 /* If the previous insn is a memory op and there is no dependency between
17660 it and the DImode madd, emit a NOP between them. If body is NULL then we
17661 have a complex memory operation, probably a load/store pair.
17662 Be conservative for now and emit a NOP. */
17663 if (GET_MODE (recog_data.operand[0]) == DImode
17664 && (!body || !dep_between_memop_and_curr (body)))
17665 return true;
17666
17667 return false;
17668
17669 }
17670
17671
17672 /* Implement FINAL_PRESCAN_INSN. */
17673
17674 void
17675 aarch64_final_prescan_insn (rtx_insn *insn)
17676 {
17677 if (aarch64_madd_needs_nop (insn))
17678 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17679 }
17680
17681
17682 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17683 instruction. */
17684
17685 bool
17686 aarch64_sve_index_immediate_p (rtx base_or_step)
17687 {
17688 return (CONST_INT_P (base_or_step)
17689 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17690 }
17691
17692 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17693 when applied to mode MODE. Negate X first if NEGATE_P is true. */
17694
17695 bool
17696 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17697 {
17698 rtx elt = unwrap_const_vec_duplicate (x);
17699 if (!CONST_INT_P (elt))
17700 return false;
17701
17702 HOST_WIDE_INT val = INTVAL (elt);
17703 if (negate_p)
17704 val = -val;
17705 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17706
17707 if (val & 0xff)
17708 return IN_RANGE (val, 0, 0xff);
17709 return IN_RANGE (val, 0, 0xff00);
17710 }
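/* The test above corresponds to the "#imm" and "#imm, LSL #8" forms of the
   SVE ADD/SUB immediate.  E.g., after any requested negation and masking to
   the element width:

     0x23    -> valid   (unshifted byte)
     0x2300  -> valid   (#0x23, LSL #8)
     0x123   -> invalid (needs bits in both bytes)
     0x12300 -> invalid (shifted value wider than a byte)  */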
17711
17712 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17713 instructions when applied to mode MODE. Negate X first if NEGATE_P
17714 is true. */
17715
17716 bool
17717 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17718 {
17719 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17720 return false;
17721
17722 /* After the optional negation, the immediate must be nonnegative.
17723 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17724 instead of SQADD Zn.B, Zn.B, #129. */
17725 rtx elt = unwrap_const_vec_duplicate (x);
17726 return negate_p == (INTVAL (elt) < 0);
17727 }
17728
17729 /* Return true if X is a valid immediate operand for an SVE logical
17730 instruction such as AND. */
17731
17732 bool
17733 aarch64_sve_bitmask_immediate_p (rtx x)
17734 {
17735 rtx elt;
17736
17737 return (const_vec_duplicate_p (x, &elt)
17738 && CONST_INT_P (elt)
17739 && aarch64_bitmask_imm (INTVAL (elt),
17740 GET_MODE_INNER (GET_MODE (x))));
17741 }
17742
17743 /* Return true if X is a valid immediate for the SVE DUP and CPY
17744 instructions. */
17745
17746 bool
17747 aarch64_sve_dup_immediate_p (rtx x)
17748 {
17749 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17750 if (!CONST_INT_P (x))
17751 return false;
17752
17753 HOST_WIDE_INT val = INTVAL (x);
17754 if (val & 0xff)
17755 return IN_RANGE (val, -0x80, 0x7f);
17756 return IN_RANGE (val, -0x8000, 0x7f00);
17757 }
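/* I.e. the immediate is either a signed byte (DUP ..., #imm) or a signed
   byte shifted left by 8 (DUP ..., #imm, LSL #8).  E.g.:

     0x7f   -> valid
     -0x80  -> valid
     0x7f00 -> valid   (#0x7f, LSL #8)
     0x7f01 -> invalid (both bytes populated)
     0x8000 -> invalid (out of range even with LSL #8)  */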
17758
17759 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17760 SIGNED_P says whether the operand is signed rather than unsigned. */
17761
17762 bool
17763 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17764 {
17765 x = unwrap_const_vec_duplicate (x);
17766 return (CONST_INT_P (x)
17767 && (signed_p
17768 ? IN_RANGE (INTVAL (x), -16, 15)
17769 : IN_RANGE (INTVAL (x), 0, 127)));
17770 }
17771
17772 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17773 instruction. Negate X first if NEGATE_P is true. */
17774
17775 bool
17776 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17777 {
17778 rtx elt;
17779 REAL_VALUE_TYPE r;
17780
17781 if (!const_vec_duplicate_p (x, &elt)
17782 || !CONST_DOUBLE_P (elt))
17783 return false;
17784
17785 r = *CONST_DOUBLE_REAL_VALUE (elt);
17786
17787 if (negate_p)
17788 r = real_value_negate (&r);
17789
17790 if (real_equal (&r, &dconst1))
17791 return true;
17792 if (real_equal (&r, &dconsthalf))
17793 return true;
17794 return false;
17795 }
17796
17797 /* Return true if X is a valid immediate operand for an SVE FMUL
17798 instruction. */
17799
17800 bool
17801 aarch64_sve_float_mul_immediate_p (rtx x)
17802 {
17803 rtx elt;
17804
17805 return (const_vec_duplicate_p (x, &elt)
17806 && CONST_DOUBLE_P (elt)
17807 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17808 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17809 }
17810
17811 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17812 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17813 is nonnull, use it to describe valid immediates. */
17814 static bool
17815 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17816 simd_immediate_info *info,
17817 enum simd_immediate_check which,
17818 simd_immediate_info::insn_type insn)
17819 {
17820 /* Try a 4-byte immediate with LSL. */
17821 for (unsigned int shift = 0; shift < 32; shift += 8)
17822 if ((val32 & (0xff << shift)) == val32)
17823 {
17824 if (info)
17825 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17826 simd_immediate_info::LSL, shift);
17827 return true;
17828 }
17829
17830 /* Try a 2-byte immediate with LSL. */
17831 unsigned int imm16 = val32 & 0xffff;
17832 if (imm16 == (val32 >> 16))
17833 for (unsigned int shift = 0; shift < 16; shift += 8)
17834 if ((imm16 & (0xff << shift)) == imm16)
17835 {
17836 if (info)
17837 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17838 simd_immediate_info::LSL, shift);
17839 return true;
17840 }
17841
17842 /* Try a 4-byte immediate with MSL, except for cases that MVN
17843 can handle. */
17844 if (which == AARCH64_CHECK_MOV)
17845 for (unsigned int shift = 8; shift < 24; shift += 8)
17846 {
17847 unsigned int low = (1 << shift) - 1;
17848 if (((val32 & (0xff << shift)) | low) == val32)
17849 {
17850 if (info)
17851 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17852 simd_immediate_info::MSL, shift);
17853 return true;
17854 }
17855 }
17856
17857 return false;
17858 }
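/* Worked examples (illustrative) of the immediates recorded in INFO:

     val32 == 0x00ab0000 -> (SImode, 0xab, LSL, 16)
     val32 == 0xab00ab00 -> (HImode, 0xab, LSL, 8)
     val32 == 0x0000abff -> (SImode, 0xab, MSL, 8), tried only for
                            AARCH64_CHECK_MOV, since the trailing ones
                            come from the MSL shift  */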
17859
17860 /* Return true if replicating VAL64 is a valid immediate for the
17861 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17862 use it to describe valid immediates. */
17863 static bool
17864 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17865 simd_immediate_info *info,
17866 enum simd_immediate_check which)
17867 {
17868 unsigned int val32 = val64 & 0xffffffff;
17869 unsigned int val16 = val64 & 0xffff;
17870 unsigned int val8 = val64 & 0xff;
17871
17872 if (val32 == (val64 >> 32))
17873 {
17874 if ((which & AARCH64_CHECK_ORR) != 0
17875 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17876 simd_immediate_info::MOV))
17877 return true;
17878
17879 if ((which & AARCH64_CHECK_BIC) != 0
17880 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17881 simd_immediate_info::MVN))
17882 return true;
17883
17884 /* Try using a replicated byte. */
17885 if (which == AARCH64_CHECK_MOV
17886 && val16 == (val32 >> 16)
17887 && val8 == (val16 >> 8))
17888 {
17889 if (info)
17890 *info = simd_immediate_info (QImode, val8);
17891 return true;
17892 }
17893 }
17894
17895 /* Try using a bit-to-bytemask. */
17896 if (which == AARCH64_CHECK_MOV)
17897 {
17898 unsigned int i;
17899 for (i = 0; i < 64; i += 8)
17900 {
17901 unsigned char byte = (val64 >> i) & 0xff;
17902 if (byte != 0 && byte != 0xff)
17903 break;
17904 }
17905 if (i == 64)
17906 {
17907 if (info)
17908 *info = simd_immediate_info (DImode, val64);
17909 return true;
17910 }
17911 }
17912 return false;
17913 }
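/* E.g. for the bit-to-bytemask form: val64 == 0xff00ffff00ff0000 has every
   byte equal to 0x00 or 0xff, so it is accepted as a 64-bit MOV immediate
   even though no narrower replicated form matches.  */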
17914
17915 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17916 instruction. If INFO is nonnull, use it to describe valid immediates. */
17917
17918 static bool
17919 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17920 simd_immediate_info *info)
17921 {
17922 scalar_int_mode mode = DImode;
17923 unsigned int val32 = val64 & 0xffffffff;
17924 if (val32 == (val64 >> 32))
17925 {
17926 mode = SImode;
17927 unsigned int val16 = val32 & 0xffff;
17928 if (val16 == (val32 >> 16))
17929 {
17930 mode = HImode;
17931 unsigned int val8 = val16 & 0xff;
17932 if (val8 == (val16 >> 8))
17933 mode = QImode;
17934 }
17935 }
17936 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17937 if (IN_RANGE (val, -0x80, 0x7f))
17938 {
17939 /* DUP with no shift. */
17940 if (info)
17941 *info = simd_immediate_info (mode, val);
17942 return true;
17943 }
17944 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17945 {
17946 /* DUP with LSL #8. */
17947 if (info)
17948 *info = simd_immediate_info (mode, val);
17949 return true;
17950 }
17951 if (aarch64_bitmask_imm (val64, mode))
17952 {
17953 /* DUPM. */
17954 if (info)
17955 *info = simd_immediate_info (mode, val);
17956 return true;
17957 }
17958 return false;
17959 }
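/* E.g.:

     0x0101010101010101 -> narrows to QImode 1      -> DUP .B, #1
     0x2000200020002000 -> narrows to HImode 0x2000 -> DUP .H, #32, LSL #8
     0x00ff00ff00ff00ff -> narrows to HImode 0xff, which is out of DUP
                           range but is a valid bitmask immediate -> DUPM  */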
17960
17961 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17962
17963 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17964
17965 where PATTERN is the svpattern as a CONST_INT and where ZERO
17966 is a zero constant of the required PTRUE mode (which can have
17967 fewer elements than X's mode, if zero bits are significant).
17968
17969 If so, and if INFO is nonnull, describe the immediate in INFO. */
17970 bool
17971 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17972 {
17973 if (GET_CODE (x) != CONST)
17974 return false;
17975
17976 x = XEXP (x, 0);
17977 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17978 return false;
17979
17980 if (info)
17981 {
17982 aarch64_svpattern pattern
17983 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17984 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17985 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17986 *info = simd_immediate_info (int_mode, pattern);
17987 }
17988 return true;
17989 }
17990
17991 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17992 it to describe valid immediates. */
17993
17994 static bool
17995 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17996 {
17997 if (aarch64_sve_ptrue_svpattern_p (x, info))
17998 return true;
17999
18000 if (x == CONST0_RTX (GET_MODE (x)))
18001 {
18002 if (info)
18003 *info = simd_immediate_info (DImode, 0);
18004 return true;
18005 }
18006
18007 /* Analyze the value as a VNx16BImode. This should be relatively
18008 efficient, since rtx_vector_builder has enough built-in capacity
18009 to store all VLA predicate constants without needing the heap. */
18010 rtx_vector_builder builder;
18011 if (!aarch64_get_sve_pred_bits (builder, x))
18012 return false;
18013
18014 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
18015 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
18016 {
18017 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
18018 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
18019 if (pattern != AARCH64_NUM_SVPATTERNS)
18020 {
18021 if (info)
18022 {
18023 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
18024 *info = simd_immediate_info (int_mode, pattern);
18025 }
18026 return true;
18027 }
18028 }
18029 return false;
18030 }
18031
18032 /* Return true if OP is a valid SIMD immediate for the operation
18033 described by WHICH. If INFO is nonnull, use it to describe valid
18034 immediates. */
18035 bool
18036 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
18037 enum simd_immediate_check which)
18038 {
18039 machine_mode mode = GET_MODE (op);
18040 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18041 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
18042 return false;
18043
18044 if (vec_flags & VEC_SVE_PRED)
18045 return aarch64_sve_pred_valid_immediate (op, info);
18046
18047 scalar_mode elt_mode = GET_MODE_INNER (mode);
18048 rtx base, step;
18049 unsigned int n_elts;
18050 if (GET_CODE (op) == CONST_VECTOR
18051 && CONST_VECTOR_DUPLICATE_P (op))
18052 n_elts = CONST_VECTOR_NPATTERNS (op);
18053 else if ((vec_flags & VEC_SVE_DATA)
18054 && const_vec_series_p (op, &base, &step))
18055 {
18056 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
18057 if (!aarch64_sve_index_immediate_p (base)
18058 || !aarch64_sve_index_immediate_p (step))
18059 return false;
18060
18061 if (info)
18062 {
18063 /* Get the corresponding container mode. E.g. an INDEX on V2SI
18064 should yield two integer values per 128-bit block, meaning
18065 that we need to treat it in the same way as V2DI and then
18066 ignore the upper 32 bits of each element. */
18067 elt_mode = aarch64_sve_container_int_mode (mode);
18068 *info = simd_immediate_info (elt_mode, base, step);
18069 }
18070 return true;
18071 }
18072 else if (GET_CODE (op) == CONST_VECTOR
18073 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
18074 /* N_ELTS set above. */;
18075 else
18076 return false;
18077
18078 scalar_float_mode elt_float_mode;
18079 if (n_elts == 1
18080 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
18081 {
18082 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
18083 if (aarch64_float_const_zero_rtx_p (elt)
18084 || aarch64_float_const_representable_p (elt))
18085 {
18086 if (info)
18087 *info = simd_immediate_info (elt_float_mode, elt);
18088 return true;
18089 }
18090 }
18091
18092 /* If all elements in an SVE vector have the same value, we have a free
18093 choice between using the element mode and using the container mode.
18094 Using the element mode means that unused parts of the vector are
18095 duplicates of the used elements, while using the container mode means
18096 that the unused parts are an extension of the used elements. Using the
18097 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
18098 for its container mode VNx4SI while 0x00000101 isn't.
18099
18100 If not all elements in an SVE vector have the same value, we need the
18101 transition from one element to the next to occur at container boundaries.
18102 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
18103 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
18104 scalar_int_mode elt_int_mode;
18105 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
18106 elt_int_mode = aarch64_sve_container_int_mode (mode);
18107 else
18108 elt_int_mode = int_mode_for_mode (elt_mode).require ();
18109
18110 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
18111 if (elt_size > 8)
18112 return false;
18113
18114 /* Expand the vector constant out into a byte vector, with the least
18115 significant byte of the register first. */
18116 auto_vec<unsigned char, 16> bytes;
18117 bytes.reserve (n_elts * elt_size);
18118 for (unsigned int i = 0; i < n_elts; i++)
18119 {
18120 /* The vector is provided in GCC's endian-neutral fashion.
18121 For aarch64_be Advanced SIMD, it must be laid out in the vector
18122 register in reverse order. */
18123 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
18124 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
18125
18126 if (elt_mode != elt_int_mode)
18127 elt = gen_lowpart (elt_int_mode, elt);
18128
18129 if (!CONST_INT_P (elt))
18130 return false;
18131
18132 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
18133 for (unsigned int byte = 0; byte < elt_size; byte++)
18134 {
18135 bytes.quick_push (elt_val & 0xff);
18136 elt_val >>= BITS_PER_UNIT;
18137 }
18138 }
18139
18140 /* The immediate must repeat every eight bytes. */
18141 unsigned int nbytes = bytes.length ();
18142 for (unsigned i = 8; i < nbytes; ++i)
18143 if (bytes[i] != bytes[i - 8])
18144 return false;
18145
18146 /* Get the repeating 8-byte value as an integer. No endian correction
18147 is needed here because bytes is already in lsb-first order. */
18148 unsigned HOST_WIDE_INT val64 = 0;
18149 for (unsigned int i = 0; i < 8; i++)
18150 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
18151 << (i * BITS_PER_UNIT));
18152
18153 if (vec_flags & VEC_SVE_DATA)
18154 return aarch64_sve_valid_immediate (val64, info);
18155 else
18156 return aarch64_advsimd_valid_immediate (val64, info, which);
18157 }
18158
18159 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
18160 has a step in the range of INDEX. Return the index expression if so,
18161 otherwise return null. */
18162 rtx
18163 aarch64_check_zero_based_sve_index_immediate (rtx x)
18164 {
18165 rtx base, step;
18166 if (const_vec_series_p (x, &base, &step)
18167 && base == const0_rtx
18168 && aarch64_sve_index_immediate_p (step))
18169 return step;
18170 return NULL_RTX;
18171 }
18172
18173 /* Check whether immediate shift constants are within range. */
18174 bool
18175 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
18176 {
18177 x = unwrap_const_vec_duplicate (x);
18178 if (!CONST_INT_P (x))
18179 return false;
18180 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
18181 if (left)
18182 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
18183 else
18184 return IN_RANGE (INTVAL (x), 1, bit_width);
18185 }
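/* E.g. for V4SImode (32-bit elements): immediate left shifts of 0-31 are
   accepted, while immediate right shifts must be in the range 1-32,
   matching the shift amounts the vector shift instructions can encode.  */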
18186
18187 /* Return the bitmask CONST_INT to select the bits required by a zero extract
18188 operation of width WIDTH at bit position POS. */
18189
18190 rtx
18191 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
18192 {
18193 gcc_assert (CONST_INT_P (width));
18194 gcc_assert (CONST_INT_P (pos));
18195
18196 unsigned HOST_WIDE_INT mask
18197 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
18198 return GEN_INT (mask << UINTVAL (pos));
18199 }
18200
18201 bool
18202 aarch64_mov_operand_p (rtx x, machine_mode mode)
18203 {
18204 if (GET_CODE (x) == HIGH
18205 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
18206 return true;
18207
18208 if (CONST_INT_P (x))
18209 return true;
18210
18211 if (VECTOR_MODE_P (GET_MODE (x)))
18212 {
18213 /* Require predicate constants to be VNx16BI before RA, so that we
18214 force everything to have a canonical form. */
18215 if (!lra_in_progress
18216 && !reload_completed
18217 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
18218 && GET_MODE (x) != VNx16BImode)
18219 return false;
18220
18221 return aarch64_simd_valid_immediate (x, NULL);
18222 }
18223
18224 x = strip_salt (x);
18225 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
18226 return true;
18227
18228 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
18229 return true;
18230
18231 return aarch64_classify_symbolic_expression (x)
18232 == SYMBOL_TINY_ABSOLUTE;
18233 }
18234
18235 /* Return a const_int vector of VAL. */
18236 rtx
18237 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
18238 {
18239 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
18240 return gen_const_vec_duplicate (mode, c);
18241 }
18242
18243 /* Check OP is a legal scalar immediate for the MOVI instruction. */
18244
18245 bool
18246 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
18247 {
18248 machine_mode vmode;
18249
18250 vmode = aarch64_simd_container_mode (mode, 64);
18251 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
18252 return aarch64_simd_valid_immediate (op_v, NULL);
18253 }
18254
18255 /* Construct and return a PARALLEL RTX vector with elements numbering the
18256 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18257 the vector - from the perspective of the architecture. This does not
18258 line up with GCC's perspective on lane numbers, so we end up with
18259 different masks depending on our target endian-ness. The diagram
18260 below may help. We must draw the distinction when building masks
18261 which select one half of the vector. An instruction selecting
18262 architectural low-lanes for a big-endian target, must be described using
18263 a mask selecting GCC high-lanes.
18264
18265 Big-Endian Little-Endian
18266
18267 GCC 0 1 2 3 3 2 1 0
18268 | x | x | x | x | | x | x | x | x |
18269 Architecture 3 2 1 0 3 2 1 0
18270
18271 Low Mask: { 2, 3 } { 0, 1 }
18272 High Mask: { 0, 1 } { 2, 3 }
18273
18274 MODE Is the mode of the vector and NUNITS is the number of units in it. */
18275
18276 rtx
18277 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
18278 {
18279 rtvec v = rtvec_alloc (nunits / 2);
18280 int high_base = nunits / 2;
18281 int low_base = 0;
18282 int base;
18283 rtx t1;
18284 int i;
18285
18286 if (BYTES_BIG_ENDIAN)
18287 base = high ? low_base : high_base;
18288 else
18289 base = high ? high_base : low_base;
18290
18291 for (i = 0; i < nunits / 2; i++)
18292 RTVEC_ELT (v, i) = GEN_INT (base + i);
18293
18294 t1 = gen_rtx_PARALLEL (mode, v);
18295 return t1;
18296 }
18297
18298 /* Check OP for validity as a PARALLEL RTX vector with elements
18299 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
18300 from the perspective of the architecture. See the diagram above
18301 aarch64_simd_vect_par_cnst_half for more details. */
18302
18303 bool
18304 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
18305 bool high)
18306 {
18307 int nelts;
18308 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
18309 return false;
18310
18311 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
18312 HOST_WIDE_INT count_op = XVECLEN (op, 0);
18313 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18314 int i = 0;
18315
18316 if (count_op != count_ideal)
18317 return false;
18318
18319 for (i = 0; i < count_ideal; i++)
18320 {
18321 rtx elt_op = XVECEXP (op, 0, i);
18322 rtx elt_ideal = XVECEXP (ideal, 0, i);
18323
18324 if (!CONST_INT_P (elt_op)
18325 || INTVAL (elt_ideal) != INTVAL (elt_op))
18326 return false;
18327 }
18328 return true;
18329 }
18330
18331 /* Return a PARALLEL containing NELTS elements, with element I equal
18332 to BASE + I * STEP. */
18333
18334 rtx
18335 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18336 {
18337 rtvec vec = rtvec_alloc (nelts);
18338 for (unsigned int i = 0; i < nelts; ++i)
18339 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18340 return gen_rtx_PARALLEL (VOIDmode, vec);
18341 }
18342
18343 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18344 series with step STEP. */
18345
18346 bool
18347 aarch64_stepped_int_parallel_p (rtx op, int step)
18348 {
18349 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18350 return false;
18351
18352 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18353 for (int i = 1; i < XVECLEN (op, 0); ++i)
18354 if (!CONST_INT_P (XVECEXP (op, 0, i))
18355 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18356 return false;
18357
18358 return true;
18359 }
18360
18361 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
18362 HIGH (exclusive). */
18363 void
18364 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18365 const_tree exp)
18366 {
18367 HOST_WIDE_INT lane;
18368 gcc_assert (CONST_INT_P (operand));
18369 lane = INTVAL (operand);
18370
18371 if (lane < low || lane >= high)
18372 {
18373 if (exp)
18374 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
18375 else
18376 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
18377 }
18378 }
18379
18380 /* Peform endian correction on lane number N, which indexes a vector
18381 of mode MODE, and return the result as an SImode rtx. */
18382
18383 rtx
18384 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18385 {
18386 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18387 }
18388
18389 /* Return TRUE if OP is a valid vector addressing mode. */
18390
18391 bool
18392 aarch64_simd_mem_operand_p (rtx op)
18393 {
18394 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
18395 || REG_P (XEXP (op, 0)));
18396 }
18397
18398 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
18399
18400 bool
18401 aarch64_sve_ld1r_operand_p (rtx op)
18402 {
18403 struct aarch64_address_info addr;
18404 scalar_mode mode;
18405
18406 return (MEM_P (op)
18407 && is_a <scalar_mode> (GET_MODE (op), &mode)
18408 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18409 && addr.type == ADDRESS_REG_IMM
18410 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18411 }
18412
18413 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18414 where the size of the read data is specified by `mode` and the size of the
18415 vector elements is specified by `elem_mode`. */
18416 bool
18417 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18418 scalar_mode elem_mode)
18419 {
18420 struct aarch64_address_info addr;
18421 if (!MEM_P (op)
18422 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18423 return false;
18424
18425 if (addr.type == ADDRESS_REG_IMM)
18426 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
18427
18428 if (addr.type == ADDRESS_REG_REG)
18429 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18430
18431 return false;
18432 }
18433
18434 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
18435 bool
18436 aarch64_sve_ld1rq_operand_p (rtx op)
18437 {
18438 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18439 GET_MODE_INNER (GET_MODE (op)));
18440 }
18441
18442 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18443 accessing a vector where the element size is specified by `elem_mode`. */
18444 bool
18445 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18446 {
18447 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18448 }
18449
18450 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
18451 bool
18452 aarch64_sve_ldff1_operand_p (rtx op)
18453 {
18454 if (!MEM_P (op))
18455 return false;
18456
18457 struct aarch64_address_info addr;
18458 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18459 return false;
18460
18461 if (addr.type == ADDRESS_REG_IMM)
18462 return known_eq (addr.const_offset, 0);
18463
18464 return addr.type == ADDRESS_REG_REG;
18465 }
18466
18467 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
18468 bool
18469 aarch64_sve_ldnf1_operand_p (rtx op)
18470 {
18471 struct aarch64_address_info addr;
18472
18473 return (MEM_P (op)
18474 && aarch64_classify_address (&addr, XEXP (op, 0),
18475 GET_MODE (op), false)
18476 && addr.type == ADDRESS_REG_IMM);
18477 }
18478
18479 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18480 The conditions for STR are the same. */
18481 bool
18482 aarch64_sve_ldr_operand_p (rtx op)
18483 {
18484 struct aarch64_address_info addr;
18485
18486 return (MEM_P (op)
18487 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18488 false, ADDR_QUERY_ANY)
18489 && addr.type == ADDRESS_REG_IMM);
18490 }
18491
18492 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18493 addressing memory of mode MODE. */
18494 bool
18495 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18496 {
18497 struct aarch64_address_info addr;
18498 if (!aarch64_classify_address (&addr, op, mode, false))
18499 return false;
18500
18501 if (addr.type == ADDRESS_REG_IMM)
18502 return known_eq (addr.const_offset, 0);
18503
18504 return addr.type == ADDRESS_REG_REG;
18505 }
18506
18507 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18508 We need to be able to access the individual pieces, so the range
18509 is different from LD[234] and ST[234]. */
18510 bool
18511 aarch64_sve_struct_memory_operand_p (rtx op)
18512 {
18513 if (!MEM_P (op))
18514 return false;
18515
18516 machine_mode mode = GET_MODE (op);
18517 struct aarch64_address_info addr;
18518 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18519 ADDR_QUERY_ANY)
18520 || addr.type != ADDRESS_REG_IMM)
18521 return false;
18522
18523 poly_int64 first = addr.const_offset;
18524 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18525 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18526 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18527 }
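/* E.g. for a three-vector tuple mode, an offset of 5 vectors is accepted
   because both the first piece (+5 vectors) and the last piece (+7 vectors)
   lie in the supported [-8, 7] range, whereas an offset of 6 vectors is
   rejected because the last piece would be at +8.  */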
18528
18529 /* Emit a register copy from operand to operand, taking care not to
18530 early-clobber source registers in the process.
18531
18532 COUNT is the number of components into which the copy needs to be
18533 decomposed. */
18534 void
18535 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
18536 unsigned int count)
18537 {
18538 unsigned int i;
18539 int rdest = REGNO (operands[0]);
18540 int rsrc = REGNO (operands[1]);
18541
18542 if (!reg_overlap_mentioned_p (operands[0], operands[1])
18543 || rdest < rsrc)
18544 for (i = 0; i < count; i++)
18545 emit_move_insn (gen_rtx_REG (mode, rdest + i),
18546 gen_rtx_REG (mode, rsrc + i));
18547 else
18548 for (i = 0; i < count; i++)
18549 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18550 gen_rtx_REG (mode, rsrc + count - i - 1));
18551 }
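/* E.g. copying a two-register value from V9:V10 to V10:V11 overlaps with
   RDEST > RSRC, so the loop above copies the highest-numbered component
   first (V11 <- V10, then V10 <- V9) to avoid clobbering V10 before it has
   been read.  */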
18552
18553 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
18554 one of VSTRUCT modes: OI, CI, or XI. */
18555 int
18556 aarch64_simd_attr_length_rglist (machine_mode mode)
18557 {
18558 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
18559 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
18560 }
18561
18562 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
18563 alignment of a vector to 128 bits. SVE predicates have an alignment of
18564 16 bits. */
18565 static HOST_WIDE_INT
18566 aarch64_simd_vector_alignment (const_tree type)
18567 {
18568 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18569 be set for non-predicate vectors of booleans. Modes are the most
18570 direct way we have of identifying real SVE predicate types. */
18571 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18572 return 16;
18573 widest_int min_size
18574 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18575 return wi::umin (min_size, 128).to_uhwi ();
18576 }
18577
18578 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
18579 static poly_uint64
18580 aarch64_vectorize_preferred_vector_alignment (const_tree type)
18581 {
18582 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18583 {
18584 /* If the length of the vector is fixed, try to align to that length,
18585 otherwise don't try to align at all. */
18586 HOST_WIDE_INT result;
18587 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18588 result = TYPE_ALIGN (TREE_TYPE (type));
18589 return result;
18590 }
18591 return TYPE_ALIGN (type);
18592 }
18593
18594 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
18595 static bool
18596 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18597 {
18598 if (is_packed)
18599 return false;
18600
18601 /* For fixed-length vectors, check that the vectorizer will aim for
18602 full-vector alignment. This isn't true for generic GCC vectors
18603 that are wider than the ABI maximum of 128 bits. */
18604 poly_uint64 preferred_alignment =
18605 aarch64_vectorize_preferred_vector_alignment (type);
18606 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
18607 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18608 preferred_alignment))
18609 return false;
18610
18611 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
18612 return true;
18613 }
18614
18615 /* Return true if the vector misalignment factor is supported by the
18616 target. */
18617 static bool
18618 aarch64_builtin_support_vector_misalignment (machine_mode mode,
18619 const_tree type, int misalignment,
18620 bool is_packed)
18621 {
18622 if (TARGET_SIMD && STRICT_ALIGNMENT)
18623 {
18624 /* Return if movmisalign pattern is not supported for this mode. */
18625 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18626 return false;
18627
18628 /* Misalignment factor is unknown at compile time. */
18629 if (misalignment == -1)
18630 return false;
18631 }
18632 return default_builtin_support_vector_misalignment (mode, type, misalignment,
18633 is_packed);
18634 }
18635
18636 /* If VALS is a vector constant that can be loaded into a register
18637 using DUP, generate instructions to do so and return an RTX to
18638 assign to the register. Otherwise return NULL_RTX. */
18639 static rtx
18640 aarch64_simd_dup_constant (rtx vals)
18641 {
18642 machine_mode mode = GET_MODE (vals);
18643 machine_mode inner_mode = GET_MODE_INNER (mode);
18644 rtx x;
18645
18646 if (!const_vec_duplicate_p (vals, &x))
18647 return NULL_RTX;
18648
18649 /* We can load this constant by using DUP and a constant in a
18650 single ARM register. This will be cheaper than a vector
18651 load. */
18652 x = copy_to_mode_reg (inner_mode, x);
18653 return gen_vec_duplicate (mode, x);
18654 }
18655
18656
18657 /* Generate code to load VALS, which is a PARALLEL containing only
18658 constants (for vec_init) or CONST_VECTOR, efficiently into a
18659 register. Returns an RTX to copy into the register, or NULL_RTX
18660 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
18661 static rtx
18662 aarch64_simd_make_constant (rtx vals)
18663 {
18664 machine_mode mode = GET_MODE (vals);
18665 rtx const_dup;
18666 rtx const_vec = NULL_RTX;
18667 int n_const = 0;
18668 int i;
18669
18670 if (GET_CODE (vals) == CONST_VECTOR)
18671 const_vec = vals;
18672 else if (GET_CODE (vals) == PARALLEL)
18673 {
18674 /* A CONST_VECTOR must contain only CONST_INTs and
18675 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18676 Only store valid constants in a CONST_VECTOR. */
18677 int n_elts = XVECLEN (vals, 0);
18678 for (i = 0; i < n_elts; ++i)
18679 {
18680 rtx x = XVECEXP (vals, 0, i);
18681 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18682 n_const++;
18683 }
18684 if (n_const == n_elts)
18685 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18686 }
18687 else
18688 gcc_unreachable ();
18689
18690 if (const_vec != NULL_RTX
18691 && aarch64_simd_valid_immediate (const_vec, NULL))
18692 /* Load using MOVI/MVNI. */
18693 return const_vec;
18694 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18695 /* Loaded using DUP. */
18696 return const_dup;
18697 else if (const_vec != NULL_RTX)
18698 /* Load from constant pool. We cannot take advantage of single-cycle
18699 LD1 because we need a PC-relative addressing mode. */
18700 return const_vec;
18701 else
18702 /* A PARALLEL containing something not valid inside CONST_VECTOR.
18703 We cannot construct an initializer. */
18704 return NULL_RTX;
18705 }
18706
18707 /* Expand a vector initialisation sequence, such that TARGET is
18708 initialised to contain VALS. */
18709
18710 void
18711 aarch64_expand_vector_init (rtx target, rtx vals)
18712 {
18713 machine_mode mode = GET_MODE (target);
18714 scalar_mode inner_mode = GET_MODE_INNER (mode);
18715 /* The number of vector elements. */
18716 int n_elts = XVECLEN (vals, 0);
18717 /* The number of vector elements which are not constant. */
18718 int n_var = 0;
18719 rtx any_const = NULL_RTX;
18720 /* The first element of vals. */
18721 rtx v0 = XVECEXP (vals, 0, 0);
18722 bool all_same = true;
18723
18724 /* This is a special vec_init<M><N> where N is not an element mode but a
18725 vector mode with half the elements of M. We expect to find two entries
18726 of mode N in VALS and we must put their concatentation into TARGET. */
18727 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18728 {
18729 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18730 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18731 rtx lo = XVECEXP (vals, 0, 0);
18732 rtx hi = XVECEXP (vals, 0, 1);
18733 machine_mode narrow_mode = GET_MODE (lo);
18734 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18735 gcc_assert (narrow_mode == GET_MODE (hi));
18736
18737 /* When we want to concatenate a half-width vector with zeroes we can
18738 use the aarch64_combinez[_be] patterns. Just make sure that the
18739 zeroes are in the right half. */
18740 if (BYTES_BIG_ENDIAN
18741 && aarch64_simd_imm_zero (lo, narrow_mode)
18742 && general_operand (hi, narrow_mode))
18743 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18744 else if (!BYTES_BIG_ENDIAN
18745 && aarch64_simd_imm_zero (hi, narrow_mode)
18746 && general_operand (lo, narrow_mode))
18747 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18748 else
18749 {
18750 /* Else create the two half-width registers and combine them. */
18751 if (!REG_P (lo))
18752 lo = force_reg (GET_MODE (lo), lo);
18753 if (!REG_P (hi))
18754 hi = force_reg (GET_MODE (hi), hi);
18755
18756 if (BYTES_BIG_ENDIAN)
18757 std::swap (lo, hi);
18758 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18759 }
18760 return;
18761 }
18762
18763 /* Count the number of variable elements to initialise. */
18764 for (int i = 0; i < n_elts; ++i)
18765 {
18766 rtx x = XVECEXP (vals, 0, i);
18767 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18768 ++n_var;
18769 else
18770 any_const = x;
18771
18772 all_same &= rtx_equal_p (x, v0);
18773 }
18774
18775 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18776 how best to handle this. */
18777 if (n_var == 0)
18778 {
18779 rtx constant = aarch64_simd_make_constant (vals);
18780 if (constant != NULL_RTX)
18781 {
18782 emit_move_insn (target, constant);
18783 return;
18784 }
18785 }
18786
18787 /* Splat a single non-constant element if we can. */
18788 if (all_same)
18789 {
18790 rtx x = copy_to_mode_reg (inner_mode, v0);
18791 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18792 return;
18793 }
18794
18795 enum insn_code icode = optab_handler (vec_set_optab, mode);
18796 gcc_assert (icode != CODE_FOR_nothing);
18797
18798 /* If there are only variable elements, try to optimize
18799 the insertion using dup for the most common element
18800 followed by insertions. */
18801
18802 /* The algorithm will fill matches[*][0] with the earliest matching element,
18803 and matches[X][1] with the count of duplicate elements (if X is the
18804 earliest element which has duplicates). */
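/* E.g. for VALS == {a, b, a, a} the loops below produce
   matches[0] == {0, 3}, matches[1] == {1, 1}, matches[2] == {0, 0} and
   matches[3] == {0, 0}, so MAXELEMENT == 0 with MAXV == 3: we DUP 'a'
   into TARGET and then only insert 'b' into lane 1.  */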
18805
18806 if (n_var == n_elts && n_elts <= 16)
18807 {
18808 int matches[16][2] = {0};
18809 for (int i = 0; i < n_elts; i++)
18810 {
18811 for (int j = 0; j <= i; j++)
18812 {
18813 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18814 {
18815 matches[i][0] = j;
18816 matches[j][1]++;
18817 break;
18818 }
18819 }
18820 }
18821 int maxelement = 0;
18822 int maxv = 0;
18823 for (int i = 0; i < n_elts; i++)
18824 if (matches[i][1] > maxv)
18825 {
18826 maxelement = i;
18827 maxv = matches[i][1];
18828 }
18829
18830 /* Create a duplicate of the most common element, unless all elements
18831 are equally useless to us, in which case just immediately set the
18832 vector register using the first element. */
18833
18834 if (maxv == 1)
18835 {
18836 /* For vectors of two 64-bit elements, we can do even better. */
18837 if (n_elts == 2
18838 && (inner_mode == E_DImode
18839 || inner_mode == E_DFmode))
18840
18841 {
18842 rtx x0 = XVECEXP (vals, 0, 0);
18843 rtx x1 = XVECEXP (vals, 0, 1);
18844 /* Combine can pick up this case, but handling it directly
18845 here leaves clearer RTL.
18846
18847 This is load_pair_lanes<mode>, and also gives us a clean-up
18848 for store_pair_lanes<mode>. */
18849 if (memory_operand (x0, inner_mode)
18850 && memory_operand (x1, inner_mode)
18851 && !STRICT_ALIGNMENT
18852 && rtx_equal_p (XEXP (x1, 0),
18853 plus_constant (Pmode,
18854 XEXP (x0, 0),
18855 GET_MODE_SIZE (inner_mode))))
18856 {
18857 rtx t;
18858 if (inner_mode == DFmode)
18859 t = gen_load_pair_lanesdf (target, x0, x1);
18860 else
18861 t = gen_load_pair_lanesdi (target, x0, x1);
18862 emit_insn (t);
18863 return;
18864 }
18865 }
18866 /* The subreg-move sequence below will move into lane zero of the
18867 vector register. For big-endian we want that position to hold
18868 the last element of VALS. */
18869 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18870 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18871 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18872 }
18873 else
18874 {
18875 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18876 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18877 }
18878
18879 /* Insert the rest. */
18880 for (int i = 0; i < n_elts; i++)
18881 {
18882 rtx x = XVECEXP (vals, 0, i);
18883 if (matches[i][0] == maxelement)
18884 continue;
18885 x = copy_to_mode_reg (inner_mode, x);
18886 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18887 }
18888 return;
18889 }
18890
18891 /* Initialise a vector which is part-variable. We want to first try
18892 to build those lanes which are constant in the most efficient way we
18893 can. */
18894 if (n_var != n_elts)
18895 {
18896 rtx copy = copy_rtx (vals);
18897
18898 /* Load constant part of vector. We really don't care what goes into the
18899 parts we will overwrite, but we're more likely to be able to load the
18900 constant efficiently if it has fewer, larger, repeating parts
18901 (see aarch64_simd_valid_immediate). */
18902 for (int i = 0; i < n_elts; i++)
18903 {
18904 rtx x = XVECEXP (vals, 0, i);
18905 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18906 continue;
18907 rtx subst = any_const;
18908 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18909 {
18910 /* Look in the copied vector, as more elements are const. */
18911 rtx test = XVECEXP (copy, 0, i ^ bit);
18912 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18913 {
18914 subst = test;
18915 break;
18916 }
18917 }
18918 XVECEXP (copy, 0, i) = subst;
18919 }
18920 aarch64_expand_vector_init (target, copy);
18921 }
18922
18923 /* Insert the variable lanes directly. */
18924 for (int i = 0; i < n_elts; i++)
18925 {
18926 rtx x = XVECEXP (vals, 0, i);
18927 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18928 continue;
18929 x = copy_to_mode_reg (inner_mode, x);
18930 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18931 }
18932 }
18933
18934 /* Emit RTL corresponding to:
18935 insr TARGET, ELEM. */
18936
18937 static void
18938 emit_insr (rtx target, rtx elem)
18939 {
18940 machine_mode mode = GET_MODE (target);
18941 scalar_mode elem_mode = GET_MODE_INNER (mode);
18942 elem = force_reg (elem_mode, elem);
18943
18944 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18945 gcc_assert (icode != CODE_FOR_nothing);
18946 emit_insn (GEN_FCN (icode) (target, target, elem));
18947 }
18948
18949 /* Subroutine of aarch64_sve_expand_vector_init for handling
18950 trailing constants.
18951 This function works as follows:
18952 (a) Create a new vector consisting of trailing constants.
18953 (b) Initialize TARGET with the constant vector using emit_move_insn.
18954 (c) Insert remaining elements in TARGET using insr.
18955 NELTS is the total number of elements in the original vector, while
18956 NELTS_REQD is the number of elements that are actually
18957 significant.
18958
18959 ??? The heuristic used is to do the above only if the number of constants
18960 is at least half the total number of elements. May need fine-tuning. */
18961
18962 static bool
18963 aarch64_sve_expand_vector_init_handle_trailing_constants
18964 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18965 {
18966 machine_mode mode = GET_MODE (target);
18967 scalar_mode elem_mode = GET_MODE_INNER (mode);
18968 int n_trailing_constants = 0;
18969
18970 for (int i = nelts_reqd - 1;
18971 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
18972 i--)
18973 n_trailing_constants++;
18974
18975 if (n_trailing_constants >= nelts_reqd / 2)
18976 {
18977 /* Try to use the natural pattern of BUILDER to extend the trailing
18978 constant elements to a full vector. Replace any variables in the
18979 extra elements with zeros.
18980
18981 ??? It would be better if the builders supported "don't care"
18982 elements, with the builder filling in whichever elements
18983 give the most compact encoding. */
18984 rtx_vector_builder v (mode, nelts, 1);
18985 for (int i = 0; i < nelts; i++)
18986 {
18987 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18988 if (!valid_for_const_vector_p (elem_mode, x))
18989 x = const0_rtx;
18990 v.quick_push (x);
18991 }
18992 rtx const_vec = v.build ();
18993 emit_move_insn (target, const_vec);
18994
18995 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18996 emit_insr (target, builder.elt (i));
18997
18998 return true;
18999 }
19000
19001 return false;
19002 }
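/* E.g. for NELTS_REQD == 4 and elements {a, b, 1, 2}: the two trailing
   constants meet the threshold, so TARGET is first loaded with a constant
   vector whose leading elements are {1, 2, ...} (variable elements padded
   with zeros), after which 'b' and then 'a' are inserted with INSR, leaving
   {a, b, 1, 2} in the significant lanes.  */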
19003
19004 /* Subroutine of aarch64_sve_expand_vector_init.
19005 Works as follows:
19006 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
19007 (b) Skip trailing elements from BUILDER, which are the same as
19008 element NELTS_REQD - 1.
19009 (c) Insert earlier elements in reverse order in TARGET using insr. */
19010
19011 static void
19012 aarch64_sve_expand_vector_init_insert_elems (rtx target,
19013 const rtx_vector_builder &builder,
19014 int nelts_reqd)
19015 {
19016 machine_mode mode = GET_MODE (target);
19017 scalar_mode elem_mode = GET_MODE_INNER (mode);
19018
19019 struct expand_operand ops[2];
19020 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
19021 gcc_assert (icode != CODE_FOR_nothing);
19022
19023 create_output_operand (&ops[0], target, mode);
19024 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
19025 expand_insn (icode, 2, ops);
19026
19027 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19028 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
19029 emit_insr (target, builder.elt (i));
19030 }
19031
19032 /* Subroutine of aarch64_sve_expand_vector_init to handle case
19033 when all trailing elements of builder are same.
19034 This works as follows:
19035 (a) Use expand_insn interface to broadcast last vector element in TARGET.
19036 (b) Insert remaining elements in TARGET using insr.
19037
19038 ??? The heuristic used is to do the above if the number of identical trailing
19039 elements is at least 3/4 of the total number of elements, loosely based on
19040 the heuristic from mostly_zeros_p. May need fine-tuning. */
19041
19042 static bool
19043 aarch64_sve_expand_vector_init_handle_trailing_same_elem
19044 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
19045 {
19046 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19047 if (ndups >= (3 * nelts_reqd) / 4)
19048 {
19049 aarch64_sve_expand_vector_init_insert_elems (target, builder,
19050 nelts_reqd - ndups + 1);
19051 return true;
19052 }
19053
19054 return false;
19055 }
19056
19057 /* Initialize register TARGET from BUILDER. NELTS is the constant number
19058 of elements in BUILDER.
19059
19060 The function tries to initialize TARGET from BUILDER if it fits one
19061 of the special cases outlined below.
19062
19063 Failing that, the function divides BUILDER into two sub-vectors:
19064 v_even = even elements of BUILDER;
19065 v_odd = odd elements of BUILDER;
19066
19067 and recursively calls itself with v_even and v_odd.
19068
19069 if (recursive call succeeded for v_even or v_odd)
19070 TARGET = zip (v_even, v_odd)
19071
19072 The function returns true if it managed to build TARGET from BUILDER
19073 with one of the special cases, false otherwise.
19074
19075 Example: {a, 1, b, 2, c, 3, d, 4}
19076
19077 The vector gets divided into:
19078 v_even = {a, b, c, d}
19079 v_odd = {1, 2, 3, 4}
19080
19081 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
19082 initializes tmp2 from the constant vector v_odd using emit_move_insn.
19083
19084 aarch64_sve_expand_vector_init(v_even) fails since v_even contains only
19085 4 elements, so we construct tmp1 from v_even using insr:
19086 tmp1 = dup(d)
19087 insr tmp1, c
19088 insr tmp1, b
19089 insr tmp1, a
19090
19091 And finally:
19092 TARGET = zip (tmp1, tmp2)
19093 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
19094
19095 static bool
19096 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
19097 int nelts, int nelts_reqd)
19098 {
19099 machine_mode mode = GET_MODE (target);
19100
19101 /* Case 1: Vector contains trailing constants. */
19102
19103 if (aarch64_sve_expand_vector_init_handle_trailing_constants
19104 (target, builder, nelts, nelts_reqd))
19105 return true;
19106
19107 /* Case 2: Vector contains leading constants. */
19108
19109 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
19110 for (int i = 0; i < nelts_reqd; i++)
19111 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
19112 rev_builder.finalize ();
19113
19114 if (aarch64_sve_expand_vector_init_handle_trailing_constants
19115 (target, rev_builder, nelts, nelts_reqd))
19116 {
19117 emit_insn (gen_aarch64_sve_rev (mode, target, target));
19118 return true;
19119 }
19120
19121 /* Case 3: Vector contains trailing same element. */
19122
19123 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19124 (target, builder, nelts_reqd))
19125 return true;
19126
19127 /* Case 4: Vector contains leading same element. */
19128
19129 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19130 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
19131 {
19132 emit_insn (gen_aarch64_sve_rev (mode, target, target));
19133 return true;
19134 }
19135
19136 /* Avoid recursing below 4 elements.
19137 ??? The threshold of 4 may need fine-tuning. */
19138
19139 if (nelts_reqd <= 4)
19140 return false;
19141
19142 rtx_vector_builder v_even (mode, nelts, 1);
19143 rtx_vector_builder v_odd (mode, nelts, 1);
19144
19145 for (int i = 0; i < nelts * 2; i += 2)
19146 {
19147 v_even.quick_push (builder.elt (i));
19148 v_odd.quick_push (builder.elt (i + 1));
19149 }
19150
19151 v_even.finalize ();
19152 v_odd.finalize ();
19153
19154 rtx tmp1 = gen_reg_rtx (mode);
19155 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
19156 nelts, nelts_reqd / 2);
19157
19158 rtx tmp2 = gen_reg_rtx (mode);
19159 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
19160 nelts, nelts_reqd / 2);
19161
19162 if (!did_even_p && !did_odd_p)
19163 return false;
19164
19165 /* Initialize whichever of v_even and v_odd didn't match one of the
19166 special cases using INSR, then zip them together into TARGET. */
19167
19168 if (!did_even_p)
19169 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
19170
19171 if (!did_odd_p)
19172 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
19173
19174 rtvec v = gen_rtvec (2, tmp1, tmp2);
19175 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
19176 return true;
19177 }
19178
19179 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
19180
19181 void
19182 aarch64_sve_expand_vector_init (rtx target, rtx vals)
19183 {
19184 machine_mode mode = GET_MODE (target);
19185 int nelts = XVECLEN (vals, 0);
19186
19187 rtx_vector_builder v (mode, nelts, 1);
19188 for (int i = 0; i < nelts; i++)
19189 v.quick_push (XVECEXP (vals, 0, i));
19190 v.finalize ();
19191
19192 /* If v could not be initialized using one of the special cases, then
19193 fall back to using INSR to insert all elements of v into TARGET.
19194 ??? This might not be optimal for vectors with large
19195 initializers of 16 elements or more.
19196 For nelts < 4, it probably isn't worth handling specially. */
19197
19198 if (nelts < 4
19199 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
19200 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
19201 }
19202
19203 /* Check whether VALUE is a vector constant in which every element
19204 is either a power of 2 or a negated power of 2. If so, return
19205 a constant vector of log2s, and flip CODE between PLUS and MINUS
19206 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
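/* For example, { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with CODE left
   unchanged, while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } with CODE
   flipped between PLUS and MINUS.  */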
19207
19208 static rtx
19209 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
19210 {
19211 if (GET_CODE (value) != CONST_VECTOR)
19212 return NULL_RTX;
19213
19214 rtx_vector_builder builder;
19215 if (!builder.new_unary_operation (GET_MODE (value), value, false))
19216 return NULL_RTX;
19217
19218 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
19219 /* 1 if the result of the multiplication must be negated,
19220 0 if it mustn't, or -1 if we don't yet care. */
19221 int negate = -1;
19222 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
19223 for (unsigned int i = 0; i < encoded_nelts; ++i)
19224 {
19225 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
19226 if (!CONST_SCALAR_INT_P (elt))
19227 return NULL_RTX;
19228 rtx_mode_t val (elt, int_mode);
19229 wide_int pow2 = wi::neg (val);
19230 if (val != pow2)
19231 {
19232 /* It matters whether we negate or not. Make that choice,
19233 and make sure that it's consistent with previous elements. */
19234 if (negate == !wi::neg_p (val))
19235 return NULL_RTX;
19236 negate = wi::neg_p (val);
19237 if (!negate)
19238 pow2 = val;
19239 }
19240 /* POW2 is now the value that we want to be a power of 2. */
19241 int shift = wi::exact_log2 (pow2);
19242 if (shift < 0)
19243 return NULL_RTX;
19244 builder.quick_push (gen_int_mode (shift, int_mode));
19245 }
19246 if (negate == -1)
19247 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
19248 code = PLUS;
19249 else if (negate == 1)
19250 code = code == PLUS ? MINUS : PLUS;
19251 return builder.build ();
19252 }
19253
19254 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19255 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
19256 operands array, in the same order as for fma_optab. Return true if
19257 the function emitted all the necessary instructions, false if the caller
19258 should generate the pattern normally with the new OPERANDS array. */
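/* For example, with CODE == PLUS and OPERANDS[2] equal to { 4, ..., 4 },
   the multiply-add OPERANDS[1] * OPERANDS[2] + OPERANDS[3] is emitted here
   as a shift followed by an add:

     tmp = OPERANDS[1] << 2
     OPERANDS[0] = OPERANDS[3] + tmp

   instead of using a multiply.  */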
19259
19260 bool
19261 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19262 {
19263 machine_mode mode = GET_MODE (operands[0]);
19264 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19265 {
19266 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19267 NULL_RTX, true, OPTAB_DIRECT);
19268 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19269 operands[3], product, operands[0], true,
19270 OPTAB_DIRECT);
19271 return true;
19272 }
19273 operands[2] = force_reg (mode, operands[2]);
19274 return false;
19275 }
19276
19277 /* Likewise, but for a conditional pattern. */
19278
19279 bool
19280 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19281 {
19282 machine_mode mode = GET_MODE (operands[0]);
19283 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19284 {
19285 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19286 NULL_RTX, true, OPTAB_DIRECT);
19287 emit_insn (gen_cond (code, mode, operands[0], operands[1],
19288 operands[4], product, operands[5]));
19289 return true;
19290 }
19291 operands[3] = force_reg (mode, operands[3]);
19292 return false;
19293 }
19294
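/* Implement TARGET_SHIFT_TRUNCATION_MASK (assumed from the function name).
   Return 0 -- i.e. promise nothing about out-of-range shift amounts -- when
   SHIFT_COUNT_TRUNCATED is false or MODE is a vector data mode; otherwise
   return the element size minus one.  */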
19295 static unsigned HOST_WIDE_INT
19296 aarch64_shift_truncation_mask (machine_mode mode)
19297 {
19298 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19299 return 0;
19300 return GET_MODE_UNIT_BITSIZE (mode) - 1;
19301 }
19302
19303 /* Select a format to encode pointers in exception handling data. */
19304 int
19305 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19306 {
19307 int type;
19308 switch (aarch64_cmodel)
19309 {
19310 case AARCH64_CMODEL_TINY:
19311 case AARCH64_CMODEL_TINY_PIC:
19312 case AARCH64_CMODEL_SMALL:
19313 case AARCH64_CMODEL_SMALL_PIC:
19314 case AARCH64_CMODEL_SMALL_SPIC:
19315 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
19316 for everything. */
19317 type = DW_EH_PE_sdata4;
19318 break;
19319 default:
19320 /* No assumptions here. 8-byte relocs required. */
19321 type = DW_EH_PE_sdata8;
19322 break;
19323 }
19324 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19325 }
19326
19327 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
19328
19329 static void
19330 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19331 {
19332 if (TREE_CODE (decl) == FUNCTION_DECL)
19333 {
19334 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19335 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19336 {
19337 fprintf (stream, "\t.variant_pcs\t");
19338 assemble_name (stream, name);
19339 fprintf (stream, "\n");
19340 }
19341 }
19342 }
19343
19344 /* The last .arch and .tune assembly strings that we printed. */
19345 static std::string aarch64_last_printed_arch_string;
19346 static std::string aarch64_last_printed_tune_string;
19347
19348 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
19349 by the function fndecl. */
19350
19351 void
19352 aarch64_declare_function_name (FILE *stream, const char* name,
19353 tree fndecl)
19354 {
19355 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19356
19357 struct cl_target_option *targ_options;
19358 if (target_parts)
19359 targ_options = TREE_TARGET_OPTION (target_parts);
19360 else
19361 targ_options = TREE_TARGET_OPTION (target_option_current_node);
19362 gcc_assert (targ_options);
19363
19364 const struct processor *this_arch
19365 = aarch64_get_arch (targ_options->x_explicit_arch);
19366
19367 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
19368 std::string extension
19369 = aarch64_get_extension_string_for_isa_flags (isa_flags,
19370 this_arch->flags);
19371 /* Only update the assembler .arch string if it is distinct from the last
19372 such string we printed. */
19373 std::string to_print = this_arch->name + extension;
19374 if (to_print != aarch64_last_printed_arch_string)
19375 {
19376 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19377 aarch64_last_printed_arch_string = to_print;
19378 }
19379
19380 /* Print the CPU name we're tuning for in the comments; it might be
19381 useful to readers of the generated asm. Do it only when it changes
19382 from function to function and verbose assembly is requested. */
19383 const struct processor *this_tune
19384 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19385
19386 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19387 {
19388 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19389 this_tune->name);
19390 aarch64_last_printed_tune_string = this_tune->name;
19391 }
19392
19393 aarch64_asm_output_variant_pcs (stream, fndecl, name);
19394
19395 /* Don't forget the type directive for ELF. */
19396 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19397 ASM_OUTPUT_LABEL (stream, name);
19398
19399 cfun->machine->label_is_assembled = true;
19400 }
19401
19402 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
19403 the function label and emit a BTI if necessary. */
19404
19405 void
19406 aarch64_print_patchable_function_entry (FILE *file,
19407 unsigned HOST_WIDE_INT patch_area_size,
19408 bool record_p)
19409 {
19410 if (cfun->machine->label_is_assembled
19411 && aarch64_bti_enabled ()
19412 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19413 {
19414 /* Remove the BTI that follows the patch area and insert a new BTI
19415 before the patch area right after the function label. */
19416 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19417 if (insn
19418 && INSN_P (insn)
19419 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19420 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19421 delete_insn (insn);
19422 asm_fprintf (file, "\thint\t34 // bti c\n");
19423 }
19424
19425 default_print_patchable_function_entry (file, patch_area_size, record_p);
19426 }
19427
19428 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
19429
19430 void
19431 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19432 {
19433 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19434 const char *value = IDENTIFIER_POINTER (target);
19435 aarch64_asm_output_variant_pcs (stream, decl, name);
19436 ASM_OUTPUT_DEF (stream, name, value);
19437 }
19438
19439 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
19440 function symbol references. */
19441
19442 void
19443 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
19444 {
19445 default_elf_asm_output_external (stream, decl, name);
19446 aarch64_asm_output_variant_pcs (stream, decl, name);
19447 }
19448
19449 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19450 Used to output the .cfi_b_key_frame directive when signing the current
19451 function with the B key. */
19452
19453 void
19454 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19455 {
19456 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
19457 && aarch64_ra_sign_key == AARCH64_KEY_B)
19458 asm_fprintf (f, "\t.cfi_b_key_frame\n");
19459 }
19460
19461 /* Implement TARGET_ASM_FILE_START. Output the assembly header. */
19462
19463 static void
19464 aarch64_start_file (void)
19465 {
19466 struct cl_target_option *default_options
19467 = TREE_TARGET_OPTION (target_option_default_node);
19468
19469 const struct processor *default_arch
19470 = aarch64_get_arch (default_options->x_explicit_arch);
19471 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
19472 std::string extension
19473 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19474 default_arch->flags);
19475
19476 aarch64_last_printed_arch_string = default_arch->name + extension;
19477 aarch64_last_printed_tune_string = "";
19478 asm_fprintf (asm_out_file, "\t.arch %s\n",
19479 aarch64_last_printed_arch_string.c_str ());
19480
19481 default_file_start ();
19482 }
19483
19484 /* Emit load exclusive. */
19485
19486 static void
19487 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
19488 rtx mem, rtx model_rtx)
19489 {
19490 if (mode == TImode)
19491 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19492 gen_highpart (DImode, rval),
19493 mem, model_rtx));
19494 else
19495 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
19496 }
19497
19498 /* Emit store exclusive. */
19499
19500 static void
19501 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
19502 rtx mem, rtx rval, rtx model_rtx)
19503 {
19504 if (mode == TImode)
19505 emit_insn (gen_aarch64_store_exclusive_pair
19506 (bval, mem, operand_subword (rval, 0, 0, TImode),
19507 operand_subword (rval, 1, 0, TImode), model_rtx));
19508 else
19509 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
19510 }
19511
19512 /* Emit jump instruction INSN and mark it as very unlikely to be taken. */
19513
19514 static void
19515 aarch64_emit_unlikely_jump (rtx insn)
19516 {
19517 rtx_insn *jump = emit_jump_insn (insn);
19518 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
19519 }
19520
19521 /* We store the names of the various atomic helpers in a 5x4 array.
19522 Return the libcall function given MODE, MODEL and NAMES. */
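/* For example, an SImode operation with MEMMODEL_ACQUIRE maps to
   mode_idx 2 and model_idx 1, so for the CAS helpers below this returns
   the libfunc for "__aarch64_cas4_acq".  */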
19523
19524 rtx
19525 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
19526 const atomic_ool_names *names)
19527 {
19528 memmodel model = memmodel_base (INTVAL (model_rtx));
19529 int mode_idx, model_idx;
19530
19531 switch (mode)
19532 {
19533 case E_QImode:
19534 mode_idx = 0;
19535 break;
19536 case E_HImode:
19537 mode_idx = 1;
19538 break;
19539 case E_SImode:
19540 mode_idx = 2;
19541 break;
19542 case E_DImode:
19543 mode_idx = 3;
19544 break;
19545 case E_TImode:
19546 mode_idx = 4;
19547 break;
19548 default:
19549 gcc_unreachable ();
19550 }
19551
19552 switch (model)
19553 {
19554 case MEMMODEL_RELAXED:
19555 model_idx = 0;
19556 break;
19557 case MEMMODEL_CONSUME:
19558 case MEMMODEL_ACQUIRE:
19559 model_idx = 1;
19560 break;
19561 case MEMMODEL_RELEASE:
19562 model_idx = 2;
19563 break;
19564 case MEMMODEL_ACQ_REL:
19565 case MEMMODEL_SEQ_CST:
19566 model_idx = 3;
19567 break;
19568 default:
19569 gcc_unreachable ();
19570 }
19571
19572 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19573 VISIBILITY_HIDDEN);
19574 }
19575
19576 #define DEF0(B, N) \
19577 { "__aarch64_" #B #N "_relax", \
19578 "__aarch64_" #B #N "_acq", \
19579 "__aarch64_" #B #N "_rel", \
19580 "__aarch64_" #B #N "_acq_rel" }
19581
19582 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19583 { NULL, NULL, NULL, NULL }
19584 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19585
19586 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19587 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19588 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19589 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19590 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19591 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19592
19593 #undef DEF0
19594 #undef DEF4
19595 #undef DEF5
19596
19597 /* Expand a compare and swap pattern. */
19598
19599 void
19600 aarch64_expand_compare_and_swap (rtx operands[])
19601 {
19602 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19603 machine_mode mode, r_mode;
19604
19605 bval = operands[0];
19606 rval = operands[1];
19607 mem = operands[2];
19608 oldval = operands[3];
19609 newval = operands[4];
19610 is_weak = operands[5];
19611 mod_s = operands[6];
19612 mod_f = operands[7];
19613 mode = GET_MODE (mem);
19614
19615 /* Normally the succ memory model must be stronger than fail, but in the
19616 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19617 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
19618 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19619 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19620 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19621
19622 r_mode = mode;
19623 if (mode == QImode || mode == HImode)
19624 {
19625 r_mode = SImode;
19626 rval = gen_reg_rtx (r_mode);
19627 }
19628
19629 if (TARGET_LSE)
19630 {
19631 /* The CAS insn requires that oldval and rval overlap, but we need to
19632 have a copy of oldval saved across the operation to tell if
19633 the operation is successful. */
19634 if (reg_overlap_mentioned_p (rval, oldval))
19635 rval = copy_to_mode_reg (r_mode, oldval);
19636 else
19637 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19638
19639 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19640 newval, mod_s));
19641 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19642 }
19643 else if (TARGET_OUTLINE_ATOMICS)
19644 {
19645 /* OLDVAL must satisfy the comparison done afterwards. */
19646 if (!aarch64_plus_operand (oldval, mode))
19647 oldval = force_reg (mode, oldval);
19648 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19649 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19650 oldval, mode, newval, mode,
19651 XEXP (mem, 0), Pmode);
19652 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19653 }
19654 else
19655 {
19656 /* The oldval predicate varies by mode. Test it and force to reg. */
19657 insn_code code = code_for_aarch64_compare_and_swap (mode);
19658 if (!insn_data[code].operand[2].predicate (oldval, mode))
19659 oldval = force_reg (mode, oldval);
19660
19661 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19662 is_weak, mod_s, mod_f));
19663 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19664 }
19665
19666 if (r_mode != mode)
19667 rval = gen_lowpart (mode, rval);
19668 emit_move_insn (operands[1], rval);
19669
19670 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19671 emit_insn (gen_rtx_SET (bval, x));
19672 }
19673
19674 /* Emit a barrier appropriate for memory model MODEL at the end of a
19675 sequence implementing an atomic operation. */
19676
19677 static void
19678 aarch64_emit_post_barrier (enum memmodel model)
19679 {
19680 const enum memmodel base_model = memmodel_base (model);
19681
19682 if (is_mm_sync (model)
19683 && (base_model == MEMMODEL_ACQUIRE
19684 || base_model == MEMMODEL_ACQ_REL
19685 || base_model == MEMMODEL_SEQ_CST))
19686 {
19687 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19688 }
19689 }
19690
19691 /* Split a compare and swap pattern. */
19692
19693 void
19694 aarch64_split_compare_and_swap (rtx operands[])
19695 {
19696 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19697 gcc_assert (epilogue_completed);
19698
19699 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19700 machine_mode mode;
19701 bool is_weak;
19702 rtx_code_label *label1, *label2;
19703 enum memmodel model;
19704
19705 rval = operands[0];
19706 mem = operands[1];
19707 oldval = operands[2];
19708 newval = operands[3];
19709 is_weak = (operands[4] != const0_rtx);
19710 model_rtx = operands[5];
19711 scratch = operands[7];
19712 mode = GET_MODE (mem);
19713 model = memmodel_from_int (INTVAL (model_rtx));
19714
19715 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19716 loop:
19717 .label1:
19718 LD[A]XR rval, [mem]
19719 CBNZ rval, .label2
19720 ST[L]XR scratch, newval, [mem]
19721 CBNZ scratch, .label1
19722 .label2:
19723 CMP rval, 0. */
19724 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
19725 && oldval == const0_rtx && mode != TImode);
19726
19727 label1 = NULL;
19728 if (!is_weak)
19729 {
19730 label1 = gen_label_rtx ();
19731 emit_label (label1);
19732 }
19733 label2 = gen_label_rtx ();
19734
19735 /* The initial load can be relaxed for a __sync operation since a final
19736 barrier will be emitted to stop code hoisting. */
19737 if (is_mm_sync (model))
19738 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19739 else
19740 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19741
19742 if (strong_zero_p)
19743 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19744 else
19745 {
19746 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19747 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19748 }
19749 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19750 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19751 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19752
19753 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19754
19755 if (!is_weak)
19756 {
19757 if (aarch64_track_speculation)
19758 {
19759 /* Emit an explicit compare instruction, so that we can correctly
19760 track the condition codes. */
19761 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19762 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19763 }
19764 else
19765 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19766
19767 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19768 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19769 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19770 }
19771 else
19772 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19773
19774 emit_label (label2);
19775
19776 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
19777 to set the condition flags. If this is not used, it will be removed by
19778 later passes. */
19779 if (strong_zero_p)
19780 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19781
19782 /* Emit any final barrier needed for a __sync operation. */
19783 if (is_mm_sync (model))
19784 aarch64_emit_post_barrier (model);
19785 }
19786
19787 /* Split an atomic operation. */
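/* The emitted sequence is roughly (for a fetch-and-<op>):

     .label:
	LD[A]XR	old, [mem]
	<op>	new, old, value
	ST[L]XR	cond, new, [mem]
	CBNZ	cond, .label

   followed, for __sync operations, by a final barrier.  */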
19788
19789 void
19790 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19791 rtx value, rtx model_rtx, rtx cond)
19792 {
19793 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19794 gcc_assert (epilogue_completed);
19795
19796 machine_mode mode = GET_MODE (mem);
19797 machine_mode wmode = (mode == DImode ? DImode : SImode);
19798 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19799 const bool is_sync = is_mm_sync (model);
19800 rtx_code_label *label;
19801 rtx x;
19802
19803 /* Split the atomic operation into a sequence. */
19804 label = gen_label_rtx ();
19805 emit_label (label);
19806
19807 if (new_out)
19808 new_out = gen_lowpart (wmode, new_out);
19809 if (old_out)
19810 old_out = gen_lowpart (wmode, old_out);
19811 else
19812 old_out = new_out;
19813 value = simplify_gen_subreg (wmode, value, mode, 0);
19814
19815 /* The initial load can be relaxed for a __sync operation since a final
19816 barrier will be emitted to stop code hoisting. */
19817 if (is_sync)
19818 aarch64_emit_load_exclusive (mode, old_out, mem,
19819 GEN_INT (MEMMODEL_RELAXED));
19820 else
19821 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19822
19823 switch (code)
19824 {
19825 case SET:
19826 new_out = value;
19827 break;
19828
19829 case NOT:
19830 x = gen_rtx_AND (wmode, old_out, value);
19831 emit_insn (gen_rtx_SET (new_out, x));
19832 x = gen_rtx_NOT (wmode, new_out);
19833 emit_insn (gen_rtx_SET (new_out, x));
19834 break;
19835
19836 case MINUS:
19837 if (CONST_INT_P (value))
19838 {
19839 value = GEN_INT (-INTVAL (value));
19840 code = PLUS;
19841 }
19842 /* Fall through. */
19843
19844 default:
19845 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19846 emit_insn (gen_rtx_SET (new_out, x));
19847 break;
19848 }
19849
19850 aarch64_emit_store_exclusive (mode, cond, mem,
19851 gen_lowpart (mode, new_out), model_rtx);
19852
19853 if (aarch64_track_speculation)
19854 {
19855 /* Emit an explicit compare instruction, so that we can correctly
19856 track the condition codes. */
19857 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19858 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19859 }
19860 else
19861 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19862
19863 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19864 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19865 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19866
19867 /* Emit any final barrier needed for a __sync operation. */
19868 if (is_sync)
19869 aarch64_emit_post_barrier (model);
19870 }
19871
19872 static void
19873 aarch64_init_libfuncs (void)
19874 {
19875 /* Half-precision float operations. The compiler handles all operations
19876 with NULL libfuncs by converting to SFmode. */
19877
19878 /* Conversions. */
19879 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19880 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19881
19882 /* Arithmetic. */
19883 set_optab_libfunc (add_optab, HFmode, NULL);
19884 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19885 set_optab_libfunc (smul_optab, HFmode, NULL);
19886 set_optab_libfunc (neg_optab, HFmode, NULL);
19887 set_optab_libfunc (sub_optab, HFmode, NULL);
19888
19889 /* Comparisons. */
19890 set_optab_libfunc (eq_optab, HFmode, NULL);
19891 set_optab_libfunc (ne_optab, HFmode, NULL);
19892 set_optab_libfunc (lt_optab, HFmode, NULL);
19893 set_optab_libfunc (le_optab, HFmode, NULL);
19894 set_optab_libfunc (ge_optab, HFmode, NULL);
19895 set_optab_libfunc (gt_optab, HFmode, NULL);
19896 set_optab_libfunc (unord_optab, HFmode, NULL);
19897 }
19898
19899 /* Target hook for c_mode_for_suffix. */
19900 static machine_mode
19901 aarch64_c_mode_for_suffix (char suffix)
19902 {
19903 if (suffix == 'q')
19904 return TFmode;
19905
19906 return VOIDmode;
19907 }
19908
19909 /* We can only represent floating point constants which will fit in
19910 "quarter-precision" values. These values are characterised by
19911 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
19912 by:
19913
19914 (-1)^s * (n/16) * 2^r
19915
19916 Where:
19917 's' is the sign bit.
19918 'n' is an integer in the range 16 <= n <= 31.
19919 'r' is an integer in the range -3 <= r <= 4. */
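/* For example, 0.25 = (16/16) * 2^-2 and 29.0 = (29/16) * 2^4 are
   representable, whereas 0.1 is not.  */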
19920
19921 /* Return true iff X can be represented as a quarter-precision
19922 floating point immediate operand. Note that we cannot represent 0.0. */
19923 bool
19924 aarch64_float_const_representable_p (rtx x)
19925 {
19926 /* This represents our current view of how many bits
19927 make up the mantissa. */
19928 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19929 int exponent;
19930 unsigned HOST_WIDE_INT mantissa, mask;
19931 REAL_VALUE_TYPE r, m;
19932 bool fail;
19933
19934 x = unwrap_const_vec_duplicate (x);
19935 if (!CONST_DOUBLE_P (x))
19936 return false;
19937
19938 if (GET_MODE (x) == VOIDmode
19939 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19940 return false;
19941
19942 r = *CONST_DOUBLE_REAL_VALUE (x);
19943
19944 /* We cannot represent infinities, NaNs or +/-zero. We won't
19945 know if we have +zero until we analyse the mantissa, but we
19946 can reject the other invalid values. */
19947 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19948 || REAL_VALUE_MINUS_ZERO (r))
19949 return false;
19950
19951 /* Extract exponent. */
19952 r = real_value_abs (&r);
19953 exponent = REAL_EXP (&r);
19954
19955 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
19956 highest (sign) bit, with a fixed binary point at bit point_pos.
19957 The low half of W holds the low part of the mantissa and the high half
19958 the high part. WARNING: If we ever have a representation using more than
19959 2 * H_W_I - 1 bits for the mantissa, this can fail (low bits will be lost). */
19960 real_ldexp (&m, &r, point_pos - exponent);
19961 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19962
19963 /* If the low part of the mantissa has bits set we cannot represent
19964 the value. */
19965 if (w.ulow () != 0)
19966 return false;
19967 /* We have rejected the lower HOST_WIDE_INT, so update our
19968 understanding of how many bits lie in the mantissa and
19969 look only at the high HOST_WIDE_INT. */
19970 mantissa = w.elt (1);
19971 point_pos -= HOST_BITS_PER_WIDE_INT;
19972
19973 /* We can only represent values with a mantissa of the form 1.xxxx. */
19974 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19975 if ((mantissa & mask) != 0)
19976 return false;
19977
19978 /* Having filtered unrepresentable values, we may now remove all
19979 but the highest 5 bits. */
19980 mantissa >>= point_pos - 5;
19981
19982 /* We cannot represent the value 0.0, so reject it. This is handled
19983 elsewhere. */
19984 if (mantissa == 0)
19985 return false;
19986
19987 /* Then, as bit 4 is always set, we can mask it off, leaving
19988 the mantissa in the range [0, 15]. */
19989 mantissa &= ~(1 << 4);
19990 gcc_assert (mantissa <= 15);
19991
19992 /* GCC internally does not use IEEE754-like encoding (where normalized
19993 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
19994 Our mantissa values are shifted 4 places to the left relative to
19995 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19996 by 5 places to correct for GCC's representation. */
19997 exponent = 5 - exponent;
19998
19999 return (exponent >= 0 && exponent <= 7);
20000 }
20001
20002 /* Return the output template for moving CONST_VECTOR, an AdvSIMD vector of
20003 WIDTH bits, into a register using a MOVI, MVNI, ORR or BIC immediate.
20004 WHICH selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
20005 char*
20006 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
20007 enum simd_immediate_check which)
20008 {
20009 bool is_valid;
20010 static char templ[40];
20011 const char *mnemonic;
20012 const char *shift_op;
20013 unsigned int lane_count = 0;
20014 char element_char;
20015
20016 struct simd_immediate_info info;
20017
20018 /* This will return true to show const_vector is legal for use as either
20019 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
20020 It will also update INFO to show how the immediate should be generated.
20021 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
20022 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
20023 gcc_assert (is_valid);
20024
20025 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20026 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
20027
20028 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20029 {
20030 gcc_assert (info.insn == simd_immediate_info::MOV
20031 && info.u.mov.shift == 0);
20032 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
20033 move immediate path. */
20034 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20035 info.u.mov.value = GEN_INT (0);
20036 else
20037 {
20038 const unsigned int buf_size = 20;
20039 char float_buf[buf_size] = {'\0'};
20040 real_to_decimal_for_mode (float_buf,
20041 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
20042 buf_size, buf_size, 1, info.elt_mode);
20043
20044 if (lane_count == 1)
20045 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
20046 else
20047 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
20048 lane_count, element_char, float_buf);
20049 return templ;
20050 }
20051 }
20052
20053 gcc_assert (CONST_INT_P (info.u.mov.value));
20054
20055 if (which == AARCH64_CHECK_MOV)
20056 {
20057 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
20058 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
20059 ? "msl" : "lsl");
20060 if (lane_count == 1)
20061 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
20062 mnemonic, UINTVAL (info.u.mov.value));
20063 else if (info.u.mov.shift)
20064 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20065 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
20066 element_char, UINTVAL (info.u.mov.value), shift_op,
20067 info.u.mov.shift);
20068 else
20069 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20070 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
20071 element_char, UINTVAL (info.u.mov.value));
20072 }
20073 else
20074 {
20075 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
20076 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
20077 if (info.u.mov.shift)
20078 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20079 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
20080 element_char, UINTVAL (info.u.mov.value), "lsl",
20081 info.u.mov.shift);
20082 else
20083 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20084 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
20085 element_char, UINTVAL (info.u.mov.value));
20086 }
20087 return templ;
20088 }
20089
20090 char*
20091 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
20092 {
20093
20094 /* If a floating point number was passed and we want to use it in an
20095 integer mode, do the conversion to integer. */
20096 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
20097 {
20098 unsigned HOST_WIDE_INT ival;
20099 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
20100 gcc_unreachable ();
20101 immediate = gen_int_mode (ival, mode);
20102 }
20103
20104 machine_mode vmode;
20105 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
20106 a 128-bit vector mode. */
20107 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
20108
20109 vmode = aarch64_simd_container_mode (mode, width);
20110 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
20111 return aarch64_output_simd_mov_immediate (v_op, width);
20112 }
20113
20114 /* Return the output string to use for moving immediate CONST_VECTOR
20115 into an SVE register. */
20116
20117 char *
20118 aarch64_output_sve_mov_immediate (rtx const_vector)
20119 {
20120 static char templ[40];
20121 struct simd_immediate_info info;
20122 char element_char;
20123
20124 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
20125 gcc_assert (is_valid);
20126
20127 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20128
20129 machine_mode vec_mode = GET_MODE (const_vector);
20130 if (aarch64_sve_pred_mode_p (vec_mode))
20131 {
20132 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
20133 if (info.insn == simd_immediate_info::MOV)
20134 {
20135 gcc_assert (info.u.mov.value == const0_rtx);
20136 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
20137 }
20138 else
20139 {
20140 gcc_assert (info.insn == simd_immediate_info::PTRUE);
20141 unsigned int total_bytes;
20142 if (info.u.pattern == AARCH64_SV_ALL
20143 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
20144 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
20145 total_bytes / GET_MODE_SIZE (info.elt_mode));
20146 else
20147 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
20148 svpattern_token (info.u.pattern));
20149 }
20150 return buf;
20151 }
20152
20153 if (info.insn == simd_immediate_info::INDEX)
20154 {
20155 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
20156 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
20157 element_char, INTVAL (info.u.index.base),
20158 INTVAL (info.u.index.step));
20159 return templ;
20160 }
20161
20162 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20163 {
20164 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20165 info.u.mov.value = GEN_INT (0);
20166 else
20167 {
20168 const int buf_size = 20;
20169 char float_buf[buf_size] = {};
20170 real_to_decimal_for_mode (float_buf,
20171 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
20172 buf_size, buf_size, 1, info.elt_mode);
20173
20174 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
20175 element_char, float_buf);
20176 return templ;
20177 }
20178 }
20179
20180 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
20181 element_char, INTVAL (info.u.mov.value));
20182 return templ;
20183 }
20184
20185 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
20186 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
20187 pattern. */
20188
20189 char *
20190 aarch64_output_sve_ptrues (rtx const_unspec)
20191 {
20192 static char templ[40];
20193
20194 struct simd_immediate_info info;
20195 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
20196 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
20197
20198 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20199 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
20200 svpattern_token (info.u.pattern));
20201 return templ;
20202 }
20203
20204 /* Split the assignment of OP[1] + OP[2] to OP[0] into individual register moves. */
20205
20206 void
20207 aarch64_split_combinev16qi (rtx operands[3])
20208 {
20209 unsigned int dest = REGNO (operands[0]);
20210 unsigned int src1 = REGNO (operands[1]);
20211 unsigned int src2 = REGNO (operands[2]);
20212 machine_mode halfmode = GET_MODE (operands[1]);
20213 unsigned int halfregs = REG_NREGS (operands[1]);
20214 rtx destlo, desthi;
20215
20216 gcc_assert (halfmode == V16QImode);
20217
20218 if (src1 == dest && src2 == dest + halfregs)
20219 {
20220 /* No-op move. Can't split to nothing; emit something. */
20221 emit_note (NOTE_INSN_DELETED);
20222 return;
20223 }
20224
20225 /* Preserve register attributes for variable tracking. */
20226 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
20227 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
20228 GET_MODE_SIZE (halfmode));
20229
20230 /* Special case of reversed high/low parts; swap the two halves in place with three XORs. */
20231 if (reg_overlap_mentioned_p (operands[2], destlo)
20232 && reg_overlap_mentioned_p (operands[1], desthi))
20233 {
20234 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20235 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
20236 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20237 }
20238 else if (!reg_overlap_mentioned_p (operands[2], destlo))
20239 {
20240 /* Try to avoid unnecessary moves if part of the result
20241 is in the right place already. */
20242 if (src1 != dest)
20243 emit_move_insn (destlo, operands[1]);
20244 if (src2 != dest + halfregs)
20245 emit_move_insn (desthi, operands[2]);
20246 }
20247 else
20248 {
20249 if (src2 != dest + halfregs)
20250 emit_move_insn (desthi, operands[2]);
20251 if (src1 != dest)
20252 emit_move_insn (destlo, operands[1]);
20253 }
20254 }
20255
20256 /* vec_perm support. */
20257
20258 struct expand_vec_perm_d
20259 {
20260 rtx target, op0, op1;
20261 vec_perm_indices perm;
20262 machine_mode vmode;
20263 unsigned int vec_flags;
20264 bool one_vector_p;
20265 bool testing_p;
20266 };
20267
20268 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
20269
20270 /* Generate a variable permutation. */
20271
20272 static void
20273 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20274 {
20275 machine_mode vmode = GET_MODE (target);
20276 bool one_vector_p = rtx_equal_p (op0, op1);
20277
20278 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20279 gcc_checking_assert (GET_MODE (op0) == vmode);
20280 gcc_checking_assert (GET_MODE (op1) == vmode);
20281 gcc_checking_assert (GET_MODE (sel) == vmode);
20282 gcc_checking_assert (TARGET_SIMD);
20283
20284 if (one_vector_p)
20285 {
20286 if (vmode == V8QImode)
20287 {
20288 /* Expand the argument to a V16QI mode by duplicating it. */
20289 rtx pair = gen_reg_rtx (V16QImode);
20290 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20291 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20292 }
20293 else
20294 {
20295 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20296 }
20297 }
20298 else
20299 {
20300 rtx pair;
20301
20302 if (vmode == V8QImode)
20303 {
20304 pair = gen_reg_rtx (V16QImode);
20305 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20306 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20307 }
20308 else
20309 {
20310 pair = gen_reg_rtx (OImode);
20311 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20312 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20313 }
20314 }
20315 }
20316
20317 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20318 NELT is the number of elements in the vector. */
20319
20320 void
20321 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20322 unsigned int nelt)
20323 {
20324 machine_mode vmode = GET_MODE (target);
20325 bool one_vector_p = rtx_equal_p (op0, op1);
20326 rtx mask;
20327
20328 /* The TBL instruction does not use a modulo index, so we must take care
20329 of that ourselves. */
20330 mask = aarch64_simd_gen_const_vector_dup (vmode,
20331 one_vector_p ? nelt - 1 : 2 * nelt - 1);
20332 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20333
20334 /* For big-endian, we also need to reverse the index within the vector
20335 (but not which vector). */
20336 if (BYTES_BIG_ENDIAN)
20337 {
20338 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
20339 if (!one_vector_p)
20340 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20341 sel = expand_simple_binop (vmode, XOR, sel, mask,
20342 NULL, 0, OPTAB_LIB_WIDEN);
20343 }
20344 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
20345 }
20346
20347 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
20348
20349 static void
20350 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20351 {
20352 emit_insn (gen_rtx_SET (target,
20353 gen_rtx_UNSPEC (GET_MODE (target),
20354 gen_rtvec (2, op0, op1), code)));
20355 }
20356
20357 /* Expand an SVE vec_perm with the given operands. */
20358
20359 void
20360 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20361 {
20362 machine_mode data_mode = GET_MODE (target);
20363 machine_mode sel_mode = GET_MODE (sel);
20364 /* Enforced by the pattern condition. */
20365 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20366
20367 /* Note: vec_perm indices are supposed to wrap when they go beyond the
20368 size of the two value vectors, i.e. the upper bits of the indices
20369 are effectively ignored. SVE TBL instead produces 0 for any
20370 out-of-range indices, so we need to modulo all the vec_perm indices
20371 to ensure they are all in range. */
20372 rtx sel_reg = force_reg (sel_mode, sel);
20373
20374 /* Check if SEL only references the first value vector. */
20375 if (GET_CODE (sel) == CONST_VECTOR
20376 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20377 {
20378 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20379 return;
20380 }
20381
20382 /* Check if the two value vectors are the same. */
20383 if (rtx_equal_p (op0, op1))
20384 {
20385 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20386 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20387 NULL, 0, OPTAB_DIRECT);
20388 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20389 return;
20390 }
20391
20392 /* Run TBL for each value vector and combine the results. */
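/* For example, with four-element vectors and SEL = { 0, 5, 2, 7 }:

	res0 = TBL (op0, { 0, 5, 2, 7 })    -> { op0[0], 0, op0[2], 0 }
	res1 = TBL (op1, { -4, 1, -2, 3 })  -> { 0, op1[1], 0, op1[3] }
	target = res0 | res1

   which gives { op0[0], op1[1], op0[2], op1[3] }.  */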
20393
20394 rtx res0 = gen_reg_rtx (data_mode);
20395 rtx res1 = gen_reg_rtx (data_mode);
20396 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20397 if (GET_CODE (sel) != CONST_VECTOR
20398 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20399 {
20400 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20401 2 * nunits - 1);
20402 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20403 NULL, 0, OPTAB_DIRECT);
20404 }
20405 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20406 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20407 NULL, 0, OPTAB_DIRECT);
20408 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20409 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20410 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20411 else
20412 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20413 }
20414
20415 /* Recognize patterns suitable for the TRN instructions. */
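/* For example, with two 4-element vectors a TRN1 corresponds to the
   permutation { 0, 4, 2, 6 } and a TRN2 to { 1, 5, 3, 7 }.  */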
20416 static bool
20417 aarch64_evpc_trn (struct expand_vec_perm_d *d)
20418 {
20419 HOST_WIDE_INT odd;
20420 poly_uint64 nelt = d->perm.length ();
20421 rtx out, in0, in1, x;
20422 machine_mode vmode = d->vmode;
20423
20424 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20425 return false;
20426
20427 /* Note that these are little-endian tests.
20428 We correct for big-endian later. */
20429 if (!d->perm[0].is_constant (&odd)
20430 || (odd != 0 && odd != 1)
20431 || !d->perm.series_p (0, 2, odd, 2)
20432 || !d->perm.series_p (1, 2, nelt + odd, 2))
20433 return false;
20434
20435 /* Success! */
20436 if (d->testing_p)
20437 return true;
20438
20439 in0 = d->op0;
20440 in1 = d->op1;
20441 /* We don't need a big-endian lane correction for SVE; see the comment
20442 at the head of aarch64-sve.md for details. */
20443 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20444 {
20445 x = in0, in0 = in1, in1 = x;
20446 odd = !odd;
20447 }
20448 out = d->target;
20449
20450 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20451 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
20452 return true;
20453 }
20454
20455 /* Try to re-encode the PERM constant so that it combines adjacent even/odd
20456 element pairs. This rewrites constants such as {0, 1, 4, 5}/V4SF to
20457 {0, 2}/V2DI. We then retry the full suite of patterns with the new constant. */
20458 static bool
20459 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
20460 {
20461 expand_vec_perm_d newd;
20462 unsigned HOST_WIDE_INT nelt;
20463
20464 if (d->vec_flags != VEC_ADVSIMD)
20465 return false;
20466
20467 /* Get the new mode. Always twice the size of the inner
20468 and half the elements. */
20469 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
20470 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
20471 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
20472 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
20473
20474 if (new_mode == word_mode)
20475 return false;
20476
20477 /* to_constant is safe since this routine is specific to Advanced SIMD
20478 vectors. */
20479 nelt = d->perm.length ().to_constant ();
20480
20481 vec_perm_builder newpermconst;
20482 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
20483
20484 /* Convert the perm constant if we can. Require even, odd as the pairs. */
20485 for (unsigned int i = 0; i < nelt; i += 2)
20486 {
20487 poly_int64 elt0 = d->perm[i];
20488 poly_int64 elt1 = d->perm[i + 1];
20489 poly_int64 newelt;
20490 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
20491 return false;
20492 newpermconst.quick_push (newelt.to_constant ());
20493 }
20494 newpermconst.finalize ();
20495
20496 newd.vmode = new_mode;
20497 newd.vec_flags = VEC_ADVSIMD;
20498 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
20499 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
20500 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
20501 newd.testing_p = d->testing_p;
20502 newd.one_vector_p = d->one_vector_p;
20503
20504 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
20505 return aarch64_expand_vec_perm_const_1 (&newd);
20506 }
20507
20508 /* Recognize patterns suitable for the UZP instructions. */
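/* For example, with two 4-element vectors a UZP1 corresponds to the
   permutation { 0, 2, 4, 6 } and a UZP2 to { 1, 3, 5, 7 }.  */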
20509 static bool
20510 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20511 {
20512 HOST_WIDE_INT odd;
20513 rtx out, in0, in1, x;
20514 machine_mode vmode = d->vmode;
20515
20516 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20517 return false;
20518
20519 /* Note that these are little-endian tests.
20520 We correct for big-endian later. */
20521 if (!d->perm[0].is_constant (&odd)
20522 || (odd != 0 && odd != 1)
20523 || !d->perm.series_p (0, 1, odd, 2))
20524 return false;
20525
20526 /* Success! */
20527 if (d->testing_p)
20528 return true;
20529
20530 in0 = d->op0;
20531 in1 = d->op1;
20532 /* We don't need a big-endian lane correction for SVE; see the comment
20533 at the head of aarch64-sve.md for details. */
20534 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20535 {
20536 x = in0, in0 = in1, in1 = x;
20537 odd = !odd;
20538 }
20539 out = d->target;
20540
20541 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20542 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
20543 return true;
20544 }
20545
20546 /* Recognize patterns suitable for the ZIP instructions. */
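/* For example, with two 4-element vectors a ZIP1 corresponds to the
   permutation { 0, 4, 1, 5 } and a ZIP2 to { 2, 6, 3, 7 }.  */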
20547 static bool
20548 aarch64_evpc_zip (struct expand_vec_perm_d *d)
20549 {
20550 unsigned int high;
20551 poly_uint64 nelt = d->perm.length ();
20552 rtx out, in0, in1, x;
20553 machine_mode vmode = d->vmode;
20554
20555 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20556 return false;
20557
20558 /* Note that these are little-endian tests.
20559 We correct for big-endian later. */
20560 poly_uint64 first = d->perm[0];
20561 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20562 || !d->perm.series_p (0, 2, first, 1)
20563 || !d->perm.series_p (1, 2, first + nelt, 1))
20564 return false;
20565 high = maybe_ne (first, 0U);
20566
20567 /* Success! */
20568 if (d->testing_p)
20569 return true;
20570
20571 in0 = d->op0;
20572 in1 = d->op1;
20573 /* We don't need a big-endian lane correction for SVE; see the comment
20574 at the head of aarch64-sve.md for details. */
20575 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20576 {
20577 x = in0, in0 = in1, in1 = x;
20578 high = !high;
20579 }
20580 out = d->target;
20581
20582 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20583 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
20584 return true;
20585 }
20586
20587 /* Recognize patterns for the EXT insn. */
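/* For example, with two 4-element vectors the permutation { 1, 2, 3, 4 }
   corresponds to an EXT with an offset of one element.  */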
20588
20589 static bool
20590 aarch64_evpc_ext (struct expand_vec_perm_d *d)
20591 {
20592 HOST_WIDE_INT location;
20593 rtx offset;
20594
20595 /* The first element always refers to the first vector.
20596 Check if the extracted indices are increasing by one. */
20597 if (d->vec_flags == VEC_SVE_PRED
20598 || !d->perm[0].is_constant (&location)
20599 || !d->perm.series_p (0, 1, location, 1))
20600 return false;
20601
20602 /* Success! */
20603 if (d->testing_p)
20604 return true;
20605
20606 /* The case where (location == 0) is a no-op for both big- and little-endian,
20607 and is removed by the mid-end at optimization levels -O1 and higher.
20608
20609 We don't need a big-endian lane correction for SVE; see the comment
20610 at the head of aarch64-sve.md for details. */
20611 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
20612 {
20613 /* After setup, we want the high elements of the first vector (stored
20614 at the LSB end of the register), and the low elements of the second
20615 vector (stored at the MSB end of the register). So swap. */
20616 std::swap (d->op0, d->op1);
20617 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20618 to_constant () is safe since this is restricted to Advanced SIMD
20619 vectors. */
20620 location = d->perm.length ().to_constant () - location;
20621 }
20622
20623 offset = GEN_INT (location);
20624 emit_set_insn (d->target,
20625 gen_rtx_UNSPEC (d->vmode,
20626 gen_rtvec (3, d->op0, d->op1, offset),
20627 UNSPEC_EXT));
20628 return true;
20629 }
20630
20631 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20632 within each 64-bit, 32-bit or 16-bit granule. */
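/* For example, for V8HI the permutation { 1, 0, 3, 2, 5, 4, 7, 6 } reverses
   the two 16-bit elements within each 32-bit granule and so maps to REV32.  */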
20633
20634 static bool
20635 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
20636 {
20637 HOST_WIDE_INT diff;
20638 unsigned int i, size, unspec;
20639 machine_mode pred_mode;
20640
20641 if (d->vec_flags == VEC_SVE_PRED
20642 || !d->one_vector_p
20643 || !d->perm[0].is_constant (&diff)
20644 || !diff)
20645 return false;
20646
20647 if (d->vec_flags & VEC_SVE_DATA)
20648 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
20649 else
20650 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
20651 if (size == 64)
20652 {
20653 unspec = UNSPEC_REV64;
20654 pred_mode = VNx2BImode;
20655 }
20656 else if (size == 32)
20657 {
20658 unspec = UNSPEC_REV32;
20659 pred_mode = VNx4BImode;
20660 }
20661 else if (size == 16)
20662 {
20663 unspec = UNSPEC_REV16;
20664 pred_mode = VNx8BImode;
20665 }
20666 else
20667 return false;
20668
20669 unsigned int step = diff + 1;
20670 for (i = 0; i < step; ++i)
20671 if (!d->perm.series_p (i, step, diff - i, step))
20672 return false;
20673
20674 /* Success! */
20675 if (d->testing_p)
20676 return true;
20677
20678 if (d->vec_flags & VEC_SVE_DATA)
20679 {
20680 rtx pred = aarch64_ptrue_reg (pred_mode);
20681 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
20682 d->target, pred, d->op0));
20683 return true;
20684 }
20685 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
20686 emit_set_insn (d->target, src);
20687 return true;
20688 }
20689
20690 /* Recognize patterns for the REV insn, which reverses elements within
20691 a full vector. */
20692
20693 static bool
20694 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20695 {
20696 poly_uint64 nelt = d->perm.length ();
20697
20698 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
20699 return false;
20700
20701 if (!d->perm.series_p (0, 1, nelt - 1, -1))
20702 return false;
20703
20704 /* Success! */
20705 if (d->testing_p)
20706 return true;
20707
20708 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20709 emit_set_insn (d->target, src);
20710 return true;
20711 }
20712
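/* Recognize a permutation that duplicates a single element across the
   whole vector, which can be implemented with a DUP of that lane.  */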
20713 static bool
20714 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20715 {
20716 rtx out = d->target;
20717 rtx in0;
20718 HOST_WIDE_INT elt;
20719 machine_mode vmode = d->vmode;
20720 rtx lane;
20721
20722 if (d->vec_flags == VEC_SVE_PRED
20723 || d->perm.encoding ().encoded_nelts () != 1
20724 || !d->perm[0].is_constant (&elt))
20725 return false;
20726
20727 if ((d->vec_flags & VEC_SVE_DATA)
20728 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
20729 return false;
20730
20731 /* Success! */
20732 if (d->testing_p)
20733 return true;
20734
20735 /* The generic preparation in aarch64_expand_vec_perm_const_1
20736 swaps the operand order and the permute indices if it finds
20737 d->perm[0] to be in the second operand. Thus, we can always
20738 use d->op0 and need not do any extra arithmetic to get the
20739 correct lane number. */
20740 in0 = d->op0;
20741 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
20742
20743 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20744 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20745 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20746 return true;
20747 }
20748
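/* Try to implement D using an Advanced SIMD TBL instruction. */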
20749 static bool
20750 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20751 {
20752 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20753 machine_mode vmode = d->vmode;
20754
20755 /* Make sure that the indices are constant. */
20756 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20757 for (unsigned int i = 0; i < encoded_nelts; ++i)
20758 if (!d->perm[i].is_constant ())
20759 return false;
20760
20761 if (d->testing_p)
20762 return true;
20763
20764 /* Generic code will try constant permutation twice: once with the
20765 original mode and again with the elements lowered to QImode.
20766 So wait and don't do the selector expansion ourselves. */
20767 if (vmode != V8QImode && vmode != V16QImode)
20768 return false;
20769
20770 /* to_constant is safe since this routine is specific to Advanced SIMD
20771 vectors. */
20772 unsigned int nelt = d->perm.length ().to_constant ();
20773 for (unsigned int i = 0; i < nelt; ++i)
20774 /* If big-endian and two vectors we end up with a weird mixed-endian
20775 mode on NEON. Reverse the index within each word but not the word
20776 itself. to_constant is safe because we checked is_constant above. */
20777 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20778 ? d->perm[i].to_constant () ^ (nelt - 1)
20779 : d->perm[i].to_constant ());
20780
20781 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20782 sel = force_reg (vmode, sel);
20783
20784 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20785 return true;
20786 }
20787
20788 /* Try to implement D using an SVE TBL instruction. */
20789
20790 static bool
20791 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20792 {
20793 unsigned HOST_WIDE_INT nelt;
20794
20795 /* Permuting two variable-length vectors could overflow the
20796 index range. */
20797 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20798 return false;
20799
20800 if (d->testing_p)
20801 return true;
20802
20803 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20804 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20805 if (d->one_vector_p)
20806 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20807 else
20808 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20809 return true;
20810 }
20811
20812 /* Try to implement D using the SVE SEL instruction. */
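/* For example, with two VNx4SImode inputs and N elements per vector,
   the selector { 0, N + 1, 2, N + 3, ... } takes even-numbered elements
   from OP0 and odd-numbered elements from OP1, and so can be implemented
   as a SEL with an alternating predicate. */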
20813
20814 static bool
20815 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20816 {
20817 machine_mode vmode = d->vmode;
20818 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20819
20820 if (d->vec_flags != VEC_SVE_DATA
20821 || unit_size > 8)
20822 return false;
20823
20824 int n_patterns = d->perm.encoding ().npatterns ();
20825 poly_int64 vec_len = d->perm.length ();
20826
20827 for (int i = 0; i < n_patterns; ++i)
20828 if (!known_eq (d->perm[i], i)
20829 && !known_eq (d->perm[i], vec_len + i))
20830 return false;
20831
20832 for (int i = n_patterns; i < n_patterns * 2; i++)
20833 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20834 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20835 return false;
20836
20837 if (d->testing_p)
20838 return true;
20839
20840 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20841
20842 /* Build a predicate that is true when op0 elements should be used. */
20843 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20844 for (int i = 0; i < n_patterns * 2; i++)
20845 {
20846 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20847 : CONST0_RTX (BImode);
20848 builder.quick_push (elem);
20849 }
20850
20851 rtx const_vec = builder.build ();
20852 rtx pred = force_reg (pred_mode, const_vec);
20853 /* TARGET = PRED ? OP0 : OP1. */
20854 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20855 return true;
20856 }
20857
20858 /* Recognize patterns suitable for the INS instructions. */
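/* For example, the V4SImode selector { 0, 1, 6, 3 } copies element 2 of
   OP1 into lane 2 of OP0 and so maps to an INS of that element
   (preceded by a copy of OP0 into the target). */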
20859 static bool
20860 aarch64_evpc_ins (struct expand_vec_perm_d *d)
20861 {
20862 machine_mode mode = d->vmode;
20863 unsigned HOST_WIDE_INT nelt;
20864
20865 if (d->vec_flags != VEC_ADVSIMD)
20866 return false;
20867
20868 /* to_constant is safe since this routine is specific to Advanced SIMD
20869 vectors. */
20870 nelt = d->perm.length ().to_constant ();
20871 rtx insv = d->op0;
20872
20873 HOST_WIDE_INT idx = -1;
20874
20875 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20876 {
20877 HOST_WIDE_INT elt;
20878 if (!d->perm[i].is_constant (&elt))
20879 return false;
20880 if (elt == (HOST_WIDE_INT) i)
20881 continue;
20882 if (idx != -1)
20883 {
20884 idx = -1;
20885 break;
20886 }
20887 idx = i;
20888 }
20889
20890 if (idx == -1)
20891 {
20892 insv = d->op1;
20893 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20894 {
20895 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
20896 continue;
20897 if (idx != -1)
20898 return false;
20899 idx = i;
20900 }
20901
20902 if (idx == -1)
20903 return false;
20904 }
20905
20906 if (d->testing_p)
20907 return true;
20908
20909 gcc_assert (idx != -1);
20910
20911 unsigned extractindex = d->perm[idx].to_constant ();
20912 rtx extractv = d->op0;
20913 if (extractindex >= nelt)
20914 {
20915 extractv = d->op1;
20916 extractindex -= nelt;
20917 }
20918 gcc_assert (extractindex < nelt);
20919
20920 emit_move_insn (d->target, insv);
20921 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
20922 expand_operand ops[5];
20923 create_output_operand (&ops[0], d->target, mode);
20924 create_input_operand (&ops[1], d->target, mode);
20925 create_integer_operand (&ops[2], 1 << idx);
20926 create_input_operand (&ops[3], extractv, mode);
20927 create_integer_operand (&ops[4], extractindex);
20928 expand_insn (icode, 5, ops);
20929
20930 return true;
20931 }
20932
20933 static bool
20934 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20935 {
20936 /* The pattern matching functions above are written to look for a small
20937 number to begin the sequence (0, 1, N/2). If we begin with an index
20938 from the second operand, we can swap the operands. */
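/* For example, with two V4SImode inputs, the selector { 5, 1, 6, 2 }
   is rewritten as { 1, 5, 2, 6 } with OP0 and OP1 swapped, leaving the
   overall selection unchanged. */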
20939 poly_int64 nelt = d->perm.length ();
20940 if (known_ge (d->perm[0], nelt))
20941 {
20942 d->perm.rotate_inputs (1);
20943 std::swap (d->op0, d->op1);
20944 }
20945
20946 if ((d->vec_flags == VEC_ADVSIMD
20947 || d->vec_flags == VEC_SVE_DATA
20948 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
20949 || d->vec_flags == VEC_SVE_PRED)
20950 && known_gt (nelt, 1))
20951 {
20952 if (aarch64_evpc_rev_local (d))
20953 return true;
20954 else if (aarch64_evpc_rev_global (d))
20955 return true;
20956 else if (aarch64_evpc_ext (d))
20957 return true;
20958 else if (aarch64_evpc_dup (d))
20959 return true;
20960 else if (aarch64_evpc_zip (d))
20961 return true;
20962 else if (aarch64_evpc_uzp (d))
20963 return true;
20964 else if (aarch64_evpc_trn (d))
20965 return true;
20966 else if (aarch64_evpc_sel (d))
20967 return true;
20968 else if (aarch64_evpc_ins (d))
20969 return true;
20970 else if (aarch64_evpc_reencode (d))
20971 return true;
20972 if (d->vec_flags == VEC_SVE_DATA)
20973 return aarch64_evpc_sve_tbl (d);
20974 else if (d->vec_flags == VEC_ADVSIMD)
20975 return aarch64_evpc_tbl (d);
20976 }
20977 return false;
20978 }
20979
20980 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
20981
20982 static bool
20983 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20984 rtx op1, const vec_perm_indices &sel)
20985 {
20986 struct expand_vec_perm_d d;
20987
20988 /* Check whether the mask can be applied to a single vector. */
20989 if (sel.ninputs () == 1
20990 || (op0 && rtx_equal_p (op0, op1)))
20991 d.one_vector_p = true;
20992 else if (sel.all_from_input_p (0))
20993 {
20994 d.one_vector_p = true;
20995 op1 = op0;
20996 }
20997 else if (sel.all_from_input_p (1))
20998 {
20999 d.one_vector_p = true;
21000 op0 = op1;
21001 }
21002 else
21003 d.one_vector_p = false;
21004
21005 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
21006 sel.nelts_per_input ());
21007 d.vmode = vmode;
21008 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
21009 d.target = target;
21010 d.op0 = op0;
21011 d.op1 = op1;
21012 d.testing_p = !target;
21013
21014 if (!d.testing_p)
21015 return aarch64_expand_vec_perm_const_1 (&d);
21016
21017 rtx_insn *last = get_last_insn ();
21018 bool ret = aarch64_expand_vec_perm_const_1 (&d);
21019 gcc_assert (last == get_last_insn ());
21020
21021 return ret;
21022 }
21023
21024 /* Generate a byte permute mask for a register of mode MODE,
21025 which has NUNITS units. */
21026
21027 rtx
21028 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
21029 {
21030 /* We have to reverse each vector because we don't have
21031 a permuted load that can reverse-load according to ABI rules. */
21032 rtx mask;
21033 rtvec v = rtvec_alloc (16);
21034 unsigned int i, j;
21035 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
21036
21037 gcc_assert (BYTES_BIG_ENDIAN);
21038 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
21039
21040 for (i = 0; i < nunits; i++)
21041 for (j = 0; j < usize; j++)
21042 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
21043 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
21044 return force_reg (V16QImode, mask);
21045 }
21046
21047 /* Expand an SVE integer comparison using the SVE equivalent of:
21048
21049 (set TARGET (CODE OP0 OP1)). */
21050
21051 void
21052 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
21053 {
21054 machine_mode pred_mode = GET_MODE (target);
21055 machine_mode data_mode = GET_MODE (op0);
21056 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
21057 op0, op1);
21058 if (!rtx_equal_p (target, res))
21059 emit_move_insn (target, res);
21060 }
21061
21062 /* Return the UNSPEC_COND_* code for comparison CODE. */
21063
21064 static unsigned int
21065 aarch64_unspec_cond_code (rtx_code code)
21066 {
21067 switch (code)
21068 {
21069 case NE:
21070 return UNSPEC_COND_FCMNE;
21071 case EQ:
21072 return UNSPEC_COND_FCMEQ;
21073 case LT:
21074 return UNSPEC_COND_FCMLT;
21075 case GT:
21076 return UNSPEC_COND_FCMGT;
21077 case LE:
21078 return UNSPEC_COND_FCMLE;
21079 case GE:
21080 return UNSPEC_COND_FCMGE;
21081 case UNORDERED:
21082 return UNSPEC_COND_FCMUO;
21083 default:
21084 gcc_unreachable ();
21085 }
21086 }
21087
21088 /* Emit:
21089
21090 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
21091
21092 where <X> is the operation associated with comparison CODE.
21093 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
21094
21095 static void
21096 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
21097 bool known_ptrue_p, rtx op0, rtx op1)
21098 {
21099 rtx flag = gen_int_mode (known_ptrue_p, SImode);
21100 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
21101 gen_rtvec (4, pred, flag, op0, op1),
21102 aarch64_unspec_cond_code (code));
21103 emit_set_insn (target, unspec);
21104 }
21105
21106 /* Emit the SVE equivalent of:
21107
21108 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
21109 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
21110 (set TARGET (ior:PRED_MODE TMP1 TMP2))
21111
21112 where <Xi> is the operation associated with comparison CODEi.
21113 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
21114
21115 static void
21116 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
21117 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
21118 {
21119 machine_mode pred_mode = GET_MODE (pred);
21120 rtx tmp1 = gen_reg_rtx (pred_mode);
21121 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
21122 rtx tmp2 = gen_reg_rtx (pred_mode);
21123 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
21124 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
21125 }
21126
21127 /* Emit the SVE equivalent of:
21128
21129 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
21130 (set TARGET (not TMP))
21131
21132 where <X> is the operation associated with comparison CODE.
21133 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
21134
21135 static void
21136 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
21137 bool known_ptrue_p, rtx op0, rtx op1)
21138 {
21139 machine_mode pred_mode = GET_MODE (pred);
21140 rtx tmp = gen_reg_rtx (pred_mode);
21141 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
21142 aarch64_emit_unop (target, one_cmpl_optab, tmp);
21143 }
21144
21145 /* Expand an SVE floating-point comparison using the SVE equivalent of:
21146
21147 (set TARGET (CODE OP0 OP1))
21148
21149 If CAN_INVERT_P is true, the caller can also handle inverted results;
21150 return true if the result is in fact inverted. */
21151
21152 bool
21153 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
21154 rtx op0, rtx op1, bool can_invert_p)
21155 {
21156 machine_mode pred_mode = GET_MODE (target);
21157 machine_mode data_mode = GET_MODE (op0);
21158
21159 rtx ptrue = aarch64_ptrue_reg (pred_mode);
21160 switch (code)
21161 {
21162 case UNORDERED:
21163 /* UNORDERED has no immediate form. */
21164 op1 = force_reg (data_mode, op1);
21165 /* fall through */
21166 case LT:
21167 case LE:
21168 case GT:
21169 case GE:
21170 case EQ:
21171 case NE:
21172 {
21173 /* There is native support for the comparison. */
21174 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21175 return false;
21176 }
21177
21178 case LTGT:
21179 /* This is a trapping operation (LT or GT). */
21180 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
21181 return false;
21182
21183 case UNEQ:
21184 if (!flag_trapping_math)
21185 {
21186 /* This would trap for signaling NaNs. */
21187 op1 = force_reg (data_mode, op1);
21188 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
21189 ptrue, true, op0, op1);
21190 return false;
21191 }
21192 /* fall through */
21193 case UNLT:
21194 case UNLE:
21195 case UNGT:
21196 case UNGE:
21197 if (flag_trapping_math)
21198 {
21199 /* Work out which elements are ordered. */
21200 rtx ordered = gen_reg_rtx (pred_mode);
21201 op1 = force_reg (data_mode, op1);
21202 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
21203 ptrue, true, op0, op1);
21204
21205 /* Test the opposite condition for the ordered elements,
21206 then invert the result. */
21207 if (code == UNEQ)
21208 code = NE;
21209 else
21210 code = reverse_condition_maybe_unordered (code);
21211 if (can_invert_p)
21212 {
21213 aarch64_emit_sve_fp_cond (target, code,
21214 ordered, false, op0, op1);
21215 return true;
21216 }
21217 aarch64_emit_sve_invert_fp_cond (target, code,
21218 ordered, false, op0, op1);
21219 return false;
21220 }
21221 break;
21222
21223 case ORDERED:
21224 /* ORDERED has no immediate form. */
21225 op1 = force_reg (data_mode, op1);
21226 break;
21227
21228 default:
21229 gcc_unreachable ();
21230 }
21231
21232 /* There is native support for the inverse comparison. */
21233 code = reverse_condition_maybe_unordered (code);
21234 if (can_invert_p)
21235 {
21236 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21237 return true;
21238 }
21239 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
21240 return false;
21241 }
21242
21243 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
21244 of the data being selected and CMP_MODE is the mode of the values being
21245 compared. */
21246
21247 void
21248 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
21249 rtx *ops)
21250 {
21251 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
21252 rtx pred = gen_reg_rtx (pred_mode);
21253 if (FLOAT_MODE_P (cmp_mode))
21254 {
21255 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
21256 ops[4], ops[5], true))
21257 std::swap (ops[1], ops[2]);
21258 }
21259 else
21260 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
21261
21262 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
21263 ops[1] = force_reg (data_mode, ops[1]);
21264 /* The "false" value can only be zero if the "true" value is a constant. */
21265 if (register_operand (ops[1], data_mode)
21266 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
21267 ops[2] = force_reg (data_mode, ops[2]);
21268
21269 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
21270 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
21271 }
21272
21273 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
21274 true. However, due to issues with register allocation it is preferable
21275 to avoid tying integer scalar and FP scalar modes. Executing integer
21276 operations in general registers is better than treating them as scalar
21277 vector operations. This reduces latency and avoids redundant int<->FP
21278 moves. So tie modes if they are either the same class, or vector modes
21279 with other vector modes, vector structs or any scalar mode. */
21280
21281 static bool
21282 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
21283 {
21284 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
21285 return true;
21286
21287 /* We specifically want to allow elements of "structure" modes to
21288 be tieable to the structure. This more general condition allows
21289 other rarer situations too. The reason we don't extend this to
21290 predicate modes is that there are no predicate structure modes
21291 nor any specific instructions for extracting part of a predicate
21292 register. */
21293 if (aarch64_vector_data_mode_p (mode1)
21294 && aarch64_vector_data_mode_p (mode2))
21295 return true;
21296
21297 /* Also allow any scalar modes with vectors. */
21298 if (aarch64_vector_mode_supported_p (mode1)
21299 || aarch64_vector_mode_supported_p (mode2))
21300 return true;
21301
21302 return false;
21303 }
21304
21305 /* Return a new RTX holding the result of moving POINTER forward by
21306 AMOUNT bytes. */
21307
21308 static rtx
21309 aarch64_move_pointer (rtx pointer, poly_int64 amount)
21310 {
21311 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21312
21313 return adjust_automodify_address (pointer, GET_MODE (pointer),
21314 next, amount);
21315 }
21316
21317 /* Return a new RTX holding the result of moving POINTER forward by the
21318 size of the mode it points to. */
21319
21320 static rtx
21321 aarch64_progress_pointer (rtx pointer)
21322 {
21323 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
21324 }
21325
21326 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21327 MODE bytes. */
21328
21329 static void
21330 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
21331 machine_mode mode)
21332 {
21333 /* Handle a 256-bit memcpy separately. We do this as two adjacent 128-bit
21334 copies in V4SImode so that we can use Q registers. */
21335 if (known_eq (GET_MODE_BITSIZE (mode), 256))
21336 {
21337 mode = V4SImode;
21338 rtx reg1 = gen_reg_rtx (mode);
21339 rtx reg2 = gen_reg_rtx (mode);
21340 /* "Cast" the pointers to the correct mode. */
21341 *src = adjust_address (*src, mode, 0);
21342 *dst = adjust_address (*dst, mode, 0);
21343 /* Emit the memcpy. */
21344 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
21345 aarch64_progress_pointer (*src)));
21346 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
21347 aarch64_progress_pointer (*dst), reg2));
21348 /* Move the pointers forward. */
21349 *src = aarch64_move_pointer (*src, 32);
21350 *dst = aarch64_move_pointer (*dst, 32);
21351 return;
21352 }
21353
21354 rtx reg = gen_reg_rtx (mode);
21355
21356 /* "Cast" the pointers to the correct mode. */
21357 *src = adjust_address (*src, mode, 0);
21358 *dst = adjust_address (*dst, mode, 0);
21359 /* Emit the memcpy. */
21360 emit_move_insn (reg, *src);
21361 emit_move_insn (*dst, reg);
21362 /* Move the pointers forward. */
21363 *src = aarch64_progress_pointer (*src);
21364 *dst = aarch64_progress_pointer (*dst);
21365 }
21366
21367 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
21368 we succeed, otherwise return false. */
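/* For example, in the absence of tuning restrictions, a 15-byte copy is
   typically emitted as an 8-byte copy followed by an overlapping 8-byte
   copy of the final 8 bytes, rather than 8 + 4 + 2 + 1 byte copies. */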
21369
21370 bool
21371 aarch64_expand_cpymem (rtx *operands)
21372 {
21373 int mode_bits;
21374 rtx dst = operands[0];
21375 rtx src = operands[1];
21376 rtx base;
21377 machine_mode cur_mode = BLKmode;
21378
21379 /* Only expand fixed-size copies. */
21380 if (!CONST_INT_P (operands[2]))
21381 return false;
21382
21383 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
21384
21385 /* Inline up to 256 bytes when optimizing for speed. */
21386 unsigned HOST_WIDE_INT max_copy_size = 256;
21387
21388 if (optimize_function_for_size_p (cfun))
21389 max_copy_size = 128;
21390
21391 int copy_bits = 256;
21392
21393 /* Default to 256-bit LDP/STP on large copies; fall back to 128-bit chunks
21394 for small copies, when SIMD is unavailable, or when 256-bit LDP/STP is slow. */
21395 if (size <= 24
21396 || !TARGET_SIMD
21397 || (aarch64_tune_params.extra_tuning_flags
21398 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21399 {
21400 copy_bits = 128;
21401 max_copy_size = max_copy_size / 2;
21402 }
21403
21404 if (size > max_copy_size)
21405 return false;
21406
21407 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21408 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21409
21410 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21411 src = adjust_automodify_address (src, VOIDmode, base, 0);
21412
21413 /* Convert size to bits to make the rest of the code simpler. */
21414 int n = size * BITS_PER_UNIT;
21415
21416 while (n > 0)
21417 {
21418 /* Find the largest mode in which to do the copy without over-reading
21419 or over-writing. */
21420 opt_scalar_int_mode mode_iter;
21421 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21422 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
21423 cur_mode = mode_iter.require ();
21424
21425 gcc_assert (cur_mode != BLKmode);
21426
21427 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21428
21429 /* Prefer Q-register accesses for the last bytes. */
21430 if (mode_bits == 128 && copy_bits == 256)
21431 cur_mode = V4SImode;
21432
21433 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
21434
21435 n -= mode_bits;
21436
21437 /* Emit trailing copies using overlapping unaligned accesses - this is
21438 smaller and faster. */
21439 if (n > 0 && n < copy_bits / 2)
21440 {
21441 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
21442 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21443 gcc_assert (n_bits <= mode_bits);
21444 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21445 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21446 n = n_bits;
21447 }
21448 }
21449
21450 return true;
21451 }
21452
21453 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
21454 SRC is a register we have created with the duplicated value to be set. */
21455 static void
21456 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
21457 machine_mode mode)
21458 {
21459 /* If we are setting 128 bits or 256 bits, we can do that straight from
21460 the SIMD register we prepared. */
21461 if (known_eq (GET_MODE_BITSIZE (mode), 256))
21462 {
21463 mode = GET_MODE (src);
21464 /* "Cast" the *dst to the correct mode. */
21465 *dst = adjust_address (*dst, mode, 0);
21466 /* Emit the memset. */
21467 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
21468 aarch64_progress_pointer (*dst), src));
21469
21470 /* Move the pointers forward. */
21471 *dst = aarch64_move_pointer (*dst, 32);
21472 return;
21473 }
21474 if (known_eq (GET_MODE_BITSIZE (mode), 128))
21475 {
21476 /* "Cast" the *dst to the correct mode. */
21477 *dst = adjust_address (*dst, GET_MODE (src), 0);
21478 /* Emit the memset. */
21479 emit_move_insn (*dst, src);
21480 /* Move the pointers forward. */
21481 *dst = aarch64_move_pointer (*dst, 16);
21482 return;
21483 }
21484 /* For smaller sizes, we have to extract the right amount from src. */
21485 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
21486
21487 /* "Cast" the *dst to the correct mode. */
21488 *dst = adjust_address (*dst, mode, 0);
21489 /* Emit the memset. */
21490 emit_move_insn (*dst, reg);
21491 /* Move the pointer forward. */
21492 *dst = aarch64_progress_pointer (*dst);
21493 }
21494
21495 /* Expand setmem, as if from a __builtin_memset. Return true if
21496 we succeed, otherwise return false. */
21497
21498 bool
21499 aarch64_expand_setmem (rtx *operands)
21500 {
21501 int n, mode_bits;
21502 unsigned HOST_WIDE_INT len;
21503 rtx dst = operands[0];
21504 rtx val = operands[2], src;
21505 rtx base;
21506 machine_mode cur_mode = BLKmode, next_mode;
21507
21508 /* We can't do anything smart if the amount to set is not constant. */
21509 if (!CONST_INT_P (operands[1]))
21510 return false;
21511
21512 bool speed_p = !optimize_function_for_size_p (cfun);
21513
21514 /* Default the maximum to 256 bytes. */
21515 unsigned max_set_size = 256;
21516
21517 /* In case we are optimizing for size or if the core does not
21518 want to use STP Q regs, lower the max_set_size. */
21519 max_set_size = (!speed_p
21520 || (aarch64_tune_params.extra_tuning_flags
21521 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21522 ? max_set_size / 2 : max_set_size;
21523
21524 len = INTVAL (operands[1]);
21525
21526 /* Upper bound check. */
21527 if (len > max_set_size)
21528 return false;
21529
21530 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21531 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21532
21533 /* Prepare the val using a DUP/MOVI v0.16B, val. */
21534 src = expand_vector_broadcast (V16QImode, val);
21535 src = force_reg (V16QImode, src);
21536
21537 /* Convert len to bits to make the rest of the code simpler. */
21538 n = len * BITS_PER_UNIT;
21539
21540 /* Maximum amount to set in one go. We allow 256-bit chunks based on the
21541 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. The setmem expand
21542 pattern is only turned on for TARGET_SIMD. */
21543 const int copy_limit = (speed_p
21544 && (aarch64_tune_params.extra_tuning_flags
21545 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
21546 ? GET_MODE_BITSIZE (TImode) : 256;
21547
21548 while (n > 0)
21549 {
21550 /* Find the largest mode in which to do the copy without
21551 overwriting. */
21552 opt_scalar_int_mode mode_iter;
21553 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21554 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
21555 cur_mode = mode_iter.require ();
21556
21557 gcc_assert (cur_mode != BLKmode);
21558
21559 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21560 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
21561
21562 n -= mode_bits;
21563
21564 /* Do certain trailing copies as overlapping if it's going to be
21565 cheaper, i.e. fewer instructions. For instance, for a 15-byte
21566 copy it's more efficient to do two overlapping 8-byte copies than
21567 8 + 4 + 2 + 1. */
21568 if (n > 0 && n < copy_limit / 2)
21569 {
21570 next_mode = smallest_mode_for_size (n, MODE_INT);
21571 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21572 gcc_assert (n_bits <= mode_bits);
21573 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21574 n = n_bits;
21575 }
21576 }
21577
21578 return true;
21579 }
21580
21581
21582 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
21583 SImode stores. Handle the case when the constant has identical
21584 bottom and top halves. This is beneficial when the two stores can be
21585 merged into an STP and we avoid synthesising potentially expensive
21586 immediates twice. Return true if such a split is possible. */
21587
21588 bool
21589 aarch64_split_dimode_const_store (rtx dst, rtx src)
21590 {
21591 rtx lo = gen_lowpart (SImode, src);
21592 rtx hi = gen_highpart_mode (SImode, DImode, src);
21593
21594 bool size_p = optimize_function_for_size_p (cfun);
21595
21596 if (!rtx_equal_p (lo, hi))
21597 return false;
21598
21599 unsigned int orig_cost
21600 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21601 unsigned int lo_cost
21602 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21603
21604 /* We want to transform:
21605 MOV x1, 49370
21606 MOVK x1, 0x140, lsl 16
21607 MOVK x1, 0xc0da, lsl 32
21608 MOVK x1, 0x140, lsl 48
21609 STR x1, [x0]
21610 into:
21611 MOV w1, 49370
21612 MOVK w1, 0x140, lsl 16
21613 STP w1, w1, [x0]
21614 So we want to perform this only when we save two instructions
21615 or more. When optimizing for size, however, accept any code size
21616 savings we can. */
21617 if (size_p && orig_cost <= lo_cost)
21618 return false;
21619
21620 if (!size_p
21621 && (orig_cost <= lo_cost + 1))
21622 return false;
21623
21624 rtx mem_lo = adjust_address (dst, SImode, 0);
21625 if (!aarch64_mem_pair_operand (mem_lo, SImode))
21626 return false;
21627
21628 rtx tmp_reg = gen_reg_rtx (SImode);
21629 aarch64_expand_mov_immediate (tmp_reg, lo);
21630 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
21631 /* Don't emit an explicit store pair as this may not always be profitable.
21632 Let the sched-fusion logic decide whether to merge them. */
21633 emit_move_insn (mem_lo, tmp_reg);
21634 emit_move_insn (mem_hi, tmp_reg);
21635
21636 return true;
21637 }
21638
21639 /* Generate RTL for a conditional branch with rtx comparison CODE in
21640 mode CC_MODE. The destination of the unlikely conditional branch
21641 is LABEL_REF. */
21642
21643 void
21644 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21645 rtx label_ref)
21646 {
21647 rtx x;
21648 x = gen_rtx_fmt_ee (code, VOIDmode,
21649 gen_rtx_REG (cc_mode, CC_REGNUM),
21650 const0_rtx);
21651
21652 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21653 gen_rtx_LABEL_REF (VOIDmode, label_ref),
21654 pc_rtx);
21655 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21656 }
21657
21658 /* Generate DImode scratch registers for 128-bit (TImode) addition.
21659
21660 OP1 represents the TImode destination operand 1
21661 OP2 represents the TImode destination operand 2
21662 LOW_DEST represents the low half (DImode) of TImode operand 0
21663 LOW_IN1 represents the low half (DImode) of TImode operand 1
21664 LOW_IN2 represents the low half (DImode) of TImode operand 2
21665 HIGH_DEST represents the high half (DImode) of TImode operand 0
21666 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21667 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21668
21669 void
21670 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21671 rtx *low_in1, rtx *low_in2,
21672 rtx *high_dest, rtx *high_in1,
21673 rtx *high_in2)
21674 {
21675 *low_dest = gen_reg_rtx (DImode);
21676 *low_in1 = gen_lowpart (DImode, op1);
21677 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21678 subreg_lowpart_offset (DImode, TImode));
21679 *high_dest = gen_reg_rtx (DImode);
21680 *high_in1 = gen_highpart (DImode, op1);
21681 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21682 subreg_highpart_offset (DImode, TImode));
21683 }
21684
21685 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21686
21687 This function differs from 'aarch64_addti_scratch_regs' in that
21688 OP1 can be an immediate constant (zero). We must call
21689 subreg_highpart_offset with DImode and TImode arguments, otherwise
21690 VOIDmode will be used for the const_int which generates an internal
21691 error from subreg_size_highpart_offset which does not expect a size of zero.
21692
21693 OP1 represents the TImode destination operand 1
21694 OP2 represents the TImode destination operand 2
21695 LOW_DEST represents the low half (DImode) of TImode operand 0
21696 LOW_IN1 represents the low half (DImode) of TImode operand 1
21697 LOW_IN2 represents the low half (DImode) of TImode operand 2
21698 HIGH_DEST represents the high half (DImode) of TImode operand 0
21699 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21700 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21701
21702
21703 void
21704 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21705 rtx *low_in1, rtx *low_in2,
21706 rtx *high_dest, rtx *high_in1,
21707 rtx *high_in2)
21708 {
21709 *low_dest = gen_reg_rtx (DImode);
21710 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21711 subreg_lowpart_offset (DImode, TImode));
21712
21713 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21714 subreg_lowpart_offset (DImode, TImode));
21715 *high_dest = gen_reg_rtx (DImode);
21716
21717 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21718 subreg_highpart_offset (DImode, TImode));
21719 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21720 subreg_highpart_offset (DImode, TImode));
21721 }
21722
21723 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
21724
21725 OP0 represents the TImode destination operand 0
21726 LOW_DEST represents the low half (DImode) of TImode operand 0
21727 LOW_IN1 represents the low half (DImode) of TImode operand 1
21728 LOW_IN2 represents the low half (DImode) of TImode operand 2
21729 HIGH_DEST represents the high half (DImode) of TImode operand 0
21730 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21731 HIGH_IN2 represents the high half (DImode) of TImode operand 2
21732 UNSIGNED_P is true if the operation is being performed on unsigned
21733 values. */
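/* For example, when LOW_IN2 is a nonzero register and UNSIGNED_P is true,
   the expansion is typically a SUBS of the low halves followed by an SBCS
   of the high halves, leaving the result of the comparison in the flags. */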
21734 void
21735 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21736 rtx low_in2, rtx high_dest, rtx high_in1,
21737 rtx high_in2, bool unsigned_p)
21738 {
21739 if (low_in2 == const0_rtx)
21740 {
21741 low_dest = low_in1;
21742 high_in2 = force_reg (DImode, high_in2);
21743 if (unsigned_p)
21744 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21745 else
21746 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
21747 }
21748 else
21749 {
21750 if (aarch64_plus_immediate (low_in2, DImode))
21751 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21752 GEN_INT (-INTVAL (low_in2))));
21753 else
21754 {
21755 low_in2 = force_reg (DImode, low_in2);
21756 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
21757 }
21758 high_in2 = force_reg (DImode, high_in2);
21759
21760 if (unsigned_p)
21761 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21762 else
21763 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
21764 }
21765
21766 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21767 emit_move_insn (gen_highpart (DImode, op0), high_dest);
21768
21769 }
21770
21771 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
21772
21773 static unsigned HOST_WIDE_INT
21774 aarch64_asan_shadow_offset (void)
21775 {
21776 if (TARGET_ILP32)
21777 return (HOST_WIDE_INT_1 << 29);
21778 else
21779 return (HOST_WIDE_INT_1 << 36);
21780 }
21781
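/* Implement TARGET_GEN_CCMP_FIRST. Generate the initial comparison of a
   conditional-compare sequence, e.g. the leading CMP or FCMP of a chain
   such as (a == 0 && b == 0). */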
21782 static rtx
21783 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
21784 int code, tree treeop0, tree treeop1)
21785 {
21786 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21787 rtx op0, op1;
21788 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21789 insn_code icode;
21790 struct expand_operand ops[4];
21791
21792 start_sequence ();
21793 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21794
21795 op_mode = GET_MODE (op0);
21796 if (op_mode == VOIDmode)
21797 op_mode = GET_MODE (op1);
21798
21799 switch (op_mode)
21800 {
21801 case E_QImode:
21802 case E_HImode:
21803 case E_SImode:
21804 cmp_mode = SImode;
21805 icode = CODE_FOR_cmpsi;
21806 break;
21807
21808 case E_DImode:
21809 cmp_mode = DImode;
21810 icode = CODE_FOR_cmpdi;
21811 break;
21812
21813 case E_SFmode:
21814 cmp_mode = SFmode;
21815 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21816 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21817 break;
21818
21819 case E_DFmode:
21820 cmp_mode = DFmode;
21821 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21822 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21823 break;
21824
21825 default:
21826 end_sequence ();
21827 return NULL_RTX;
21828 }
21829
21830 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21831 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
21832 if (!op0 || !op1)
21833 {
21834 end_sequence ();
21835 return NULL_RTX;
21836 }
21837 *prep_seq = get_insns ();
21838 end_sequence ();
21839
21840 create_fixed_operand (&ops[0], op0);
21841 create_fixed_operand (&ops[1], op1);
21842
21843 start_sequence ();
21844 if (!maybe_expand_insn (icode, 2, ops))
21845 {
21846 end_sequence ();
21847 return NULL_RTX;
21848 }
21849 *gen_seq = get_insns ();
21850 end_sequence ();
21851
21852 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21853 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
21854 }
21855
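/* Implement TARGET_GEN_CCMP_NEXT. Generate the conditional comparison
   (CCMP or FCCMP) that continues a sequence started by
   aarch64_gen_ccmp_first, performed only when the condition described by
   PREV holds. */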
21856 static rtx
21857 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21858 int cmp_code, tree treeop0, tree treeop1, int bit_code)
21859 {
21860 rtx op0, op1, target;
21861 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21862 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21863 insn_code icode;
21864 struct expand_operand ops[6];
21865 int aarch64_cond;
21866
21867 push_to_sequence (*prep_seq);
21868 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21869
21870 op_mode = GET_MODE (op0);
21871 if (op_mode == VOIDmode)
21872 op_mode = GET_MODE (op1);
21873
21874 switch (op_mode)
21875 {
21876 case E_QImode:
21877 case E_HImode:
21878 case E_SImode:
21879 cmp_mode = SImode;
21880 break;
21881
21882 case E_DImode:
21883 cmp_mode = DImode;
21884 break;
21885
21886 case E_SFmode:
21887 cmp_mode = SFmode;
21888 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21889 break;
21890
21891 case E_DFmode:
21892 cmp_mode = DFmode;
21893 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21894 break;
21895
21896 default:
21897 end_sequence ();
21898 return NULL_RTX;
21899 }
21900
21901 icode = code_for_ccmp (cc_mode, cmp_mode);
21902
21903 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21904 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21905 if (!op0 || !op1)
21906 {
21907 end_sequence ();
21908 return NULL_RTX;
21909 }
21910 *prep_seq = get_insns ();
21911 end_sequence ();
21912
21913 target = gen_rtx_REG (cc_mode, CC_REGNUM);
21914 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
21915
21916 if (bit_code != AND)
21917 {
21918 /* Treat the ccmp patterns as canonical and use them where possible,
21919 but fall back to ccmp_rev patterns if there's no other option. */
21920 rtx_code prev_code = GET_CODE (prev);
21921 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21922 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21923 && !(prev_code == EQ
21924 || prev_code == NE
21925 || prev_code == ORDERED
21926 || prev_code == UNORDERED))
21927 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21928 else
21929 {
21930 rtx_code code = reverse_condition (prev_code);
21931 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21932 }
21933 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21934 }
21935
21936 create_fixed_operand (&ops[0], XEXP (prev, 0));
21937 create_fixed_operand (&ops[1], target);
21938 create_fixed_operand (&ops[2], op0);
21939 create_fixed_operand (&ops[3], op1);
21940 create_fixed_operand (&ops[4], prev);
21941 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
21942
21943 push_to_sequence (*gen_seq);
21944 if (!maybe_expand_insn (icode, 6, ops))
21945 {
21946 end_sequence ();
21947 return NULL_RTX;
21948 }
21949
21950 *gen_seq = get_insns ();
21951 end_sequence ();
21952
21953 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21954 }
21955
21956 #undef TARGET_GEN_CCMP_FIRST
21957 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21958
21959 #undef TARGET_GEN_CCMP_NEXT
21960 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21961
21962 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21963 instruction fusion of some sort. */
21964
21965 static bool
21966 aarch64_macro_fusion_p (void)
21967 {
21968 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21969 }
21970
21971
21972 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21973 should be kept together during scheduling. */
21974
21975 static bool
21976 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21977 {
21978 rtx set_dest;
21979 rtx prev_set = single_set (prev);
21980 rtx curr_set = single_set (curr);
21981 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21982 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21983
21984 if (!aarch64_macro_fusion_p ())
21985 return false;
21986
21987 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21988 {
21989 /* We are trying to match:
21990 prev (mov) == (set (reg r0) (const_int imm16))
21991 curr (movk) == (set (zero_extract (reg r0)
21992 (const_int 16)
21993 (const_int 16))
21994 (const_int imm16_1)) */
21995
21996 set_dest = SET_DEST (curr_set);
21997
21998 if (GET_CODE (set_dest) == ZERO_EXTRACT
21999 && CONST_INT_P (SET_SRC (curr_set))
22000 && CONST_INT_P (SET_SRC (prev_set))
22001 && CONST_INT_P (XEXP (set_dest, 2))
22002 && INTVAL (XEXP (set_dest, 2)) == 16
22003 && REG_P (XEXP (set_dest, 0))
22004 && REG_P (SET_DEST (prev_set))
22005 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
22006 {
22007 return true;
22008 }
22009 }
22010
22011 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
22012 {
22013
22014 /* We're trying to match:
22015 prev (adrp) == (set (reg r1)
22016 (high (symbol_ref ("SYM"))))
22017 curr (add) == (set (reg r0)
22018 (lo_sum (reg r1)
22019 (symbol_ref ("SYM"))))
22020 Note that r0 need not necessarily be the same as r1, especially
22021 during pre-regalloc scheduling. */
22022
22023 if (satisfies_constraint_Ush (SET_SRC (prev_set))
22024 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22025 {
22026 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
22027 && REG_P (XEXP (SET_SRC (curr_set), 0))
22028 && REGNO (XEXP (SET_SRC (curr_set), 0))
22029 == REGNO (SET_DEST (prev_set))
22030 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
22031 XEXP (SET_SRC (curr_set), 1)))
22032 return true;
22033 }
22034 }
22035
22036 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
22037 {
22038
22039 /* We're trying to match:
22040 prev (movk) == (set (zero_extract (reg r0)
22041 (const_int 16)
22042 (const_int 32))
22043 (const_int imm16_1))
22044 curr (movk) == (set (zero_extract (reg r0)
22045 (const_int 16)
22046 (const_int 48))
22047 (const_int imm16_2)) */
22048
22049 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
22050 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
22051 && REG_P (XEXP (SET_DEST (prev_set), 0))
22052 && REG_P (XEXP (SET_DEST (curr_set), 0))
22053 && REGNO (XEXP (SET_DEST (prev_set), 0))
22054 == REGNO (XEXP (SET_DEST (curr_set), 0))
22055 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
22056 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
22057 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
22058 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
22059 && CONST_INT_P (SET_SRC (prev_set))
22060 && CONST_INT_P (SET_SRC (curr_set)))
22061 return true;
22062
22063 }
22064 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
22065 {
22066 /* We're trying to match:
22067 prev (adrp) == (set (reg r0)
22068 (high (symbol_ref ("SYM"))))
22069 curr (ldr) == (set (reg r1)
22070 (mem (lo_sum (reg r0)
22071 (symbol_ref ("SYM")))))
22072 or
22073 curr (ldr) == (set (reg r1)
22074 (zero_extend (mem
22075 (lo_sum (reg r0)
22076 (symbol_ref ("SYM")))))) */
22077 if (satisfies_constraint_Ush (SET_SRC (prev_set))
22078 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22079 {
22080 rtx curr_src = SET_SRC (curr_set);
22081
22082 if (GET_CODE (curr_src) == ZERO_EXTEND)
22083 curr_src = XEXP (curr_src, 0);
22084
22085 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
22086 && REG_P (XEXP (XEXP (curr_src, 0), 0))
22087 && REGNO (XEXP (XEXP (curr_src, 0), 0))
22088 == REGNO (SET_DEST (prev_set))
22089 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
22090 XEXP (SET_SRC (prev_set), 0)))
22091 return true;
22092 }
22093 }
22094
22095 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
22096 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
22097 && prev_set && curr_set && any_condjump_p (curr)
22098 && GET_CODE (SET_SRC (prev_set)) == COMPARE
22099 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
22100 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
22101 return true;
22102
22103 /* Fuse flag-setting ALU instructions and conditional branch. */
22104 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
22105 && any_condjump_p (curr))
22106 {
22107 unsigned int condreg1, condreg2;
22108 rtx cc_reg_1;
22109 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
22110 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
22111
22112 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
22113 && prev
22114 && modified_in_p (cc_reg_1, prev))
22115 {
22116 enum attr_type prev_type = get_attr_type (prev);
22117
22118 /* FIXME: this misses some instructions that ThunderX considers simple
22119 arithmetic; simple shifts are missed here. */
22120 if (prev_type == TYPE_ALUS_SREG
22121 || prev_type == TYPE_ALUS_IMM
22122 || prev_type == TYPE_LOGICS_REG
22123 || prev_type == TYPE_LOGICS_IMM)
22124 return true;
22125 }
22126 }
22127
22128 /* Fuse ALU instructions and CBZ/CBNZ. */
22129 if (prev_set
22130 && curr_set
22131 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
22132 && any_condjump_p (curr))
22133 {
22134 /* We're trying to match:
22135 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
22136 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
22137 (const_int 0))
22138 (label_ref ("SYM"))
22139 (pc)) */
22140 if (SET_DEST (curr_set) == (pc_rtx)
22141 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
22142 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
22143 && REG_P (SET_DEST (prev_set))
22144 && REGNO (SET_DEST (prev_set))
22145 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
22146 {
22147 /* Fuse ALU operations followed by a conditional branch instruction. */
22148 switch (get_attr_type (prev))
22149 {
22150 case TYPE_ALU_IMM:
22151 case TYPE_ALU_SREG:
22152 case TYPE_ADC_REG:
22153 case TYPE_ADC_IMM:
22154 case TYPE_ADCS_REG:
22155 case TYPE_ADCS_IMM:
22156 case TYPE_LOGIC_REG:
22157 case TYPE_LOGIC_IMM:
22158 case TYPE_CSEL:
22159 case TYPE_ADR:
22160 case TYPE_MOV_IMM:
22161 case TYPE_SHIFT_REG:
22162 case TYPE_SHIFT_IMM:
22163 case TYPE_BFM:
22164 case TYPE_RBIT:
22165 case TYPE_REV:
22166 case TYPE_EXTEND:
22167 return true;
22168
22169 default:;
22170 }
22171 }
22172 }
22173
22174 return false;
22175 }
22176
22177 /* Return true iff the instruction fusion described by OP is enabled. */
22178
22179 bool
22180 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
22181 {
22182 return (aarch64_tune_params.fusible_ops & op) != 0;
22183 }
22184
22185 /* If MEM is in the form [base+offset], extract the two parts of the
22186 address into BASE and OFFSET; otherwise clear BASE and OFFSET and
22187 return false. */
22188
22189 bool
22190 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
22191 {
22192 rtx addr;
22193
22194 gcc_assert (MEM_P (mem));
22195
22196 addr = XEXP (mem, 0);
22197
22198 if (REG_P (addr))
22199 {
22200 *base = addr;
22201 *offset = const0_rtx;
22202 return true;
22203 }
22204
22205 if (GET_CODE (addr) == PLUS
22206 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
22207 {
22208 *base = XEXP (addr, 0);
22209 *offset = XEXP (addr, 1);
22210 return true;
22211 }
22212
22213 *base = NULL_RTX;
22214 *offset = NULL_RTX;
22215
22216 return false;
22217 }
22218
22219 /* Types for scheduling fusion. */
22220 enum sched_fusion_type
22221 {
22222 SCHED_FUSION_NONE = 0,
22223 SCHED_FUSION_LD_SIGN_EXTEND,
22224 SCHED_FUSION_LD_ZERO_EXTEND,
22225 SCHED_FUSION_LD,
22226 SCHED_FUSION_ST,
22227 SCHED_FUSION_NUM
22228 };
22229
22230 /* If INSN is a load or store whose address is in the form [base+offset],
22231 extract the two parts into BASE and OFFSET. Return the scheduling
22232 fusion type of INSN. */
22233
22234 static enum sched_fusion_type
22235 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
22236 {
22237 rtx x, dest, src;
22238 enum sched_fusion_type fusion = SCHED_FUSION_LD;
22239
22240 gcc_assert (INSN_P (insn));
22241 x = PATTERN (insn);
22242 if (GET_CODE (x) != SET)
22243 return SCHED_FUSION_NONE;
22244
22245 src = SET_SRC (x);
22246 dest = SET_DEST (x);
22247
22248 machine_mode dest_mode = GET_MODE (dest);
22249
22250 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
22251 return SCHED_FUSION_NONE;
22252
22253 if (GET_CODE (src) == SIGN_EXTEND)
22254 {
22255 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
22256 src = XEXP (src, 0);
22257 if (!MEM_P (src) || GET_MODE (src) != SImode)
22258 return SCHED_FUSION_NONE;
22259 }
22260 else if (GET_CODE (src) == ZERO_EXTEND)
22261 {
22262 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
22263 src = XEXP (src, 0);
22264 if (!MEM_P (src) || GET_MODE (src) != SImode)
22265 return SCHED_FUSION_NONE;
22266 }
22267
22268 if (MEM_P (src) && REG_P (dest))
22269 extract_base_offset_in_addr (src, base, offset);
22270 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
22271 {
22272 fusion = SCHED_FUSION_ST;
22273 extract_base_offset_in_addr (dest, base, offset);
22274 }
22275 else
22276 return SCHED_FUSION_NONE;
22277
22278 if (*base == NULL_RTX || *offset == NULL_RTX)
22279 fusion = SCHED_FUSION_NONE;
22280
22281 return fusion;
22282 }
22283
22284 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
22285
22286 Currently we only support fusing ldr and str instructions, so FUSION_PRI
22287 and PRI are only calculated for these instructions. For other instructions,
22288 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
22289 types of instruction fusion can be added by returning different priorities.
22290
22291 It's important that irrelevant instructions get the largest FUSION_PRI. */
22292
22293 static void
22294 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
22295 int *fusion_pri, int *pri)
22296 {
22297 int tmp, off_val;
22298 rtx base, offset;
22299 enum sched_fusion_type fusion;
22300
22301 gcc_assert (INSN_P (insn));
22302
22303 tmp = max_pri - 1;
22304 fusion = fusion_load_store (insn, &base, &offset);
22305 if (fusion == SCHED_FUSION_NONE)
22306 {
22307 *pri = tmp;
22308 *fusion_pri = tmp;
22309 return;
22310 }
22311
22312 /* Set FUSION_PRI according to fusion type and base register. */
22313 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
22314
22315 /* Calculate PRI. */
22316 tmp /= 2;
22317
22318 /* The INSN with the smaller offset goes first. */
22319 off_val = (int)(INTVAL (offset));
22320 if (off_val >= 0)
22321 tmp -= (off_val & 0xfffff);
22322 else
22323 tmp += ((- off_val) & 0xfffff);
22324
22325 *pri = tmp;
22326 return;
22327 }
22328
22329 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
22330 Adjust priority of sha1h instructions so they are scheduled before
22331 other SHA1 instructions. */
22332
22333 static int
22334 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
22335 {
22336 rtx x = PATTERN (insn);
22337
22338 if (GET_CODE (x) == SET)
22339 {
22340 x = SET_SRC (x);
22341
22342 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
22343 return priority + 10;
22344 }
22345
22346 return priority;
22347 }
22348
22349 /* Given OPERANDS of consecutive load/store, check if we can merge
22350 them into ldp/stp. LOAD is true if they are load instructions.
22351 MODE is the mode of memory operands. */
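/* For example, "ldr x0, [x2]" followed by "ldr x1, [x2, 8]" can be
   merged into "ldp x0, x1, [x2]". */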
22352
22353 bool
22354 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
22355 machine_mode mode)
22356 {
22357 HOST_WIDE_INT offval_1, offval_2, msize;
22358 enum reg_class rclass_1, rclass_2;
22359 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
22360
22361 if (load)
22362 {
22363 mem_1 = operands[1];
22364 mem_2 = operands[3];
22365 reg_1 = operands[0];
22366 reg_2 = operands[2];
22367 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
22368 if (REGNO (reg_1) == REGNO (reg_2))
22369 return false;
22370 }
22371 else
22372 {
22373 mem_1 = operands[0];
22374 mem_2 = operands[2];
22375 reg_1 = operands[1];
22376 reg_2 = operands[3];
22377 }
22378
22379 /* The mems cannot be volatile. */
22380 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
22381 return false;
22382
22383 /* If we have SImode and slow unaligned ldp,
22384 check that the alignment is at least 8 bytes. */
22385 if (mode == SImode
22386 && (aarch64_tune_params.extra_tuning_flags
22387 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22388 && !optimize_size
22389 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
22390 return false;
22391
22392 /* Check if the addresses are in the form of [base+offset]. */
22393 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22394 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
22395 return false;
22396 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22397 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
22398 return false;
22399
22400 /* Check if the bases are the same. */
22401 if (!rtx_equal_p (base_1, base_2))
22402 return false;
22403
22404 /* The operands must be of the same size. */
22405 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
22406 GET_MODE_SIZE (GET_MODE (mem_2))));
22407
22408 offval_1 = INTVAL (offset_1);
22409 offval_2 = INTVAL (offset_2);
22410 /* We should only be trying this for fixed-sized modes. There is no
22411 SVE LDP/STP instruction. */
22412 msize = GET_MODE_SIZE (mode).to_constant ();
22413 /* Check if the offsets are consecutive. */
22414 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
22415 return false;
22416
22417 /* Check if the addresses are clobbered by load. */
22418 if (load)
22419 {
22420 if (reg_mentioned_p (reg_1, mem_1))
22421 return false;
22422
22423 /* In increasing order, the last load can clobber the address. */
22424 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
22425 return false;
22426 }
22427
22428 /* One of the memory accesses must be a mempair operand.
22429 If it is not the first one, they need to be swapped by the
22430 peephole. */
22431 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
22432 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
22433 return false;
22434
22435 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
22436 rclass_1 = FP_REGS;
22437 else
22438 rclass_1 = GENERAL_REGS;
22439
22440 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
22441 rclass_2 = FP_REGS;
22442 else
22443 rclass_2 = GENERAL_REGS;
22444
22445 /* Check if the registers are of the same class. */
22446 if (rclass_1 != rclass_2)
22447 return false;
22448
22449 return true;
22450 }
22451
22452 /* Given OPERANDS of consecutive load/store that can be merged,
22453 swap them if they are not in ascending order. */
22454 void
22455 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
22456 {
22457 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
22458 HOST_WIDE_INT offval_1, offval_2;
22459
22460 if (load)
22461 {
22462 mem_1 = operands[1];
22463 mem_2 = operands[3];
22464 }
22465 else
22466 {
22467 mem_1 = operands[0];
22468 mem_2 = operands[2];
22469 }
22470
22471 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22472 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22473
22474 offval_1 = INTVAL (offset_1);
22475 offval_2 = INTVAL (offset_2);
22476
22477 if (offval_1 > offval_2)
22478 {
22479 /* Irrespective of whether this is a load or a store,
22480 we do the same swap. */
22481 std::swap (operands[0], operands[2]);
22482 std::swap (operands[1], operands[3]);
22483 }
22484 }
22485
22486 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22487 comparison between the two. */
22488 int
22489 aarch64_host_wide_int_compare (const void *x, const void *y)
22490 {
22491 return wi::cmps (* ((const HOST_WIDE_INT *) x),
22492 * ((const HOST_WIDE_INT *) y));
22493 }
22494
22495 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
22496 other pointing to a REG rtx containing an offset, compare the offsets
22497 of the two pairs.
22498
22499 Return:
22500
22501 1 iff offset (X) > offset (Y)
22502 0 iff offset (X) == offset (Y)
22503 -1 iff offset (X) < offset (Y) */
22504 int
22505 aarch64_ldrstr_offset_compare (const void *x, const void *y)
22506 {
22507 const rtx * operands_1 = (const rtx *) x;
22508 const rtx * operands_2 = (const rtx *) y;
22509 rtx mem_1, mem_2, base, offset_1, offset_2;
22510
22511 if (MEM_P (operands_1[0]))
22512 mem_1 = operands_1[0];
22513 else
22514 mem_1 = operands_1[1];
22515
22516 if (MEM_P (operands_2[0]))
22517 mem_2 = operands_2[0];
22518 else
22519 mem_2 = operands_2[1];
22520
22521 /* Extract the offsets. */
22522 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22523 extract_base_offset_in_addr (mem_2, &base, &offset_2);
22524
22525 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22526
22527 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22528 }
22529
22530 /* Given OPERANDS of consecutive load/store, check if we can merge
22531 them into ldp/stp by adjusting the offset. LOAD is true if they
22532 are load instructions. MODE is the mode of memory operands.
22533
22534 Given the following consecutive stores:
22535
22536 str w1, [xb, 0x100]
22537 str w1, [xb, 0x104]
22538 str w1, [xb, 0x108]
22539 str w1, [xb, 0x10c]
22540
22541 Though the offsets are out of the range supported by stp, we can
22542 still pair them after adjusting the offset, like:
22543
22544 add scratch, xb, 0x100
22545 stp w1, w1, [scratch]
22546 stp w1, w1, [scratch, 0x8]
22547
22548 The peephole patterns detecting this opportunity should guarantee
22549 the scratch register is available. */
22550
22551 bool
22552 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
22553 machine_mode mode)
22554 {
22555 const int num_insns = 4;
22556 enum reg_class rclass;
22557 HOST_WIDE_INT offvals[num_insns], msize;
22558 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
22559
22560 if (load)
22561 {
22562 for (int i = 0; i < num_insns; i++)
22563 {
22564 reg[i] = operands[2 * i];
22565 mem[i] = operands[2 * i + 1];
22566
22567 gcc_assert (REG_P (reg[i]));
22568 }
22569
22570 /* Do not attempt to merge the loads if the loads clobber each other. */
22571 for (int i = 0; i < 8; i += 2)
22572 for (int j = i + 2; j < 8; j += 2)
22573 if (reg_overlap_mentioned_p (operands[i], operands[j]))
22574 return false;
22575 }
22576 else
22577 for (int i = 0; i < num_insns; i++)
22578 {
22579 mem[i] = operands[2 * i];
22580 reg[i] = operands[2 * i + 1];
22581 }
22582
22583 /* Skip if the memory operand is by itself valid for ldp/stp. */
22584 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
22585 return false;
22586
22587 for (int i = 0; i < num_insns; i++)
22588 {
22589 /* The mems cannot be volatile. */
22590 if (MEM_VOLATILE_P (mem[i]))
22591 return false;
22592
22593 /* Check if the addresses are in the form of [base+offset]. */
22594 extract_base_offset_in_addr (mem[i], base + i, offset + i);
22595 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22596 return false;
22597 }
22598
22599 /* Check if the registers are of the same class. */
22600 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22601 ? FP_REGS : GENERAL_REGS;
22602
22603 for (int i = 1; i < num_insns; i++)
22604 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22605 {
22606 if (rclass != FP_REGS)
22607 return false;
22608 }
22609 else
22610 {
22611 if (rclass != GENERAL_REGS)
22612 return false;
22613 }
22614
22615 /* Only the last register in the order in which they occur
22616 may be clobbered by the load. */
22617 if (rclass == GENERAL_REGS && load)
22618 for (int i = 0; i < num_insns - 1; i++)
22619 if (reg_mentioned_p (reg[i], mem[i]))
22620 return false;
22621
22622 /* Check if the bases are the same. */
22623 for (int i = 0; i < num_insns - 1; i++)
22624 if (!rtx_equal_p (base[i], base[i + 1]))
22625 return false;
22626
22627 for (int i = 0; i < num_insns; i++)
22628 offvals[i] = INTVAL (offset[i]);
22629
22630 msize = GET_MODE_SIZE (mode).to_constant ();
22631
22632 /* Check if the offsets can be put in the right order to do a ldp/stp. */
22633 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22634 aarch64_host_wide_int_compare);
22635
22636 if (!(offvals[1] == offvals[0] + msize
22637 && offvals[3] == offvals[2] + msize))
22638 return false;
22639
22640 /* Check that the offsets are within range of each other. The ldp/stp
22641 instructions have 7-bit immediate offsets, so use 0x80. */
22642 if (offvals[2] - offvals[0] >= msize * 0x80)
22643 return false;
22644
22645 /* The offsets must be aligned with respect to each other. */
22646 if (offvals[0] % msize != offvals[2] % msize)
22647 return false;
22648
22649 /* If we have SImode and slow unaligned ldp,
22650 check that the alignment is at least 8 bytes. */
22651 if (mode == SImode
22652 && (aarch64_tune_params.extra_tuning_flags
22653 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22654 && !optimize_size
22655 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
22656 return false;
22657
22658 return true;
22659 }
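/* As a worked example of the checks above (a sketch, assuming SImode, so
   msize == 4): take the four stores from the comment before this function,
   at offsets 0x100, 0x104, 0x108 and 0x10c from the same base.  After
   sorting, offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
   so each half can form a pair; the range check passes because
   0x108 - 0x100 == 8 < 4 * 0x80; and the offsets are mutually aligned
   because 0x100 % 4 == 0x108 % 4.  Provided the remaining conditions hold
   (non-volatile MEMs, equal bases, matching register classes, no clobbered
   address registers), the function returns true and the peephole may try
   the transformation.  */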
22660
22661 /* Given OPERANDS of consecutive load/store, this function pairs them
22662 into LDP/STP after adjusting the offset. It depends on the fact
22663 that the operands can be sorted so the offsets are correct for STP.
22664 MODE is the mode of memory operands. CODE is the rtl operator
22665 which should be applied to all memory operands; it is SIGN_EXTEND,
22666 ZERO_EXTEND or UNKNOWN. */
22667
22668 bool
22669 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
22670 machine_mode mode, RTX_CODE code)
22671 {
22672 rtx base, offset_1, offset_3, t1, t2;
22673 rtx mem_1, mem_2, mem_3, mem_4;
22674 rtx temp_operands[8];
22675 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22676 stp_off_upper_limit, stp_off_lower_limit, msize;
22677
22678 /* We make changes on a copy as we may still bail out. */
22679 for (int i = 0; i < 8; i ++)
22680 temp_operands[i] = operands[i];
22681
22682 /* Sort the operands. */
22683 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
22684
22685 /* Copy the memory operands so that if we have to bail for some
22686 reason the original addresses are unchanged. */
22687 if (load)
22688 {
22689 mem_1 = copy_rtx (temp_operands[1]);
22690 mem_2 = copy_rtx (temp_operands[3]);
22691 mem_3 = copy_rtx (temp_operands[5]);
22692 mem_4 = copy_rtx (temp_operands[7]);
22693 }
22694 else
22695 {
22696 mem_1 = copy_rtx (temp_operands[0]);
22697 mem_2 = copy_rtx (temp_operands[2]);
22698 mem_3 = copy_rtx (temp_operands[4]);
22699 mem_4 = copy_rtx (temp_operands[6]);
22700 gcc_assert (code == UNKNOWN);
22701 }
22702
22703 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22704 extract_base_offset_in_addr (mem_3, &base, &offset_3);
22705 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22706 && offset_3 != NULL_RTX);
22707
22708 /* Adjust offset so it can fit in LDP/STP instruction. */
22709 msize = GET_MODE_SIZE (mode).to_constant ();
22710 stp_off_upper_limit = msize * (0x40 - 1);
22711 stp_off_lower_limit = - msize * 0x40;
22712
22713 off_val_1 = INTVAL (offset_1);
22714 off_val_3 = INTVAL (offset_3);
22715
22716 /* The base offset is optimally halfway between the two STP/LDP offsets. */
22717 if (msize <= 4)
22718 base_off = (off_val_1 + off_val_3) / 2;
22719 else
22720 /* However, due to issues with negative LDP/STP offset generation for
22721 larger modes (DF, DI and vector modes), we must not use negative
22722 addresses smaller than 9 signed unadjusted bits can store. This
22723 provides the most range in this case. */
22724 base_off = off_val_1;
22725
22726 /* Adjust the base so that it is aligned with the addresses but still
22727 optimal. */
22728 if (base_off % msize != off_val_1 % msize)
22729 /* Fix the offset, bearing in mind we want to make it bigger not
22730 smaller. */
22731 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22732 else if (msize <= 4)
22733 /* The negative range of LDP/STP is one larger than the positive range. */
22734 base_off += msize;
22735
22736 /* Check if base offset is too big or too small. We can attempt to resolve
22737 this issue by setting it to the maximum value and seeing if the offsets
22738 still fit. */
22739 if (base_off >= 0x1000)
22740 {
22741 base_off = 0x1000 - 1;
22742 /* We must still make sure that the base offset is aligned with respect
22743 to the address. But it may not be made any bigger. */
22744 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22745 }
22746
22747 /* Likewise for the case where the base is too small. */
22748 if (base_off <= -0x1000)
22749 {
22750 base_off = -0x1000 + 1;
22751 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22752 }
22753
22754 /* Offset of the first STP/LDP. */
22755 new_off_1 = off_val_1 - base_off;
22756
22757 /* Offset of the second STP/LDP. */
22758 new_off_3 = off_val_3 - base_off;
22759
22760 /* The offsets must be within the range of the LDP/STP instructions. */
22761 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22762 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
22763 return false;
22764
22765 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22766 new_off_1), true);
22767 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22768 new_off_1 + msize), true);
22769 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22770 new_off_3), true);
22771 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22772 new_off_3 + msize), true);
22773
22774 if (!aarch64_mem_pair_operand (mem_1, mode)
22775 || !aarch64_mem_pair_operand (mem_3, mode))
22776 return false;
22777
22778 if (code == ZERO_EXTEND)
22779 {
22780 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22781 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22782 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22783 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22784 }
22785 else if (code == SIGN_EXTEND)
22786 {
22787 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22788 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22789 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22790 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22791 }
22792
22793 if (load)
22794 {
22795 operands[0] = temp_operands[0];
22796 operands[1] = mem_1;
22797 operands[2] = temp_operands[2];
22798 operands[3] = mem_2;
22799 operands[4] = temp_operands[4];
22800 operands[5] = mem_3;
22801 operands[6] = temp_operands[6];
22802 operands[7] = mem_4;
22803 }
22804 else
22805 {
22806 operands[0] = mem_1;
22807 operands[1] = temp_operands[1];
22808 operands[2] = mem_2;
22809 operands[3] = temp_operands[3];
22810 operands[4] = mem_3;
22811 operands[5] = temp_operands[5];
22812 operands[6] = mem_4;
22813 operands[7] = temp_operands[7];
22814 }
22815
22816 /* Emit adjusting instruction. */
22817 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
22818 /* Emit ldp/stp instructions. */
22819 t1 = gen_rtx_SET (operands[0], operands[1]);
22820 t2 = gen_rtx_SET (operands[2], operands[3]);
22821 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22822 t1 = gen_rtx_SET (operands[4], operands[5]);
22823 t2 = gen_rtx_SET (operands[6], operands[7]);
22824 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22825 return true;
22826 }
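/* Continuing the SImode sketch above: the sorted offsets 0x100 ... 0x10c
   give off_val_1 == 0x100 and off_val_3 == 0x108.  Since msize == 4,
   base_off starts as (0x100 + 0x108) / 2 == 0x104; it is already aligned
   with off_val_1, so the "negative range is one larger" adjustment bumps
   it to 0x108.  The new offsets new_off_1 == -8 and new_off_3 == 0 both
   lie within [-0x100, 0xfc], so the emitted sequence is roughly:

     add  scratch, base, #0x108
     stp  w, w, [scratch, #-8]
     stp  w, w, [scratch]  */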
22827
22828 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
22829 it isn't worth branching around empty masked ops (including masked
22830 stores). */
22831
22832 static bool
22833 aarch64_empty_mask_is_expensive (unsigned)
22834 {
22835 return false;
22836 }
22837
22838 /* Return 1 if pseudo register should be created and used to hold
22839 GOT address for PIC code. */
22840
22841 bool
22842 aarch64_use_pseudo_pic_reg (void)
22843 {
22844 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22845 }
22846
22847 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
22848
22849 static int
22850 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22851 {
22852 switch (XINT (x, 1))
22853 {
22854 case UNSPEC_GOTSMALLPIC:
22855 case UNSPEC_GOTSMALLPIC28K:
22856 case UNSPEC_GOTTINYPIC:
22857 return 0;
22858 default:
22859 break;
22860 }
22861
22862 return default_unspec_may_trap_p (x, flags);
22863 }
22864
22865
22866 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
22867 return the log2 of that value. Otherwise return -1. */
22868
22869 int
22870 aarch64_fpconst_pow_of_2 (rtx x)
22871 {
22872 const REAL_VALUE_TYPE *r;
22873
22874 if (!CONST_DOUBLE_P (x))
22875 return -1;
22876
22877 r = CONST_DOUBLE_REAL_VALUE (x);
22878
22879 if (REAL_VALUE_NEGATIVE (*r)
22880 || REAL_VALUE_ISNAN (*r)
22881 || REAL_VALUE_ISINF (*r)
22882 || !real_isinteger (r, DFmode))
22883 return -1;
22884
22885 return exact_log2 (real_to_integer (r));
22886 }
22887
22888 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
22889 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x == 1/2^n
22890 return n. Otherwise return -1. */
22891
22892 int
22893 aarch64_fpconst_pow2_recip (rtx x)
22894 {
22895 REAL_VALUE_TYPE r0;
22896
22897 if (!CONST_DOUBLE_P (x))
22898 return -1;
22899
22900 r0 = *CONST_DOUBLE_REAL_VALUE (x);
22901 if (exact_real_inverse (DFmode, &r0)
22902 && !REAL_VALUE_NEGATIVE (r0))
22903 {
22904 int ret = exact_log2 (real_to_integer (&r0));
22905 if (ret >= 1 && ret <= 32)
22906 return ret;
22907 }
22908 return -1;
22909 }
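/* For example, aarch64_fpconst_pow_of_2 returns 3 for a CONST_DOUBLE of
   8.0 but -1 for 0.5 or -4.0, while aarch64_fpconst_pow2_recip returns 3
   for 0.125 (== 1/2^3) but -1 for e.g. 0.75 or 2.0.  */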
22910
22911 /* If X is a vector of equal CONST_DOUBLE values and that value is
22912 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
22913
22914 int
22915 aarch64_vec_fpconst_pow_of_2 (rtx x)
22916 {
22917 int nelts;
22918 if (GET_CODE (x) != CONST_VECTOR
22919 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
22920 return -1;
22921
22922 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22923 return -1;
22924
22925 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22926 if (firstval <= 0)
22927 return -1;
22928
22929 for (int i = 1; i < nelts; i++)
22930 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22931 return -1;
22932
22933 return firstval;
22934 }
22935
22936 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22937 to float.
22938
22939 __fp16 always promotes through this hook.
22940 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22941 through the generic excess precision logic rather than here. */
22942
22943 static tree
22944 aarch64_promoted_type (const_tree t)
22945 {
22946 if (SCALAR_FLOAT_TYPE_P (t)
22947 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
22948 return float_type_node;
22949
22950 return NULL_TREE;
22951 }
22952
22953 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22954
22955 static bool
22956 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22957 optimization_type opt_type)
22958 {
22959 switch (op)
22960 {
22961 case rsqrt_optab:
22962 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22963
22964 default:
22965 return true;
22966 }
22967 }
22968
22969 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22970
22971 static unsigned int
22972 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22973 int *offset)
22974 {
22975 /* Polynomial invariant 1 == (VG / 2) - 1. */
22976 gcc_assert (i == 1);
22977 *factor = 2;
22978 *offset = 1;
22979 return AARCH64_DWARF_VG;
22980 }
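/* For instance, on a 256-bit SVE implementation the VG pseudo register
   reads 4 (the vector length in 64-bit granules), so the indeterminate
   evaluates to 4 / 2 - 1 == 1 and a poly_int size of 16 + 16x bytes
   unwinds to 32 bytes, as expected.  */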
22981
22982 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22983 if MODE is HFmode, and punt to the generic implementation otherwise. */
22984
22985 static bool
22986 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22987 {
22988 return (mode == HFmode
22989 ? true
22990 : default_libgcc_floating_mode_supported_p (mode));
22991 }
22992
22993 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22994 if MODE is HFmode, and punt to the generic implementation otherwise. */
22995
22996 static bool
22997 aarch64_scalar_mode_supported_p (scalar_mode mode)
22998 {
22999 return (mode == HFmode
23000 ? true
23001 : default_scalar_mode_supported_p (mode));
23002 }
23003
23004 /* Set the value of FLT_EVAL_METHOD.
23005 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
23006
23007 0: evaluate all operations and constants, whose semantic type has at
23008 most the range and precision of type float, to the range and
23009 precision of float; evaluate all other operations and constants to
23010 the range and precision of the semantic type;
23011
23012 N, where _FloatN is a supported interchange floating type
23013 evaluate all operations and constants, whose semantic type has at
23014 most the range and precision of _FloatN type, to the range and
23015 precision of the _FloatN type; evaluate all other operations and
23016 constants to the range and precision of the semantic type;
23017
23018 If we have the ARMv8.2-A extensions then we support _Float16 in native
23019 precision, so we should set this to 16. Otherwise, we support the type,
23020 but want to evaluate expressions in float precision, so set this to
23021 0. */
23022
23023 static enum flt_eval_method
23024 aarch64_excess_precision (enum excess_precision_type type)
23025 {
23026 switch (type)
23027 {
23028 case EXCESS_PRECISION_TYPE_FAST:
23029 case EXCESS_PRECISION_TYPE_STANDARD:
23030 /* We can calculate either in 16-bit range and precision or
23031 32-bit range and precision. Make that decision based on whether
23032 we have native support for the ARMv8.2-A 16-bit floating-point
23033 instructions or not. */
23034 return (TARGET_FP_F16INST
23035 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
23036 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
23037 case EXCESS_PRECISION_TYPE_IMPLICIT:
23038 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
23039 default:
23040 gcc_unreachable ();
23041 }
23042 return FLT_EVAL_METHOD_UNPREDICTABLE;
23043 }
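/* As an illustration (a sketch, not tied to a particular CPU): given

     _Float16 a, b, c;
     float r = a * b + c;

   the intermediate result is computed in _Float16 when TARGET_FP_F16INST
   is available (FLT_EVAL_METHOD == 16), but is promoted to and computed
   in float otherwise (FLT_EVAL_METHOD == 0).  */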
23044
23045 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
23046 scheduled for speculative execution. Reject the long-running division
23047 and square-root instructions. */
23048
23049 static bool
23050 aarch64_sched_can_speculate_insn (rtx_insn *insn)
23051 {
23052 switch (get_attr_type (insn))
23053 {
23054 case TYPE_SDIV:
23055 case TYPE_UDIV:
23056 case TYPE_FDIVS:
23057 case TYPE_FDIVD:
23058 case TYPE_FSQRTS:
23059 case TYPE_FSQRTD:
23060 case TYPE_NEON_FP_SQRT_S:
23061 case TYPE_NEON_FP_SQRT_D:
23062 case TYPE_NEON_FP_SQRT_S_Q:
23063 case TYPE_NEON_FP_SQRT_D_Q:
23064 case TYPE_NEON_FP_DIV_S:
23065 case TYPE_NEON_FP_DIV_D:
23066 case TYPE_NEON_FP_DIV_S_Q:
23067 case TYPE_NEON_FP_DIV_D_Q:
23068 return false;
23069 default:
23070 return true;
23071 }
23072 }
23073
23074 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
23075
23076 static int
23077 aarch64_compute_pressure_classes (reg_class *classes)
23078 {
23079 int i = 0;
23080 classes[i++] = GENERAL_REGS;
23081 classes[i++] = FP_REGS;
23082 /* PR_REGS isn't a useful pressure class because many predicate pseudo
23083 registers need to go in PR_LO_REGS at some point during their
23084 lifetime. Splitting it into two halves has the effect of making
23085 all predicates count against PR_LO_REGS, so that we try whenever
23086 possible to restrict the number of live predicates to 8. This
23087 greatly reduces the amount of spilling in certain loops. */
23088 classes[i++] = PR_LO_REGS;
23089 classes[i++] = PR_HI_REGS;
23090 return i;
23091 }
23092
23093 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
23094
23095 static bool
23096 aarch64_can_change_mode_class (machine_mode from,
23097 machine_mode to, reg_class_t)
23098 {
23099 unsigned int from_flags = aarch64_classify_vector_mode (from);
23100 unsigned int to_flags = aarch64_classify_vector_mode (to);
23101
23102 bool from_sve_p = (from_flags & VEC_ANY_SVE);
23103 bool to_sve_p = (to_flags & VEC_ANY_SVE);
23104
23105 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
23106 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
23107
23108 bool from_pred_p = (from_flags & VEC_SVE_PRED);
23109 bool to_pred_p = (to_flags & VEC_SVE_PRED);
23110
23111 /* Don't allow changes between predicate modes and other modes.
23112 Only predicate registers can hold predicate modes and only
23113 non-predicate registers can hold non-predicate modes, so any
23114 attempt to mix them would require a round trip through memory. */
23115 if (from_pred_p != to_pred_p)
23116 return false;
23117
23118 /* Don't allow changes between partial SVE modes and other modes.
23119 The contents of partial SVE modes are distributed evenly across
23120 the register, whereas GCC expects them to be clustered together. */
23121 if (from_partial_sve_p != to_partial_sve_p)
23122 return false;
23123
23124 /* Similarly reject changes between partial SVE modes that have
23125 different patterns of significant and insignificant bits. */
23126 if (from_partial_sve_p
23127 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
23128 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
23129 return false;
23130
23131 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
23132 {
23133 /* Don't allow changes between SVE modes and other modes that might
23134 be bigger than 128 bits. In particular, OImode, CImode and XImode
23135 divide into 128-bit quantities while SVE modes divide into
23136 BITS_PER_SVE_VECTOR quantities. */
23137 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
23138 return false;
23139 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
23140 return false;
23141 }
23142
23143 if (BYTES_BIG_ENDIAN)
23144 {
23145 /* Don't allow changes between SVE data modes and non-SVE modes.
23146 See the comment at the head of aarch64-sve.md for details. */
23147 if (from_sve_p != to_sve_p)
23148 return false;
23149
23150 /* Don't allow changes in element size: lane 0 of the new vector
23151 would not then be lane 0 of the old vector. See the comment
23152 above aarch64_maybe_expand_sve_subreg_move for a more detailed
23153 description.
23154
23155 In the worst case, this forces a register to be spilled in
23156 one mode and reloaded in the other, which handles the
23157 endianness correctly. */
23158 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
23159 return false;
23160 }
23161 return true;
23162 }
23163
23164 /* Implement TARGET_EARLY_REMAT_MODES. */
23165
23166 static void
23167 aarch64_select_early_remat_modes (sbitmap modes)
23168 {
23169 /* SVE values are not normally live across a call, so it should be
23170 worth doing early rematerialization even in VL-specific mode. */
23171 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
23172 if (aarch64_sve_mode_p ((machine_mode) i))
23173 bitmap_set_bit (modes, i);
23174 }
23175
23176 /* Override the default target speculation_safe_value. */
23177 static rtx
23178 aarch64_speculation_safe_value (machine_mode mode,
23179 rtx result, rtx val, rtx failval)
23180 {
23181 /* Maybe we should warn if falling back to hard barriers. They are
23182 likely to be noticeably more expensive than the alternative below. */
23183 if (!aarch64_track_speculation)
23184 return default_speculation_safe_value (mode, result, val, failval);
23185
23186 if (!REG_P (val))
23187 val = copy_to_mode_reg (mode, val);
23188
23189 if (!aarch64_reg_or_zero (failval, mode))
23190 failval = copy_to_mode_reg (mode, failval);
23191
23192 emit_insn (gen_despeculate_copy (mode, result, val, failval));
23193 return result;
23194 }
23195
23196 /* Implement TARGET_ESTIMATED_POLY_VALUE.
23197 Look into the tuning structure for an estimate.
23198 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
23199 Advanced SIMD 128 bits. */
23200
23201 static HOST_WIDE_INT
23202 aarch64_estimated_poly_value (poly_int64 val)
23203 {
23204 enum aarch64_sve_vector_bits_enum width_source
23205 = aarch64_tune_params.sve_width;
23206
23207 /* If we still don't have an estimate, use the default. */
23208 if (width_source == SVE_SCALABLE)
23209 return default_estimated_poly_value (val);
23210
23211 HOST_WIDE_INT over_128 = width_source - 128;
23212 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
23213 }
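/* For example, if the tuning structure sets sve_width to SVE_256, a
   poly_int64 of 16 + 16x (the number of bytes in an SVE vector) is
   estimated as 16 + 16 * (256 - 128) / 128 == 32 bytes, i.e. one extra
   VQ chunk over the initial 128 bits.  */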
23214
23215
23216 /* Return true for types that could be supported as SIMD return or
23217 argument types. */
23218
23219 static bool
23220 supported_simd_type (tree t)
23221 {
23222 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
23223 {
23224 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
23225 return s == 1 || s == 2 || s == 4 || s == 8;
23226 }
23227 return false;
23228 }
23229
23230 /* Return true for types that currently are supported as SIMD return
23231 or argument types. */
23232
23233 static bool
23234 currently_supported_simd_type (tree t, tree b)
23235 {
23236 if (COMPLEX_FLOAT_TYPE_P (t))
23237 return false;
23238
23239 if (TYPE_SIZE (t) != TYPE_SIZE (b))
23240 return false;
23241
23242 return supported_simd_type (t);
23243 }
23244
23245 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
23246
23247 static int
23248 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
23249 struct cgraph_simd_clone *clonei,
23250 tree base_type, int num)
23251 {
23252 tree t, ret_type, arg_type;
23253 unsigned int elt_bits, count;
23254 unsigned HOST_WIDE_INT const_simdlen;
23255 poly_uint64 vec_bits;
23256
23257 if (!TARGET_SIMD)
23258 return 0;
23259
23260 /* For now, SVE simd clones won't produce an illegal simdlen, so only check
23261 const simdlens here. */
23262 if (maybe_ne (clonei->simdlen, 0U)
23263 && clonei->simdlen.is_constant (&const_simdlen)
23264 && (const_simdlen < 2
23265 || const_simdlen > 1024
23266 || (const_simdlen & (const_simdlen - 1)) != 0))
23267 {
23268 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23269 "unsupported simdlen %wd", const_simdlen);
23270 return 0;
23271 }
23272
23273 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
23274 if (TREE_CODE (ret_type) != VOID_TYPE
23275 && !currently_supported_simd_type (ret_type, base_type))
23276 {
23277 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
23278 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23279 "GCC does not currently support mixed size types "
23280 "for %<simd%> functions");
23281 else if (supported_simd_type (ret_type))
23282 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23283 "GCC does not currently support return type %qT "
23284 "for %<simd%> functions", ret_type);
23285 else
23286 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23287 "unsupported return type %qT for %<simd%> functions",
23288 ret_type);
23289 return 0;
23290 }
23291
23292 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
23293 {
23294 arg_type = TREE_TYPE (t);
23295
23296 if (!currently_supported_simd_type (arg_type, base_type))
23297 {
23298 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
23299 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23300 "GCC does not currently support mixed size types "
23301 "for %<simd%> functions");
23302 else
23303 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23304 "GCC does not currently support argument type %qT "
23305 "for %<simd%> functions", arg_type);
23306 return 0;
23307 }
23308 }
23309
23310 clonei->vecsize_mangle = 'n';
23311 clonei->mask_mode = VOIDmode;
23312 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
23313 if (known_eq (clonei->simdlen, 0U))
23314 {
23315 count = 2;
23316 vec_bits = (num == 0 ? 64 : 128);
23317 clonei->simdlen = exact_div (vec_bits, elt_bits);
23318 }
23319 else
23320 {
23321 count = 1;
23322 vec_bits = clonei->simdlen * elt_bits;
23323 /* For now, SVE simd clones won't produce an illegal simdlen, so only check
23324 const simdlens here. */
23325 if (clonei->simdlen.is_constant (&const_simdlen)
23326 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
23327 {
23328 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23329 "GCC does not currently support simdlen %wd for type %qT",
23330 const_simdlen, base_type);
23331 return 0;
23332 }
23333 }
23334 clonei->vecsize_int = vec_bits;
23335 clonei->vecsize_float = vec_bits;
23336 return count;
23337 }
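/* A sketch of the behaviour above: for

     #pragma omp declare simd
     float f (float x);

   base_type is float, so elt_bits == 32.  With no explicit simdlen the
   function returns a count of 2: one Advanced SIMD clone using 64-bit
   vectors (simdlen 2) and one using 128-bit vectors (simdlen 4), both
   with the 'n' mangling letter.  An explicit simdlen(8) would be
   rejected here, since 8 * 32 bits is neither 64 nor 128.  */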
23338
23339 /* Implement TARGET_SIMD_CLONE_ADJUST. */
23340
23341 static void
23342 aarch64_simd_clone_adjust (struct cgraph_node *node)
23343 {
23344 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
23345 use the correct ABI. */
23346
23347 tree t = TREE_TYPE (node->decl);
23348 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
23349 TYPE_ATTRIBUTES (t));
23350 }
23351
23352 /* Implement TARGET_SIMD_CLONE_USABLE. */
23353
23354 static int
23355 aarch64_simd_clone_usable (struct cgraph_node *node)
23356 {
23357 switch (node->simdclone->vecsize_mangle)
23358 {
23359 case 'n':
23360 if (!TARGET_SIMD)
23361 return -1;
23362 return 0;
23363 default:
23364 gcc_unreachable ();
23365 }
23366 }
23367
23368 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
23369
23370 static int
23371 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
23372 {
23373 auto check_attr = [&](const char *name) {
23374 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
23375 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
23376 if (!attr1 && !attr2)
23377 return true;
23378
23379 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
23380 };
23381
23382 if (!check_attr ("aarch64_vector_pcs"))
23383 return 0;
23384 if (!check_attr ("Advanced SIMD type"))
23385 return 0;
23386 return 1;
23387 }
23388
23389 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
23390
23391 static const char *
23392 aarch64_get_multilib_abi_name (void)
23393 {
23394 if (TARGET_BIG_END)
23395 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
23396 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
23397 }
23398
23399 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
23400 global variable based guard, use the default; otherwise
23401 return a null tree. */
23402 static tree
23403 aarch64_stack_protect_guard (void)
23404 {
23405 if (aarch64_stack_protector_guard == SSP_GLOBAL)
23406 return default_stack_protect_guard ();
23407
23408 return NULL_TREE;
23409 }
23410
23411 /* Return the diagnostic message string if conversion from FROMTYPE to
23412 TOTYPE is not allowed, NULL otherwise. */
23413
23414 static const char *
23415 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
23416 {
23417 if (element_mode (fromtype) != element_mode (totype))
23418 {
23419 /* Do not allow conversions to/from BFmode scalar types. */
23420 if (TYPE_MODE (fromtype) == BFmode)
23421 return N_("invalid conversion from type %<bfloat16_t%>");
23422 if (TYPE_MODE (totype) == BFmode)
23423 return N_("invalid conversion to type %<bfloat16_t%>");
23424 }
23425
23426 /* Conversion allowed. */
23427 return NULL;
23428 }
23429
23430 /* Return the diagnostic message string if the unary operation OP is
23431 not permitted on TYPE, NULL otherwise. */
23432
23433 static const char *
23434 aarch64_invalid_unary_op (int op, const_tree type)
23435 {
23436 /* Reject all single-operand operations on BFmode except for &. */
23437 if (element_mode (type) == BFmode && op != ADDR_EXPR)
23438 return N_("operation not permitted on type %<bfloat16_t%>");
23439
23440 /* Operation allowed. */
23441 return NULL;
23442 }
23443
23444 /* Return the diagnostic message string if the binary operation OP is
23445 not permitted on TYPE1 and TYPE2, NULL otherwise. */
23446
23447 static const char *
23448 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
23449 const_tree type2)
23450 {
23451 /* Reject all 2-operand operations on BFmode. */
23452 if (element_mode (type1) == BFmode
23453 || element_mode (type2) == BFmode)
23454 return N_("operation not permitted on type %<bfloat16_t%>");
23455
23456 if (VECTOR_TYPE_P (type1)
23457 && VECTOR_TYPE_P (type2)
23458 && !TYPE_INDIVISIBLE_P (type1)
23459 && !TYPE_INDIVISIBLE_P (type2)
23460 && (aarch64_sve::builtin_type_p (type1)
23461 != aarch64_sve::builtin_type_p (type2)))
23462 return N_("cannot combine GNU and SVE vectors in a binary operation");
23463
23464 /* Operation allowed. */
23465 return NULL;
23466 }
23467
23468 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
23469 compiler that we automatically ignore the top byte of our pointers, which
23470 allows using -fsanitize=hwaddress. */
23471 bool
23472 aarch64_can_tag_addresses ()
23473 {
23474 return !TARGET_ILP32;
23475 }
23476
23477 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
23478 section at the end if needed. */
23479 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
23480 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
23481 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
23482 void
23483 aarch64_file_end_indicate_exec_stack ()
23484 {
23485 file_end_indicate_exec_stack ();
23486
23487 unsigned feature_1_and = 0;
23488 if (aarch64_bti_enabled ())
23489 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23490
23491 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23492 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23493
23494 if (feature_1_and)
23495 {
23496 /* Generate .note.gnu.property section. */
23497 switch_to_section (get_section (".note.gnu.property",
23498 SECTION_NOTYPE, NULL));
23499
23500 /* PT_NOTE header: namesz, descsz, type.
23501 namesz = 4 ("GNU\0")
23502 descsz = 16 (Size of the program property array)
23503 [(12 + padding) * Number of array elements]
23504 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
23505 assemble_align (POINTER_SIZE);
23506 assemble_integer (GEN_INT (4), 4, 32, 1);
23507 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23508 assemble_integer (GEN_INT (5), 4, 32, 1);
23509
23510 /* PT_NOTE name. */
23511 assemble_string ("GNU", 4);
23512
23513 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23514 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23515 datasz = 4
23516 data = feature_1_and. */
23517 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23518 assemble_integer (GEN_INT (4), 4, 32, 1);
23519 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23520
23521 /* Pad the size of the note to the required alignment. */
23522 assemble_align (POINTER_SIZE);
23523 }
23524 }
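/* On an LP64 target with both BTI and PAC enabled, the note emitted
   above corresponds roughly to the following (the exact directives
   depend on the assembler):

     .section .note.gnu.property
     .align  3
     .word   4               // namesz
     .word   16              // descsz == ROUND_UP (12, 8)
     .word   5               // NT_GNU_PROPERTY_TYPE_0
     .string "GNU"
     .word   0xc0000000      // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word   4               // datasz
     .word   3               // BTI | PAC
     .align  3  */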
23525 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23526 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23527 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
23528
23529 /* Helper function for straight line speculation.
23530 Return what barrier should be emitted for straight line speculation
23531 mitigation.
23532 When not mitigating against straight line speculation this function returns
23533 an empty string.
23534 When mitigating against straight line speculation, use:
23535 * SB when the v8.5-A SB extension is enabled.
23536 * DSB+ISB otherwise. */
23537 const char *
23538 aarch64_sls_barrier (int mitigation_required)
23539 {
23540 return mitigation_required
23541 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23542 : "";
23543 }
23544
23545 static GTY (()) tree aarch64_sls_shared_thunks[30];
23546 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23547 const char *indirect_symbol_names[30] = {
23548 "__call_indirect_x0",
23549 "__call_indirect_x1",
23550 "__call_indirect_x2",
23551 "__call_indirect_x3",
23552 "__call_indirect_x4",
23553 "__call_indirect_x5",
23554 "__call_indirect_x6",
23555 "__call_indirect_x7",
23556 "__call_indirect_x8",
23557 "__call_indirect_x9",
23558 "__call_indirect_x10",
23559 "__call_indirect_x11",
23560 "__call_indirect_x12",
23561 "__call_indirect_x13",
23562 "__call_indirect_x14",
23563 "__call_indirect_x15",
23564 "", /* "__call_indirect_x16", */
23565 "", /* "__call_indirect_x17", */
23566 "__call_indirect_x18",
23567 "__call_indirect_x19",
23568 "__call_indirect_x20",
23569 "__call_indirect_x21",
23570 "__call_indirect_x22",
23571 "__call_indirect_x23",
23572 "__call_indirect_x24",
23573 "__call_indirect_x25",
23574 "__call_indirect_x26",
23575 "__call_indirect_x27",
23576 "__call_indirect_x28",
23577 "__call_indirect_x29",
23578 };
23579
23580 /* Function to create a BLR thunk. This thunk is used to mitigate straight
23581 line speculation. Instead of a simple BLR that can be speculated past,
23582 we emit a BL to this thunk, and this thunk contains a BR to the relevant
23583 register. These thunks have the relevant speculation barriers put after
23584 their indirect branch so that speculation is blocked.
23585
23586 We use such a thunk so the speculation barriers are kept off the
23587 architecturally executed path in order to reduce the performance overhead.
23588
23589 When optimizing for size we use stubs shared by the linked object.
23590 When optimizing for performance we emit stubs for each function in the hope
23591 that the branch predictor can better train on jumps specific for a given
23592 function. */
23593 rtx
23594 aarch64_sls_create_blr_label (int regnum)
23595 {
23596 gcc_assert (STUB_REGNUM_P (regnum));
23597 if (optimize_function_for_size_p (cfun))
23598 {
23599 /* For the thunks shared between different functions in this compilation
23600 unit we use a named symbol -- this is just for users to more easily
23601 understand the generated assembly. */
23602 aarch64_sls_shared_thunks_needed = true;
23603 const char *thunk_name = indirect_symbol_names[regnum];
23604 if (aarch64_sls_shared_thunks[regnum] == NULL)
23605 {
23606 /* Build a decl representing this function stub and record it for
23607 later. We build a decl here so we can use the GCC machinery for
23608 handling sections automatically (through `get_named_section` and
23609 `make_decl_one_only`). That saves us a lot of trouble handling
23610 the specifics of different output file formats. */
23611 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23612 get_identifier (thunk_name),
23613 build_function_type_list (void_type_node,
23614 NULL_TREE));
23615 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23616 NULL_TREE, void_type_node);
23617 TREE_PUBLIC (decl) = 1;
23618 TREE_STATIC (decl) = 1;
23619 DECL_IGNORED_P (decl) = 1;
23620 DECL_ARTIFICIAL (decl) = 1;
23621 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23622 resolve_unique_section (decl, 0, false);
23623 aarch64_sls_shared_thunks[regnum] = decl;
23624 }
23625
23626 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23627 }
23628
23629 if (cfun->machine->call_via[regnum] == NULL)
23630 cfun->machine->call_via[regnum]
23631 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23632 return cfun->machine->call_via[regnum];
23633 }
23634
23635 /* Helper function for aarch64_sls_emit_blr_function_thunks and
23636 aarch64_sls_emit_shared_blr_thunks below. */
23637 static void
23638 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23639 {
23640 /* Save in x16 and branch to that function so this transformation does
23641 not prevent jumping to `BTI c` instructions. */
23642 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23643 asm_fprintf (out_file, "\tbr\tx16\n");
23644 }
23645
23646 /* Emit all BLR stubs for this particular function.
23647 Here we emit all the BLR stubs needed for the current function. Since we
23648 emit these stubs in a consecutive block we know there will be no speculation
23649 gadgets between each stub, and hence we only emit a speculation barrier at
23650 the end of the stub sequences.
23651
23652 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
23653 void
23654 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23655 {
23656 if (! aarch64_harden_sls_blr_p ())
23657 return;
23658
23659 bool any_functions_emitted = false;
23660 /* We must save and restore the current function section since this assembly
23661 is emitted at the end of the function. This means it can be emitted *just
23662 after* the cold section of a function. That cold part would be emitted in
23663 a different section. That switch would trigger a `.cfi_endproc` directive
23664 to be emitted in the original section and a `.cfi_startproc` directive to
23665 be emitted in the new section. Switching to the original section without
23666 restoring would mean that the `.cfi_endproc` emitted as a function ends
23667 would happen in a different section -- leaving an unmatched
23668 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23669 in the standard text section. */
23670 section *save_text_section = in_section;
23671 switch_to_section (function_section (current_function_decl));
23672 for (int regnum = 0; regnum < 30; ++regnum)
23673 {
23674 rtx specu_label = cfun->machine->call_via[regnum];
23675 if (specu_label == NULL)
23676 continue;
23677
23678 targetm.asm_out.print_operand (out_file, specu_label, 0);
23679 asm_fprintf (out_file, ":\n");
23680 aarch64_sls_emit_function_stub (out_file, regnum);
23681 any_functions_emitted = true;
23682 }
23683 if (any_functions_emitted)
23684 /* Can use the SB if need be here, since this stub will only be used
23685 by the current function, and hence for the current target. */
23686 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23687 switch_to_section (save_text_section);
23688 }
23689
23690 /* Emit shared BLR stubs for the current compilation unit.
23691 Over the course of compiling this unit we may have converted some BLR
23692 instructions to a BL to a shared stub function. This is where we emit those
23693 stub functions.
23694 This function is for the stubs shared between different functions in this
23695 compilation unit. We share when optimizing for size instead of speed.
23696
23697 This function is called through the TARGET_ASM_FILE_END hook. */
23698 void
23699 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23700 {
23701 if (! aarch64_sls_shared_thunks_needed)
23702 return;
23703
23704 for (int regnum = 0; regnum < 30; ++regnum)
23705 {
23706 tree decl = aarch64_sls_shared_thunks[regnum];
23707 if (!decl)
23708 continue;
23709
23710 const char *name = indirect_symbol_names[regnum];
23711 switch_to_section (get_named_section (decl, NULL, 0));
23712 ASM_OUTPUT_ALIGN (out_file, 2);
23713 targetm.asm_out.globalize_label (out_file, name);
23714 /* Only emits if the compiler is configured for an assembler that can
23715 handle visibility directives. */
23716 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23717 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23718 ASM_OUTPUT_LABEL (out_file, name);
23719 aarch64_sls_emit_function_stub (out_file, regnum);
23720 /* Use the most conservative target to ensure it can always be used by any
23721 function in the translation unit. */
23722 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23723 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23724 }
23725 }
23726
23727 /* Implement TARGET_ASM_FILE_END. */
23728 void
23729 aarch64_asm_file_end ()
23730 {
23731 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23732 /* Since this function will be called for the ASM_FILE_END hook, we ensure
23733 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23734 for FreeBSD) still gets called. */
23735 #ifdef TARGET_ASM_FILE_END
23736 TARGET_ASM_FILE_END ();
23737 #endif
23738 }
23739
23740 const char *
23741 aarch64_indirect_call_asm (rtx addr)
23742 {
23743 gcc_assert (REG_P (addr));
23744 if (aarch64_harden_sls_blr_p ())
23745 {
23746 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23747 output_asm_insn ("bl\t%0", &stub_label);
23748 }
23749 else
23750 output_asm_insn ("blr\t%0", &addr);
23751 return "";
23752 }
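/* Putting the pieces together (a sketch): with -mharden-sls=blr, an
   indirect call "blr x1" is instead emitted as "bl __call_indirect_x1"
   (or a BL to a per-function local label when not optimizing for size),
   where the stub consists of

     mov  x16, x1
     br   x16

   followed by a suitable speculation barrier.  */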
23753
23754 /* Target-specific selftests. */
23755
23756 #if CHECKING_P
23757
23758 namespace selftest {
23759
23760 /* Selftest for the RTL loader.
23761 Verify that the RTL loader copes with a dump from
23762 print_rtx_function. This is essentially just a test that class
23763 function_reader can handle a real dump, but it also verifies
23764 that lookup_reg_by_dump_name correctly handles hard regs.
23765 The presence of hard reg names in the dump means that the test is
23766 target-specific, hence it is in this file. */
23767
23768 static void
23769 aarch64_test_loading_full_dump ()
23770 {
23771 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23772
23773 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23774
23775 rtx_insn *insn_1 = get_insn_by_uid (1);
23776 ASSERT_EQ (NOTE, GET_CODE (insn_1));
23777
23778 rtx_insn *insn_15 = get_insn_by_uid (15);
23779 ASSERT_EQ (INSN, GET_CODE (insn_15));
23780 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23781
23782 /* Verify crtl->return_rtx. */
23783 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23784 ASSERT_EQ (0, REGNO (crtl->return_rtx));
23785 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23786 }
23787
23788 /* Run all target-specific selftests. */
23789
23790 static void
23791 aarch64_run_selftests (void)
23792 {
23793 aarch64_test_loading_full_dump ();
23794 }
23795
23796 } // namespace selftest
23797
23798 #endif /* #if CHECKING_P */
23799
23800 #undef TARGET_STACK_PROTECT_GUARD
23801 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23802
23803 #undef TARGET_ADDRESS_COST
23804 #define TARGET_ADDRESS_COST aarch64_address_cost
23805
23806 /* This hook determines whether unnamed bitfields affect the alignment
23807 of the containing structure. The hook returns true if the structure
23808 should inherit the alignment requirements of an unnamed bitfield's
23809 type. */
23810 #undef TARGET_ALIGN_ANON_BITFIELD
23811 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23812
23813 #undef TARGET_ASM_ALIGNED_DI_OP
23814 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23815
23816 #undef TARGET_ASM_ALIGNED_HI_OP
23817 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23818
23819 #undef TARGET_ASM_ALIGNED_SI_OP
23820 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23821
23822 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23823 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23824 hook_bool_const_tree_hwi_hwi_const_tree_true
23825
23826 #undef TARGET_ASM_FILE_START
23827 #define TARGET_ASM_FILE_START aarch64_start_file
23828
23829 #undef TARGET_ASM_OUTPUT_MI_THUNK
23830 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23831
23832 #undef TARGET_ASM_SELECT_RTX_SECTION
23833 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23834
23835 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23836 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23837
23838 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23839 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23840
23841 #undef TARGET_BUILD_BUILTIN_VA_LIST
23842 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23843
23844 #undef TARGET_CALLEE_COPIES
23845 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
23846
23847 #undef TARGET_CAN_ELIMINATE
23848 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23849
23850 #undef TARGET_CAN_INLINE_P
23851 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
23852
23853 #undef TARGET_CANNOT_FORCE_CONST_MEM
23854 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23855
23856 #undef TARGET_CASE_VALUES_THRESHOLD
23857 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23858
23859 #undef TARGET_CONDITIONAL_REGISTER_USAGE
23860 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23861
23862 #undef TARGET_MEMBER_TYPE_FORCES_BLK
23863 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23864
23865 /* Only the least significant bit is used for initialization guard
23866 variables. */
23867 #undef TARGET_CXX_GUARD_MASK_BIT
23868 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23869
23870 #undef TARGET_C_MODE_FOR_SUFFIX
23871 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23872
23873 #ifdef TARGET_BIG_ENDIAN_DEFAULT
23874 #undef TARGET_DEFAULT_TARGET_FLAGS
23875 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23876 #endif
23877
23878 #undef TARGET_CLASS_MAX_NREGS
23879 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23880
23881 #undef TARGET_BUILTIN_DECL
23882 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
23883
23884 #undef TARGET_BUILTIN_RECIPROCAL
23885 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23886
23887 #undef TARGET_C_EXCESS_PRECISION
23888 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23889
23890 #undef TARGET_EXPAND_BUILTIN
23891 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23892
23893 #undef TARGET_EXPAND_BUILTIN_VA_START
23894 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23895
23896 #undef TARGET_FOLD_BUILTIN
23897 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23898
23899 #undef TARGET_FUNCTION_ARG
23900 #define TARGET_FUNCTION_ARG aarch64_function_arg
23901
23902 #undef TARGET_FUNCTION_ARG_ADVANCE
23903 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23904
23905 #undef TARGET_FUNCTION_ARG_BOUNDARY
23906 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23907
23908 #undef TARGET_FUNCTION_ARG_PADDING
23909 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23910
23911 #undef TARGET_GET_RAW_RESULT_MODE
23912 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23913 #undef TARGET_GET_RAW_ARG_MODE
23914 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23915
23916 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
23917 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23918
23919 #undef TARGET_FUNCTION_VALUE
23920 #define TARGET_FUNCTION_VALUE aarch64_function_value
23921
23922 #undef TARGET_FUNCTION_VALUE_REGNO_P
23923 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23924
23925 #undef TARGET_GIMPLE_FOLD_BUILTIN
23926 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
23927
23928 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
23929 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23930
23931 #undef TARGET_INIT_BUILTINS
23932 #define TARGET_INIT_BUILTINS aarch64_init_builtins
23933
23934 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23935 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23936 aarch64_ira_change_pseudo_allocno_class
23937
23938 #undef TARGET_LEGITIMATE_ADDRESS_P
23939 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23940
23941 #undef TARGET_LEGITIMATE_CONSTANT_P
23942 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23943
23944 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23945 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23946 aarch64_legitimize_address_displacement
23947
23948 #undef TARGET_LIBGCC_CMP_RETURN_MODE
23949 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23950
23951 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23952 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23953 aarch64_libgcc_floating_mode_supported_p
23954
23955 #undef TARGET_MANGLE_TYPE
23956 #define TARGET_MANGLE_TYPE aarch64_mangle_type
23957
23958 #undef TARGET_INVALID_CONVERSION
23959 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23960
23961 #undef TARGET_INVALID_UNARY_OP
23962 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23963
23964 #undef TARGET_INVALID_BINARY_OP
23965 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23966
23967 #undef TARGET_VERIFY_TYPE_CONTEXT
23968 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23969
23970 #undef TARGET_MEMORY_MOVE_COST
23971 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23972
23973 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23974 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23975
23976 #undef TARGET_MUST_PASS_IN_STACK
23977 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23978
23979 /* This target hook should return true if accesses to volatile bitfields
23980 should use the narrowest mode possible. It should return false if these
23981 accesses should use the bitfield container type. */
23982 #undef TARGET_NARROW_VOLATILE_BITFIELD
23983 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
23984
23985 #undef TARGET_OPTION_OVERRIDE
23986 #define TARGET_OPTION_OVERRIDE aarch64_override_options
23987
23988 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
23989 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
23990 aarch64_override_options_after_change
23991
23992 #undef TARGET_OFFLOAD_OPTIONS
23993 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
23994
23995 #undef TARGET_OPTION_SAVE
23996 #define TARGET_OPTION_SAVE aarch64_option_save
23997
23998 #undef TARGET_OPTION_RESTORE
23999 #define TARGET_OPTION_RESTORE aarch64_option_restore
24000
24001 #undef TARGET_OPTION_PRINT
24002 #define TARGET_OPTION_PRINT aarch64_option_print
24003
24004 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
24005 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
24006
24007 #undef TARGET_SET_CURRENT_FUNCTION
24008 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
24009
24010 #undef TARGET_PASS_BY_REFERENCE
24011 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
24012
24013 #undef TARGET_PREFERRED_RELOAD_CLASS
24014 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
24015
24016 #undef TARGET_SCHED_REASSOCIATION_WIDTH
24017 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
24018
24019 #undef TARGET_PROMOTED_TYPE
24020 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
24021
24022 #undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
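/* Illustrative sketch only (the exact code generated may differ): with
   section anchors, nearby globals such as

     int a, b;
     int sum (void) { return a + b; }

   can both be reached from a single anchor symbol, roughly along the
   lines of

     adrp    x1, .LANCHOR0
     add     x1, x1, :lo12:.LANCHOR0
     ldr     w0, [x1]           // a, at anchor + 0
     ldr     w1, [x1, 4]        // b, at anchor + 4

   so the [-256, 4095] window defined above bounds how far an object may
   sit from the anchor used to address it.  */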

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
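/* The hook value is read as a power of two naming the function-pointer
   bit used to tag custom descriptors for nested functions at run time,
   so the 4 above selects bit 2, in line with the comment before it.  */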

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END aarch64_asm_file_end

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks

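/* Build the target vector itself: TARGET_INITIALIZER (from "target-def.h")
   fills in a default for every hook that is not overridden by one of the
   TARGET_* macros above.  The rest of the compiler then reaches these
   hooks through targetm, for instance targetm.max_anchor_offset or
   targetm.sched.issue_rate ().  */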
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"