1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67
68 /* This file should be included last. */
69 #include "target-def.h"
70
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
73
74 /* Classifies an address.
75
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
78
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
81
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
84
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
87
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
90
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
93
94 ADDRESS_SYMBOLIC
95 A constant symbolic address, in pc-relative literal pool. */
96
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
105 };
106
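/* Record of a legitimate address as decomposed by the address
   classification code: its type, the base and offset components,
   any index shift amount, and the symbol type for symbolic
   addresses.  */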
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
113 };
114
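/* Describes a SIMD immediate operand: the constant value, the element
   width, any shift amount, and whether the MVN (inverted) or MSL
   (shift-ones) form of the instruction is required.  */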
115 struct simd_immediate_info
116 {
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
122 };
123
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
126
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
131
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
144
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
147
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
150
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
153
154 /* Global flag for PC relative loads. */
155 bool aarch64_nopcrelative_literal_loads;
156
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
160 {
161 const char* name;
162 unsigned int flag;
163 };
164
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
168 {
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
173 };
174 #undef AARCH64_FUSION_PAIR
175
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 {
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
184 };
185 #undef AARCH64_EXTRA_TUNING_OPTION
186
187 /* Tuning parameters. */
188
189 static const struct cpu_addrcost_table generic_addrcost_table =
190 {
191 {
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
196 },
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
203 };
204
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
206 {
207 {
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
212 },
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
219 };
220
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
222 {
223 {
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
228 },
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
235 };
236
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
238 {
239 {
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
244 },
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
251 };
252
253 static const struct cpu_regmove_cost generic_regmove_cost =
254 {
255 1, /* GP2GP */
256 /* Avoid the use of slow int<->fp moves for spilling by setting
257 their cost higher than memmov_cost. */
258 5, /* GP2FP */
259 5, /* FP2GP */
260 2 /* FP2FP */
261 };
262
263 static const struct cpu_regmove_cost cortexa57_regmove_cost =
264 {
265 1, /* GP2GP */
266 /* Avoid the use of slow int<->fp moves for spilling by setting
267 their cost higher than memmov_cost. */
268 5, /* GP2FP */
269 5, /* FP2GP */
270 2 /* FP2FP */
271 };
272
273 static const struct cpu_regmove_cost cortexa53_regmove_cost =
274 {
275 1, /* GP2GP */
276 /* Avoid the use of slow int<->fp moves for spilling by setting
277 their cost higher than memmov_cost. */
278 5, /* GP2FP */
279 5, /* FP2GP */
280 2 /* FP2FP */
281 };
282
283 static const struct cpu_regmove_cost exynosm1_regmove_cost =
284 {
285 1, /* GP2GP */
286 /* Avoid the use of slow int<->fp moves for spilling by setting
287 their cost higher than memmov_cost (actual costs are 4 and 9). */
288 9, /* GP2FP */
289 9, /* FP2GP */
290 1 /* FP2FP */
291 };
292
293 static const struct cpu_regmove_cost thunderx_regmove_cost =
294 {
295 2, /* GP2GP */
296 2, /* GP2FP */
297 6, /* FP2GP */
298 4 /* FP2FP */
299 };
300
301 static const struct cpu_regmove_cost xgene1_regmove_cost =
302 {
303 1, /* GP2GP */
304 /* Avoid the use of slow int<->fp moves for spilling by setting
305 their cost higher than memmov_cost. */
306 8, /* GP2FP */
307 8, /* FP2GP */
308 2 /* FP2FP */
309 };
310
311 /* Generic costs for vector insn classes. */
312 static const struct cpu_vector_cost generic_vector_cost =
313 {
314 1, /* scalar_stmt_cost */
315 1, /* scalar_load_cost */
316 1, /* scalar_store_cost */
317 1, /* vec_stmt_cost */
318 2, /* vec_permute_cost */
319 1, /* vec_to_scalar_cost */
320 1, /* scalar_to_vec_cost */
321 1, /* vec_align_load_cost */
322 1, /* vec_unalign_load_cost */
323 1, /* vec_unalign_store_cost */
324 1, /* vec_store_cost */
325 3, /* cond_taken_branch_cost */
326 1 /* cond_not_taken_branch_cost */
327 };
328
329 /* Cortex-A57 costs for vector insn classes. */
330 static const struct cpu_vector_cost cortexa57_vector_cost =
331 {
332 1, /* scalar_stmt_cost */
333 4, /* scalar_load_cost */
334 1, /* scalar_store_cost */
335 3, /* vec_stmt_cost */
336 3, /* vec_permute_cost */
337 8, /* vec_to_scalar_cost */
338 8, /* scalar_to_vec_cost */
339 5, /* vec_align_load_cost */
340 5, /* vec_unalign_load_cost */
341 1, /* vec_unalign_store_cost */
342 1, /* vec_store_cost */
343 1, /* cond_taken_branch_cost */
344 1 /* cond_not_taken_branch_cost */
345 };
346
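/* Exynos M1 costs for vector insn classes.  */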
347 static const struct cpu_vector_cost exynosm1_vector_cost =
348 {
349 1, /* scalar_stmt_cost */
350 5, /* scalar_load_cost */
351 1, /* scalar_store_cost */
352 3, /* vec_stmt_cost */
353 3, /* vec_permute_cost */
354 3, /* vec_to_scalar_cost */
355 3, /* scalar_to_vec_cost */
356 5, /* vec_align_load_cost */
357 5, /* vec_unalign_load_cost */
358 1, /* vec_unalign_store_cost */
359 1, /* vec_store_cost */
360 1, /* cond_taken_branch_cost */
361 1 /* cond_not_taken_branch_cost */
362 };
363
364 /* X-Gene 1 costs for vector insn classes. */
365 static const struct cpu_vector_cost xgene1_vector_cost =
366 {
367 1, /* scalar_stmt_cost */
368 5, /* scalar_load_cost */
369 1, /* scalar_store_cost */
370 2, /* vec_stmt_cost */
371 2, /* vec_permute_cost */
372 4, /* vec_to_scalar_cost */
373 4, /* scalar_to_vec_cost */
374 10, /* vec_align_load_cost */
375 10, /* vec_unalign_load_cost */
376 2, /* vec_unalign_store_cost */
377 2, /* vec_store_cost */
378 2, /* cond_taken_branch_cost */
379 1 /* cond_not_taken_branch_cost */
380 };
381
382 /* Generic costs for branch instructions. */
383 static const struct cpu_branch_cost generic_branch_cost =
384 {
385 2, /* Predictable. */
386 2 /* Unpredictable. */
387 };
388
389 /* Branch costs for Cortex-A57. */
390 static const struct cpu_branch_cost cortexa57_branch_cost =
391 {
392 1, /* Predictable. */
393 3 /* Unpredictable. */
394 };
395
396 static const struct tune_params generic_tunings =
397 {
398 &cortexa57_extra_costs,
399 &generic_addrcost_table,
400 &generic_regmove_cost,
401 &generic_vector_cost,
402 &generic_branch_cost,
403 4, /* memmov_cost */
404 2, /* issue_rate */
405 AARCH64_FUSE_NOTHING, /* fusible_ops */
406 8, /* function_align. */
407 8, /* jump_align. */
408 4, /* loop_align. */
409 2, /* int_reassoc_width. */
410 4, /* fp_reassoc_width. */
411 1, /* vec_reassoc_width. */
412 2, /* min_div_recip_mul_sf. */
413 2, /* min_div_recip_mul_df. */
414 0, /* max_case_values. */
415 0, /* cache_line_size. */
416 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
417 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
418 };
419
420 static const struct tune_params cortexa35_tunings =
421 {
422 &cortexa53_extra_costs,
423 &generic_addrcost_table,
424 &cortexa53_regmove_cost,
425 &generic_vector_cost,
426 &generic_branch_cost,
427 4, /* memmov_cost */
428 1, /* issue_rate */
429 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
430 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
431 8, /* function_align. */
432 8, /* jump_align. */
433 4, /* loop_align. */
434 2, /* int_reassoc_width. */
435 4, /* fp_reassoc_width. */
436 1, /* vec_reassoc_width. */
437 2, /* min_div_recip_mul_sf. */
438 2, /* min_div_recip_mul_df. */
439 0, /* max_case_values. */
440 0, /* cache_line_size. */
441 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
442 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
443 };
444
445 static const struct tune_params cortexa53_tunings =
446 {
447 &cortexa53_extra_costs,
448 &generic_addrcost_table,
449 &cortexa53_regmove_cost,
450 &generic_vector_cost,
451 &generic_branch_cost,
452 4, /* memmov_cost */
453 2, /* issue_rate */
454 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
455 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
456 8, /* function_align. */
457 8, /* jump_align. */
458 4, /* loop_align. */
459 2, /* int_reassoc_width. */
460 4, /* fp_reassoc_width. */
461 1, /* vec_reassoc_width. */
462 2, /* min_div_recip_mul_sf. */
463 2, /* min_div_recip_mul_df. */
464 0, /* max_case_values. */
465 0, /* cache_line_size. */
466 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
467 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
468 };
469
470 static const struct tune_params cortexa57_tunings =
471 {
472 &cortexa57_extra_costs,
473 &cortexa57_addrcost_table,
474 &cortexa57_regmove_cost,
475 &cortexa57_vector_cost,
476 &cortexa57_branch_cost,
477 4, /* memmov_cost */
478 3, /* issue_rate */
479 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
480 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
481 16, /* function_align. */
482 8, /* jump_align. */
483 4, /* loop_align. */
484 2, /* int_reassoc_width. */
485 4, /* fp_reassoc_width. */
486 1, /* vec_reassoc_width. */
487 2, /* min_div_recip_mul_sf. */
488 2, /* min_div_recip_mul_df. */
489 0, /* max_case_values. */
490 0, /* cache_line_size. */
491 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
492 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
493 };
494
495 static const struct tune_params cortexa72_tunings =
496 {
497 &cortexa57_extra_costs,
498 &cortexa57_addrcost_table,
499 &cortexa57_regmove_cost,
500 &cortexa57_vector_cost,
501 &generic_branch_cost,
502 4, /* memmov_cost */
503 3, /* issue_rate */
504 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
505 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
506 16, /* function_align. */
507 8, /* jump_align. */
508 4, /* loop_align. */
509 2, /* int_reassoc_width. */
510 4, /* fp_reassoc_width. */
511 1, /* vec_reassoc_width. */
512 2, /* min_div_recip_mul_sf. */
513 2, /* min_div_recip_mul_df. */
514 0, /* max_case_values. */
515 0, /* cache_line_size. */
516 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
517 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
518 };
519
520 static const struct tune_params exynosm1_tunings =
521 {
522 &exynosm1_extra_costs,
523 &exynosm1_addrcost_table,
524 &exynosm1_regmove_cost,
525 &exynosm1_vector_cost,
526 &generic_branch_cost,
527 4, /* memmov_cost */
528 3, /* issue_rate */
529 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
530 4, /* function_align. */
531 4, /* jump_align. */
532 4, /* loop_align. */
533 2, /* int_reassoc_width. */
534 4, /* fp_reassoc_width. */
535 1, /* vec_reassoc_width. */
536 2, /* min_div_recip_mul_sf. */
537 2, /* min_div_recip_mul_df. */
538 48, /* max_case_values. */
539 64, /* cache_line_size. */
540 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
541 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
542 };
543
544 static const struct tune_params thunderx_tunings =
545 {
546 &thunderx_extra_costs,
547 &generic_addrcost_table,
548 &thunderx_regmove_cost,
549 &generic_vector_cost,
550 &generic_branch_cost,
551 6, /* memmov_cost */
552 2, /* issue_rate */
553 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
554 8, /* function_align. */
555 8, /* jump_align. */
556 8, /* loop_align. */
557 2, /* int_reassoc_width. */
558 4, /* fp_reassoc_width. */
559 1, /* vec_reassoc_width. */
560 2, /* min_div_recip_mul_sf. */
561 2, /* min_div_recip_mul_df. */
562 0, /* max_case_values. */
563 0, /* cache_line_size. */
564 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
565 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
566 };
567
568 static const struct tune_params xgene1_tunings =
569 {
570 &xgene1_extra_costs,
571 &xgene1_addrcost_table,
572 &xgene1_regmove_cost,
573 &xgene1_vector_cost,
574 &generic_branch_cost,
575 6, /* memmov_cost */
576 4, /* issue_rate */
577 AARCH64_FUSE_NOTHING, /* fusible_ops */
578 16, /* function_align. */
579 8, /* jump_align. */
580 16, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 0, /* cache_line_size. */
588 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
589 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
590 };
591
592 /* Support for fine-grained override of the tuning structures. */
593 struct aarch64_tuning_override_function
594 {
595 const char* name;
596 void (*parse_override)(const char*, struct tune_params*);
597 };
598
599 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
600 static void aarch64_parse_tune_string (const char*, struct tune_params*);
601
602 static const struct aarch64_tuning_override_function
603 aarch64_tuning_override_functions[] =
604 {
605 { "fuse", aarch64_parse_fuse_string },
606 { "tune", aarch64_parse_tune_string },
607 { NULL, NULL }
608 };
609
610 /* A processor implementing AArch64. */
611 struct processor
612 {
613 const char *const name;
614 enum aarch64_processor ident;
615 enum aarch64_processor sched_core;
616 enum aarch64_arch arch;
617 unsigned architecture_version;
618 const unsigned long flags;
619 const struct tune_params *const tune;
620 };
621
622 /* Architectures implementing AArch64. */
623 static const struct processor all_architectures[] =
624 {
625 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
626 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
627 #include "aarch64-arches.def"
628 #undef AARCH64_ARCH
629 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
630 };
631
632 /* Processor cores implementing AArch64. */
633 static const struct processor all_cores[] =
634 {
635 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
636 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
637 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
638 FLAGS, &COSTS##_tunings},
639 #include "aarch64-cores.def"
640 #undef AARCH64_CORE
641 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
642 AARCH64_FL_FOR_ARCH8, &generic_tunings},
643 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
644 };
645
646
647 /* Target specification. These are populated by the -march, -mtune, -mcpu
648 handling code or by target attributes. */
649 static const struct processor *selected_arch;
650 static const struct processor *selected_cpu;
651 static const struct processor *selected_tune;
652
653 /* The current tuning set. */
654 struct tune_params aarch64_tune_params = generic_tunings;
655
656 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
657
658 /* An ISA extension in the co-processor and main instruction set space. */
659 struct aarch64_option_extension
660 {
661 const char *const name;
662 const unsigned long flags_on;
663 const unsigned long flags_off;
664 };
665
666 /* ISA extensions in AArch64. */
667 static const struct aarch64_option_extension all_extensions[] =
668 {
669 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
670 {NAME, FLAGS_ON, FLAGS_OFF},
671 #include "aarch64-option-extensions.def"
672 #undef AARCH64_OPT_EXTENSION
673 {NULL, 0, 0}
674 };
675
676 typedef enum aarch64_cond_code
677 {
678 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
679 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
680 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
681 }
682 aarch64_cc;
683
684 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
685
686 /* The condition codes of the processor, and the inverse function. */
687 static const char * const aarch64_condition_codes[] =
688 {
689 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
690 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
691 };
692
693 /* Generate code to enable conditional branches in functions over 1 MiB. */
694 const char *
695 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
696 const char * branch_format)
697 {
698 rtx_code_label * tmp_label = gen_label_rtx ();
699 char label_buf[256];
700 char buffer[128];
701 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
702 CODE_LABEL_NUMBER (tmp_label));
703 const char *label_ptr = targetm.strip_name_encoding (label_buf);
704 rtx dest_label = operands[pos_label];
705 operands[pos_label] = tmp_label;
706
707 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
708 output_asm_insn (buffer, operands);
709
710 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
711 operands[pos_label] = dest_label;
712 output_asm_insn (buffer, operands);
713 return "";
714 }
715
716 void
717 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
718 {
719 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
720 if (TARGET_GENERAL_REGS_ONLY)
721 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
722 else
723 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
724 }
725
726 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
727 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
728 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
729 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
730 cost (in this case the best class is the lowest cost one). Using ALL_REGS
731 irrespective of its cost results in bad allocations with many redundant
732 int<->FP moves which are expensive on various cores.
733 To avoid this we don't allow ALL_REGS as the allocno class, but force a
734 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
735 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
736 Otherwise set the allocno class depending on the mode.
737 The result of this is that it is no longer inefficient to have a higher
738 memory move cost than the register move cost.
739 */
740
741 static reg_class_t
742 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
743 reg_class_t best_class)
744 {
745 enum machine_mode mode;
746
747 if (allocno_class != ALL_REGS)
748 return allocno_class;
749
750 if (best_class != ALL_REGS)
751 return best_class;
752
753 mode = PSEUDO_REGNO_MODE (regno);
754 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
755 }
756
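/* Return the minimum number of divisions by the same divisor that make
   it worthwhile to replace them with a reciprocal multiplication in
   MODE, taken from the current tuning parameters.  */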
757 static unsigned int
758 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
759 {
760 if (GET_MODE_UNIT_SIZE (mode) == 4)
761 return aarch64_tune_params.min_div_recip_mul_sf;
762 return aarch64_tune_params.min_div_recip_mul_df;
763 }
764
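/* Return the reassociation width for operations in MODE, using the
   vector, integer or floating-point width from the current tuning
   parameters.  */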
765 static int
766 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
767 enum machine_mode mode)
768 {
769 if (VECTOR_MODE_P (mode))
770 return aarch64_tune_params.vec_reassoc_width;
771 if (INTEGRAL_MODE_P (mode))
772 return aarch64_tune_params.int_reassoc_width;
773 if (FLOAT_MODE_P (mode))
774 return aarch64_tune_params.fp_reassoc_width;
775 return 1;
776 }
777
778 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
779 unsigned
780 aarch64_dbx_register_number (unsigned regno)
781 {
782 if (GP_REGNUM_P (regno))
783 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
784 else if (regno == SP_REGNUM)
785 return AARCH64_DWARF_SP;
786 else if (FP_REGNUM_P (regno))
787 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
788
789 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
790 equivalent DWARF register. */
791 return DWARF_FRAME_REGISTERS;
792 }
793
794 /* Return TRUE if MODE is any of the large INT modes. */
795 static bool
796 aarch64_vect_struct_mode_p (machine_mode mode)
797 {
798 return mode == OImode || mode == CImode || mode == XImode;
799 }
800
801 /* Return TRUE if MODE is any of the vector modes. */
802 static bool
803 aarch64_vector_mode_p (machine_mode mode)
804 {
805 return aarch64_vector_mode_supported_p (mode)
806 || aarch64_vect_struct_mode_p (mode);
807 }
808
809 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
810 static bool
811 aarch64_array_mode_supported_p (machine_mode mode,
812 unsigned HOST_WIDE_INT nelems)
813 {
814 if (TARGET_SIMD
815 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
816 || AARCH64_VALID_SIMD_DREG_MODE (mode))
817 && (nelems >= 2 && nelems <= 4))
818 return true;
819
820 return false;
821 }
822
823 /* Implement HARD_REGNO_NREGS. */
824
825 int
826 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
827 {
828 switch (aarch64_regno_regclass (regno))
829 {
830 case FP_REGS:
831 case FP_LO_REGS:
832 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
833 default:
834 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
835 }
836 gcc_unreachable ();
837 }
838
839 /* Implement HARD_REGNO_MODE_OK. */
840
841 int
842 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
843 {
844 if (GET_MODE_CLASS (mode) == MODE_CC)
845 return regno == CC_REGNUM;
846
847 if (regno == SP_REGNUM)
848 /* The purpose of comparing with ptr_mode is to support the
849 global register variable associated with the stack pointer
850 register via the syntax of asm ("wsp") in ILP32. */
851 return mode == Pmode || mode == ptr_mode;
852
853 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
854 return mode == Pmode;
855
856 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
857 return 1;
858
859 if (FP_REGNUM_P (regno))
860 {
861 if (aarch64_vect_struct_mode_p (mode))
862 return
863 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
864 else
865 return 1;
866 }
867
868 return 0;
869 }
870
871 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
872 machine_mode
873 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
874 machine_mode mode)
875 {
876 /* Handle modes that fit within single registers. */
877 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
878 {
879 if (GET_MODE_SIZE (mode) >= 4)
880 return mode;
881 else
882 return SImode;
883 }
884 /* Fall back to generic for multi-reg and very large modes. */
885 else
886 return choose_hard_reg_mode (regno, nregs, false);
887 }
888
889 /* Return true if calls to DECL should be treated as
890 long-calls (ie called via a register). */
891 static bool
892 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
893 {
894 return false;
895 }
896
897 /* Return true if calls to symbol-ref SYM should be treated as
898 long-calls (ie called via a register). */
899 bool
900 aarch64_is_long_call_p (rtx sym)
901 {
902 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
903 }
904
905 /* Return true if calls to symbol-ref SYM should not go through
906 plt stubs. */
907
908 bool
909 aarch64_is_noplt_call_p (rtx sym)
910 {
911 const_tree decl = SYMBOL_REF_DECL (sym);
912
913 if (flag_pic
914 && decl
915 && (!flag_plt
916 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
917 && !targetm.binds_local_p (decl))
918 return true;
919
920 return false;
921 }
922
923 /* Return true if the offsets to a zero/sign-extract operation
924 represent an expression that matches an extend operation. The
925 operands represent the parameters from
926
927 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
928 bool
929 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
930 rtx extract_imm)
931 {
932 HOST_WIDE_INT mult_val, extract_val;
933
934 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
935 return false;
936
937 mult_val = INTVAL (mult_imm);
938 extract_val = INTVAL (extract_imm);
939
940 if (extract_val > 8
941 && extract_val < GET_MODE_BITSIZE (mode)
942 && exact_log2 (extract_val & ~7) > 0
943 && (extract_val & 7) <= 4
944 && mult_val == (1 << (extract_val & 7)))
945 return true;
946
947 return false;
948 }
949
950 /* Emit an insn that's a simple single-set. Both the operands must be
951 known to be valid. */
952 inline static rtx
953 emit_set_insn (rtx x, rtx y)
954 {
955 return emit_insn (gen_rtx_SET (x, y));
956 }
957
958 /* X and Y are two things to compare using CODE. Emit the compare insn and
959 return the rtx for register 0 in the proper mode. */
960 rtx
961 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
962 {
963 machine_mode mode = SELECT_CC_MODE (code, x, y);
964 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
965
966 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
967 return cc_reg;
968 }
969
970 /* Build the SYMBOL_REF for __tls_get_addr. */
971
972 static GTY(()) rtx tls_get_addr_libfunc;
973
974 rtx
975 aarch64_tls_get_addr (void)
976 {
977 if (!tls_get_addr_libfunc)
978 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
979 return tls_get_addr_libfunc;
980 }
981
982 /* Return the TLS model to use for ADDR. */
983
984 static enum tls_model
985 tls_symbolic_operand_type (rtx addr)
986 {
987 enum tls_model tls_kind = TLS_MODEL_NONE;
988 rtx sym, addend;
989
990 if (GET_CODE (addr) == CONST)
991 {
992 split_const (addr, &sym, &addend);
993 if (GET_CODE (sym) == SYMBOL_REF)
994 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
995 }
996 else if (GET_CODE (addr) == SYMBOL_REF)
997 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
998
999 return tls_kind;
1000 }
1001
1002 /* We'll allow LO_SUMs in our legitimate addresses so that combine
1003 can take care of combining addresses where necessary, but for
1004 generation purposes we'll generate the address
1005 as:
1006 RTL Absolute
1007 tmp = hi (symbol_ref); adrp x1, foo
1008 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1009 nop
1010
1011 PIC TLS
1012 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1013 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1014 bl __tls_get_addr
1015 nop
1016
1017 Load TLS symbol, depending on TLS mechanism and TLS access model.
1018
1019 Global Dynamic - Traditional TLS:
1020 adrp tmp, :tlsgd:imm
1021 add dest, tmp, #:tlsgd_lo12:imm
1022 bl __tls_get_addr
1023
1024 Global Dynamic - TLS Descriptors:
1025 adrp dest, :tlsdesc:imm
1026 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1027 add dest, dest, #:tlsdesc_lo12:imm
1028 blr tmp
1029 mrs tp, tpidr_el0
1030 add dest, dest, tp
1031
1032 Initial Exec:
1033 mrs tp, tpidr_el0
1034 adrp tmp, :gottprel:imm
1035 ldr dest, [tmp, #:gottprel_lo12:imm]
1036 add dest, dest, tp
1037
1038 Local Exec:
1039 mrs tp, tpidr_el0
1040 add t0, tp, #:tprel_hi12:imm, lsl #12
1041 add t0, t0, #:tprel_lo12_nc:imm
1042 */
1043
1044 static void
1045 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1046 enum aarch64_symbol_type type)
1047 {
1048 switch (type)
1049 {
1050 case SYMBOL_SMALL_ABSOLUTE:
1051 {
1052 /* In ILP32, the mode of dest can be either SImode or DImode. */
1053 rtx tmp_reg = dest;
1054 machine_mode mode = GET_MODE (dest);
1055
1056 gcc_assert (mode == Pmode || mode == ptr_mode);
1057
1058 if (can_create_pseudo_p ())
1059 tmp_reg = gen_reg_rtx (mode);
1060
1061 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1062 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1063 return;
1064 }
1065
1066 case SYMBOL_TINY_ABSOLUTE:
1067 emit_insn (gen_rtx_SET (dest, imm));
1068 return;
1069
1070 case SYMBOL_SMALL_GOT_28K:
1071 {
1072 machine_mode mode = GET_MODE (dest);
1073 rtx gp_rtx = pic_offset_table_rtx;
1074 rtx insn;
1075 rtx mem;
1076
1077 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1078 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
1079 decide rtx costs, in which case pic_offset_table_rtx is not
1080 initialized. In that case there is no need to generate the first adrp
1081 instruction, as the final cost for global variable access is
1082 one instruction. */
1083 if (gp_rtx != NULL)
1084 {
1085 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1086 use the page base as the GOT base, the first page may be wasted;
1087 in the worst case there is only 28K of space for the GOT).
1088
1089 The generated instruction sequence for accessing a global variable
1090 is:
1091
1092 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1093
1094 Only one instruction is needed. But we must initialize
1095 pic_offset_table_rtx properly. We generate the initialization insn
1096 for every global access, and allow CSE to remove all redundant ones.
1097
1098 The final instruction sequence will look like the following
1099 for multiple global variable accesses.
1100
1101 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1102
1103 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1104 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1105 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1106 ... */
1107
1108 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1109 crtl->uses_pic_offset_table = 1;
1110 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1111
1112 if (mode != GET_MODE (gp_rtx))
1113 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1114 }
1115
1116 if (mode == ptr_mode)
1117 {
1118 if (mode == DImode)
1119 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1120 else
1121 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1122
1123 mem = XVECEXP (SET_SRC (insn), 0, 0);
1124 }
1125 else
1126 {
1127 gcc_assert (mode == Pmode);
1128
1129 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1130 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1131 }
1132
1133 /* The operand is expected to be a MEM. Whenever the related insn
1134 pattern changes, the code above which calculates MEM should be
1135 updated. */
1136 gcc_assert (GET_CODE (mem) == MEM);
1137 MEM_READONLY_P (mem) = 1;
1138 MEM_NOTRAP_P (mem) = 1;
1139 emit_insn (insn);
1140 return;
1141 }
1142
1143 case SYMBOL_SMALL_GOT_4G:
1144 {
1145 /* In ILP32, the mode of dest can be either SImode or DImode,
1146 while the got entry is always of SImode size. The mode of
1147 dest depends on how dest is used: if dest is assigned to a
1148 pointer (e.g. stored in memory), it has SImode; it may have
1149 DImode if dest is dereferenced to access the memory.
1150 This is why we have to handle three different ldr_got_small
1151 patterns here (two patterns for ILP32). */
1152
1153 rtx insn;
1154 rtx mem;
1155 rtx tmp_reg = dest;
1156 machine_mode mode = GET_MODE (dest);
1157
1158 if (can_create_pseudo_p ())
1159 tmp_reg = gen_reg_rtx (mode);
1160
1161 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1162 if (mode == ptr_mode)
1163 {
1164 if (mode == DImode)
1165 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1166 else
1167 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1168
1169 mem = XVECEXP (SET_SRC (insn), 0, 0);
1170 }
1171 else
1172 {
1173 gcc_assert (mode == Pmode);
1174
1175 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1176 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1177 }
1178
1179 gcc_assert (GET_CODE (mem) == MEM);
1180 MEM_READONLY_P (mem) = 1;
1181 MEM_NOTRAP_P (mem) = 1;
1182 emit_insn (insn);
1183 return;
1184 }
1185
1186 case SYMBOL_SMALL_TLSGD:
1187 {
1188 rtx_insn *insns;
1189 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1190
1191 start_sequence ();
1192 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1193 insns = get_insns ();
1194 end_sequence ();
1195
1196 RTL_CONST_CALL_P (insns) = 1;
1197 emit_libcall_block (insns, dest, result, imm);
1198 return;
1199 }
1200
1201 case SYMBOL_SMALL_TLSDESC:
1202 {
1203 machine_mode mode = GET_MODE (dest);
1204 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1205 rtx tp;
1206
1207 gcc_assert (mode == Pmode || mode == ptr_mode);
1208
1209 /* In ILP32, the got entry is always of SImode size. Unlike
1210 small GOT, the dest is fixed at reg 0. */
1211 if (TARGET_ILP32)
1212 emit_insn (gen_tlsdesc_small_si (imm));
1213 else
1214 emit_insn (gen_tlsdesc_small_di (imm));
1215 tp = aarch64_load_tp (NULL);
1216
1217 if (mode != Pmode)
1218 tp = gen_lowpart (mode, tp);
1219
1220 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1221 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1222 return;
1223 }
1224
1225 case SYMBOL_SMALL_TLSIE:
1226 {
1227 /* In ILP32, the mode of dest can be either SImode or DImode,
1228 while the got entry is always of SImode size. The mode of
1229 dest depends on how dest is used: if dest is assigned to a
1230 pointer (e.g. stored in memory), it has SImode; it may have
1231 DImode if dest is dereferenced to access the memory.
1232 This is why we have to handle three different tlsie_small
1233 patterns here (two patterns for ILP32). */
1234 machine_mode mode = GET_MODE (dest);
1235 rtx tmp_reg = gen_reg_rtx (mode);
1236 rtx tp = aarch64_load_tp (NULL);
1237
1238 if (mode == ptr_mode)
1239 {
1240 if (mode == DImode)
1241 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1242 else
1243 {
1244 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1245 tp = gen_lowpart (mode, tp);
1246 }
1247 }
1248 else
1249 {
1250 gcc_assert (mode == Pmode);
1251 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1252 }
1253
1254 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1255 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1256 return;
1257 }
1258
1259 case SYMBOL_TLSLE12:
1260 case SYMBOL_TLSLE24:
1261 case SYMBOL_TLSLE32:
1262 case SYMBOL_TLSLE48:
1263 {
1264 machine_mode mode = GET_MODE (dest);
1265 rtx tp = aarch64_load_tp (NULL);
1266
1267 if (mode != Pmode)
1268 tp = gen_lowpart (mode, tp);
1269
1270 switch (type)
1271 {
1272 case SYMBOL_TLSLE12:
1273 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1274 (dest, tp, imm));
1275 break;
1276 case SYMBOL_TLSLE24:
1277 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1278 (dest, tp, imm));
1279 break;
1280 case SYMBOL_TLSLE32:
1281 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1282 (dest, imm));
1283 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1284 (dest, dest, tp));
1285 break;
1286 case SYMBOL_TLSLE48:
1287 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1288 (dest, imm));
1289 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1290 (dest, dest, tp));
1291 break;
1292 default:
1293 gcc_unreachable ();
1294 }
1295
1296 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1297 return;
1298 }
1299
1300 case SYMBOL_TINY_GOT:
1301 emit_insn (gen_ldr_got_tiny (dest, imm));
1302 return;
1303
1304 case SYMBOL_TINY_TLSIE:
1305 {
1306 machine_mode mode = GET_MODE (dest);
1307 rtx tp = aarch64_load_tp (NULL);
1308
1309 if (mode == ptr_mode)
1310 {
1311 if (mode == DImode)
1312 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1313 else
1314 {
1315 tp = gen_lowpart (mode, tp);
1316 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1317 }
1318 }
1319 else
1320 {
1321 gcc_assert (mode == Pmode);
1322 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1323 }
1324
1325 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1326 return;
1327 }
1328
1329 default:
1330 gcc_unreachable ();
1331 }
1332 }
1333
1334 /* Emit a move from SRC to DEST. Assume that the move expanders can
1335 handle all moves if !can_create_pseudo_p (). The distinction is
1336 important because, unlike emit_move_insn, the move expanders know
1337 how to force Pmode objects into the constant pool even when the
1338 constant pool address is not itself legitimate. */
1339 static rtx
1340 aarch64_emit_move (rtx dest, rtx src)
1341 {
1342 return (can_create_pseudo_p ()
1343 ? emit_move_insn (dest, src)
1344 : emit_move_insn_1 (dest, src));
1345 }
1346
1347 /* Split a 128-bit move operation into two 64-bit move operations,
1348 taking care to handle partial overlap of register to register
1349 copies. Special cases are needed when moving between GP regs and
1350 FP regs. SRC can be a register, constant or memory; DST a register
1351 or memory. If either operand is memory it must not have any side
1352 effects. */
1353 void
1354 aarch64_split_128bit_move (rtx dst, rtx src)
1355 {
1356 rtx dst_lo, dst_hi;
1357 rtx src_lo, src_hi;
1358
1359 machine_mode mode = GET_MODE (dst);
1360
1361 gcc_assert (mode == TImode || mode == TFmode);
1362 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1363 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1364
1365 if (REG_P (dst) && REG_P (src))
1366 {
1367 int src_regno = REGNO (src);
1368 int dst_regno = REGNO (dst);
1369
1370 /* Handle FP <-> GP regs. */
1371 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1372 {
1373 src_lo = gen_lowpart (word_mode, src);
1374 src_hi = gen_highpart (word_mode, src);
1375
1376 if (mode == TImode)
1377 {
1378 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1379 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1380 }
1381 else
1382 {
1383 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1384 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1385 }
1386 return;
1387 }
1388 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1389 {
1390 dst_lo = gen_lowpart (word_mode, dst);
1391 dst_hi = gen_highpart (word_mode, dst);
1392
1393 if (mode == TImode)
1394 {
1395 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1396 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1397 }
1398 else
1399 {
1400 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1401 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1402 }
1403 return;
1404 }
1405 }
1406
1407 dst_lo = gen_lowpart (word_mode, dst);
1408 dst_hi = gen_highpart (word_mode, dst);
1409 src_lo = gen_lowpart (word_mode, src);
1410 src_hi = gen_highpart_mode (word_mode, mode, src);
1411
1412 /* At most one pairing may overlap. */
1413 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1414 {
1415 aarch64_emit_move (dst_hi, src_hi);
1416 aarch64_emit_move (dst_lo, src_lo);
1417 }
1418 else
1419 {
1420 aarch64_emit_move (dst_lo, src_lo);
1421 aarch64_emit_move (dst_hi, src_hi);
1422 }
1423 }
1424
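/* Return true if a 128-bit move from SRC to DST needs to be split;
   only a direct FP-register to FP-register copy can be done without
   splitting.  */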
1425 bool
1426 aarch64_split_128bit_move_p (rtx dst, rtx src)
1427 {
1428 return (! REG_P (src)
1429 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1430 }
1431
1432 /* Split a complex SIMD combine. */
1433
1434 void
1435 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1436 {
1437 machine_mode src_mode = GET_MODE (src1);
1438 machine_mode dst_mode = GET_MODE (dst);
1439
1440 gcc_assert (VECTOR_MODE_P (dst_mode));
1441
1442 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1443 {
1444 rtx (*gen) (rtx, rtx, rtx);
1445
1446 switch (src_mode)
1447 {
1448 case V8QImode:
1449 gen = gen_aarch64_simd_combinev8qi;
1450 break;
1451 case V4HImode:
1452 gen = gen_aarch64_simd_combinev4hi;
1453 break;
1454 case V2SImode:
1455 gen = gen_aarch64_simd_combinev2si;
1456 break;
1457 case V4HFmode:
1458 gen = gen_aarch64_simd_combinev4hf;
1459 break;
1460 case V2SFmode:
1461 gen = gen_aarch64_simd_combinev2sf;
1462 break;
1463 case DImode:
1464 gen = gen_aarch64_simd_combinedi;
1465 break;
1466 case DFmode:
1467 gen = gen_aarch64_simd_combinedf;
1468 break;
1469 default:
1470 gcc_unreachable ();
1471 }
1472
1473 emit_insn (gen (dst, src1, src2));
1474 return;
1475 }
1476 }
1477
1478 /* Split a complex SIMD move. */
1479
1480 void
1481 aarch64_split_simd_move (rtx dst, rtx src)
1482 {
1483 machine_mode src_mode = GET_MODE (src);
1484 machine_mode dst_mode = GET_MODE (dst);
1485
1486 gcc_assert (VECTOR_MODE_P (dst_mode));
1487
1488 if (REG_P (dst) && REG_P (src))
1489 {
1490 rtx (*gen) (rtx, rtx);
1491
1492 gcc_assert (VECTOR_MODE_P (src_mode));
1493
1494 switch (src_mode)
1495 {
1496 case V16QImode:
1497 gen = gen_aarch64_split_simd_movv16qi;
1498 break;
1499 case V8HImode:
1500 gen = gen_aarch64_split_simd_movv8hi;
1501 break;
1502 case V4SImode:
1503 gen = gen_aarch64_split_simd_movv4si;
1504 break;
1505 case V2DImode:
1506 gen = gen_aarch64_split_simd_movv2di;
1507 break;
1508 case V8HFmode:
1509 gen = gen_aarch64_split_simd_movv8hf;
1510 break;
1511 case V4SFmode:
1512 gen = gen_aarch64_split_simd_movv4sf;
1513 break;
1514 case V2DFmode:
1515 gen = gen_aarch64_split_simd_movv2df;
1516 break;
1517 default:
1518 gcc_unreachable ();
1519 }
1520
1521 emit_insn (gen (dst, src));
1522 return;
1523 }
1524 }
1525
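/* Return true if X is equal to Y (of mode YMODE) zero-extended to
   XMODE.  */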
1526 bool
1527 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1528 machine_mode ymode, rtx y)
1529 {
1530 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1531 gcc_assert (r != NULL);
1532 return rtx_equal_p (x, r);
1533 }
1534
1535
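/* Return VALUE in a register of MODE, using X as the destination when
   new pseudos can no longer be created.  */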
1536 static rtx
1537 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1538 {
1539 if (can_create_pseudo_p ())
1540 return force_reg (mode, value);
1541 else
1542 {
1543 x = aarch64_emit_move (x, value);
1544 return x;
1545 }
1546 }
1547
1548
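/* Return an rtx representing REG plus OFFSET in MODE.  If OFFSET is not
   a valid immediate for an add instruction, force it into a register
   first (reusing TEMP when new pseudos cannot be created).  */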
1549 static rtx
1550 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1551 {
1552 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1553 {
1554 rtx high;
1555 /* Load the full offset into a register. This
1556 might be improvable in the future. */
1557 high = GEN_INT (offset);
1558 offset = 0;
1559 high = aarch64_force_temporary (mode, temp, high);
1560 reg = aarch64_force_temporary (mode, temp,
1561 gen_rtx_PLUS (mode, high, reg));
1562 }
1563 return plus_constant (mode, reg, offset);
1564 }
1565
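/* Return the number of instructions required to move immediate IMM of
   MODE into DEST, emitting them as well when GENERATE is true.  */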
1566 static int
1567 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1568 machine_mode mode)
1569 {
1570 int i;
1571 unsigned HOST_WIDE_INT val, val2, mask;
1572 int one_match, zero_match;
1573 int num_insns;
1574
1575 val = INTVAL (imm);
1576
1577 if (aarch64_move_imm (val, mode))
1578 {
1579 if (generate)
1580 emit_insn (gen_rtx_SET (dest, imm));
1581 return 1;
1582 }
1583
1584 if ((val >> 32) == 0 || mode == SImode)
1585 {
1586 if (generate)
1587 {
1588 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1589 if (mode == SImode)
1590 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1591 GEN_INT ((val >> 16) & 0xffff)));
1592 else
1593 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1594 GEN_INT ((val >> 16) & 0xffff)));
1595 }
1596 return 2;
1597 }
1598
1599 /* Remaining cases are all for DImode. */
1600
1601 mask = 0xffff;
1602 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1603 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1604 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1605 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1606
1607 if (zero_match != 2 && one_match != 2)
1608 {
1609 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1610 For a 64-bit bitmask try whether changing 16 bits to all ones or
1611 zeroes creates a valid bitmask. To check any repeated bitmask,
1612 try using 16 bits from the other 32-bit half of val. */
1613
1614 for (i = 0; i < 64; i += 16, mask <<= 16)
1615 {
1616 val2 = val & ~mask;
1617 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1618 break;
1619 val2 = val | mask;
1620 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1621 break;
1622 val2 = val2 & ~mask;
1623 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1624 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1625 break;
1626 }
1627 if (i != 64)
1628 {
1629 if (generate)
1630 {
1631 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1632 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1633 GEN_INT ((val >> i) & 0xffff)));
1634 }
1635 return 2;
1636 }
1637 }
1638
1639 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1640 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1641 otherwise skip zero bits. */
1642
1643 num_insns = 1;
1644 mask = 0xffff;
1645 val2 = one_match > zero_match ? ~val : val;
1646 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1647
1648 if (generate)
1649 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1650 ? (val | ~(mask << i))
1651 : (val & (mask << i)))));
1652 for (i += 16; i < 64; i += 16)
1653 {
1654 if ((val2 & (mask << i)) == 0)
1655 continue;
1656 if (generate)
1657 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1658 GEN_INT ((val >> i) & 0xffff)));
1659 num_insns++;
1660 }
1661
1662 return num_insns;
1663 }
1664
1665
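/* Expand a move of immediate or symbolic constant IMM into register
   DEST, handling symbol references (including GOT and TLS accesses)
   as well as plain integer constants.  */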
1666 void
1667 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1668 {
1669 machine_mode mode = GET_MODE (dest);
1670
1671 gcc_assert (mode == SImode || mode == DImode);
1672
1673 /* Check on what type of symbol it is. */
1674 if (GET_CODE (imm) == SYMBOL_REF
1675 || GET_CODE (imm) == LABEL_REF
1676 || GET_CODE (imm) == CONST)
1677 {
1678 rtx mem, base, offset;
1679 enum aarch64_symbol_type sty;
1680
1681 /* If we have (const (plus symbol offset)), separate out the offset
1682 before we start classifying the symbol. */
1683 split_const (imm, &base, &offset);
1684
1685 sty = aarch64_classify_symbol (base, offset);
1686 switch (sty)
1687 {
1688 case SYMBOL_FORCE_TO_MEM:
1689 if (offset != const0_rtx
1690 && targetm.cannot_force_const_mem (mode, imm))
1691 {
1692 gcc_assert (can_create_pseudo_p ());
1693 base = aarch64_force_temporary (mode, dest, base);
1694 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1695 aarch64_emit_move (dest, base);
1696 return;
1697 }
1698
1699 mem = force_const_mem (ptr_mode, imm);
1700 gcc_assert (mem);
1701
1702 /* If we aren't generating PC relative literals, then
1703 we need to expand the literal pool access carefully.
1704 This is something that needs to be done in a number
1705 of places, so could well live as a separate function. */
1706 if (aarch64_nopcrelative_literal_loads)
1707 {
1708 gcc_assert (can_create_pseudo_p ());
1709 base = gen_reg_rtx (ptr_mode);
1710 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1711 mem = gen_rtx_MEM (ptr_mode, base);
1712 }
1713
1714 if (mode != ptr_mode)
1715 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1716
1717 emit_insn (gen_rtx_SET (dest, mem));
1718
1719 return;
1720
1721 case SYMBOL_SMALL_TLSGD:
1722 case SYMBOL_SMALL_TLSDESC:
1723 case SYMBOL_SMALL_TLSIE:
1724 case SYMBOL_SMALL_GOT_28K:
1725 case SYMBOL_SMALL_GOT_4G:
1726 case SYMBOL_TINY_GOT:
1727 case SYMBOL_TINY_TLSIE:
1728 if (offset != const0_rtx)
1729 {
1730 gcc_assert (can_create_pseudo_p ());
1731 base = aarch64_force_temporary (mode, dest, base);
1732 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1733 aarch64_emit_move (dest, base);
1734 return;
1735 }
1736 /* FALLTHRU */
1737
1738 case SYMBOL_SMALL_ABSOLUTE:
1739 case SYMBOL_TINY_ABSOLUTE:
1740 case SYMBOL_TLSLE12:
1741 case SYMBOL_TLSLE24:
1742 case SYMBOL_TLSLE32:
1743 case SYMBOL_TLSLE48:
1744 aarch64_load_symref_appropriately (dest, imm, sty);
1745 return;
1746
1747 default:
1748 gcc_unreachable ();
1749 }
1750 }
1751
1752 if (!CONST_INT_P (imm))
1753 {
1754 if (GET_CODE (imm) == HIGH)
1755 emit_insn (gen_rtx_SET (dest, imm));
1756 else
1757 {
1758 rtx mem = force_const_mem (mode, imm);
1759 gcc_assert (mem);
1760 emit_insn (gen_rtx_SET (dest, mem));
1761 }
1762
1763 return;
1764 }
1765
1766 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1767 }
1768
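/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */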
1769 static bool
1770 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1771 tree exp ATTRIBUTE_UNUSED)
1772 {
1773 /* Currently, always true. */
1774 return true;
1775 }
1776
1777 /* Implement TARGET_PASS_BY_REFERENCE. */
1778
1779 static bool
1780 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1781 machine_mode mode,
1782 const_tree type,
1783 bool named ATTRIBUTE_UNUSED)
1784 {
1785 HOST_WIDE_INT size;
1786 machine_mode dummymode;
1787 int nregs;
1788
1789 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1790 size = (mode == BLKmode && type)
1791 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1792
1793 /* Aggregates are passed by reference based on their size. */
1794 if (type && AGGREGATE_TYPE_P (type))
1795 {
1796 size = int_size_in_bytes (type);
1797 }
1798
1799 /* Variable sized arguments are always returned by reference. */
1800 if (size < 0)
1801 return true;
1802
1803 /* Can this be a candidate to be passed in fp/simd register(s)? */
1804 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1805 &dummymode, &nregs,
1806 NULL))
1807 return false;
1808
1809 /* Arguments which are variable sized or larger than 2 registers are
1810 passed by reference unless they are a homogeneous floating-point
1811 aggregate. */
1812 return size > 2 * UNITS_PER_WORD;
1813 }
1814
1815 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1816 static bool
1817 aarch64_return_in_msb (const_tree valtype)
1818 {
1819 machine_mode dummy_mode;
1820 int dummy_int;
1821
1822 /* Never happens in little-endian mode. */
1823 if (!BYTES_BIG_ENDIAN)
1824 return false;
1825
1826 /* Only composite types smaller than or equal to 16 bytes can
1827 be potentially returned in registers. */
1828 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1829 || int_size_in_bytes (valtype) <= 0
1830 || int_size_in_bytes (valtype) > 16)
1831 return false;
1832
1833 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1834 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1835 is always passed/returned in the least significant bits of fp/simd
1836 register(s). */
1837 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1838 &dummy_mode, &dummy_int, NULL))
1839 return false;
1840
1841 return true;
1842 }
1843
1844 /* Implement TARGET_FUNCTION_VALUE.
1845 Define how to find the value returned by a function. */
1846
1847 static rtx
1848 aarch64_function_value (const_tree type, const_tree func,
1849 bool outgoing ATTRIBUTE_UNUSED)
1850 {
1851 machine_mode mode;
1852 int unsignedp;
1853 int count;
1854 machine_mode ag_mode;
1855
1856 mode = TYPE_MODE (type);
1857 if (INTEGRAL_TYPE_P (type))
1858 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1859
1860 if (aarch64_return_in_msb (type))
1861 {
1862 HOST_WIDE_INT size = int_size_in_bytes (type);
1863
1864 if (size % UNITS_PER_WORD != 0)
1865 {
1866 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1867 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1868 }
1869 }
1870
1871 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1872 &ag_mode, &count, NULL))
1873 {
1874 if (!aarch64_composite_type_p (type, mode))
1875 {
1876 gcc_assert (count == 1 && mode == ag_mode);
1877 return gen_rtx_REG (mode, V0_REGNUM);
1878 }
1879 else
1880 {
1881 int i;
1882 rtx par;
1883
1884 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1885 for (i = 0; i < count; i++)
1886 {
1887 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1888 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1889 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1890 XVECEXP (par, 0, i) = tmp;
1891 }
1892 return par;
1893 }
1894 }
1895 else
1896 return gen_rtx_REG (mode, R0_REGNUM);
1897 }
1898
1899 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1900 Return true if REGNO is the number of a hard register in which the values
1901 of called function may come back. */
1902
1903 static bool
1904 aarch64_function_value_regno_p (const unsigned int regno)
1905 {
1906 /* Maximum of 16 bytes can be returned in the general registers. Examples
1907 of 16-byte return values are: 128-bit integers and 16-byte small
1908 structures (excluding homogeneous floating-point aggregates). */
1909 if (regno == R0_REGNUM || regno == R1_REGNUM)
1910 return true;
1911
1912 /* Up to four fp/simd registers can return a function value, e.g. a
1913 homogeneous floating-point aggregate having four members. */
1914 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1915 return TARGET_FLOAT;
1916
1917 return false;
1918 }
1919
1920 /* Implement TARGET_RETURN_IN_MEMORY.
1921
1922 If the type T of the result of a function is such that
1923 void func (T arg)
1924 would require that arg be passed as a value in a register (or set of
1925 registers) according to the parameter passing rules, then the result
1926 is returned in the same registers as would be used for such an
1927 argument. */
1928
1929 static bool
1930 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1931 {
1932 HOST_WIDE_INT size;
1933 machine_mode ag_mode;
1934 int count;
1935
1936 if (!AGGREGATE_TYPE_P (type)
1937 && TREE_CODE (type) != COMPLEX_TYPE
1938 && TREE_CODE (type) != VECTOR_TYPE)
1939 /* Simple scalar types are always returned in registers. */
1940 return false;
1941
1942 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1943 type,
1944 &ag_mode,
1945 &count,
1946 NULL))
1947 return false;
1948
1949 /* Types larger than 2 registers are returned in memory. */
1950 size = int_size_in_bytes (type);
1951 return (size < 0 || size > 2 * UNITS_PER_WORD);
1952 }
1953
1954 static bool
1955 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1956 const_tree type, int *nregs)
1957 {
1958 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1959 return aarch64_vfp_is_call_or_return_candidate (mode,
1960 type,
1961 &pcum->aapcs_vfp_rmode,
1962 nregs,
1963 NULL);
1964 }
1965
1966 /* Given MODE and TYPE of a function argument, return the alignment in
1967 bits. The idea is to suppress any stronger alignment requested by
1968 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1969 This is a helper function for local use only. */
1970
1971 static unsigned int
1972 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1973 {
1974 unsigned int alignment;
1975
1976 if (type)
1977 {
1978 if (!integer_zerop (TYPE_SIZE (type)))
1979 {
1980 if (TYPE_MODE (type) == mode)
1981 alignment = TYPE_ALIGN (type);
1982 else
1983 alignment = GET_MODE_ALIGNMENT (mode);
1984 }
1985 else
1986 alignment = 0;
1987 }
1988 else
1989 alignment = GET_MODE_ALIGNMENT (mode);
1990
1991 return alignment;
1992 }
1993
1994 /* Layout a function argument according to the AAPCS64 rules. The rule
1995 numbers refer to the rule numbers in the AAPCS64. */
1996
1997 static void
1998 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1999 const_tree type,
2000 bool named ATTRIBUTE_UNUSED)
2001 {
2002 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2003 int ncrn, nvrn, nregs;
2004 bool allocate_ncrn, allocate_nvrn;
2005 HOST_WIDE_INT size;
2006
2007 /* We need to do this once per argument. */
2008 if (pcum->aapcs_arg_processed)
2009 return;
2010
2011 pcum->aapcs_arg_processed = true;
2012
2013 /* Size in bytes, rounded up to a multiple of 8 bytes. */
2014 size
2015 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2016 UNITS_PER_WORD);
2017
2018 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2019 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2020 mode,
2021 type,
2022 &nregs);
2023
2024 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
2025 reliable. The following code thus handles passing by SIMD/FP registers first. */
2026
2027 nvrn = pcum->aapcs_nvrn;
2028
2029 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2030 and homogeneous short-vector aggregates (HVA). */
2031 if (allocate_nvrn)
2032 {
2033 if (!TARGET_FLOAT)
2034 aarch64_err_no_fpadvsimd (mode, "argument");
2035
2036 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2037 {
2038 pcum->aapcs_nextnvrn = nvrn + nregs;
2039 if (!aarch64_composite_type_p (type, mode))
2040 {
2041 gcc_assert (nregs == 1);
2042 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2043 }
2044 else
2045 {
2046 rtx par;
2047 int i;
2048 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2049 for (i = 0; i < nregs; i++)
2050 {
2051 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2052 V0_REGNUM + nvrn + i);
2053 tmp = gen_rtx_EXPR_LIST
2054 (VOIDmode, tmp,
2055 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2056 XVECEXP (par, 0, i) = tmp;
2057 }
2058 pcum->aapcs_reg = par;
2059 }
2060 return;
2061 }
2062 else
2063 {
2064 /* C.3 NSRN is set to 8. */
2065 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2066 goto on_stack;
2067 }
2068 }
2069
2070 ncrn = pcum->aapcs_ncrn;
2071 nregs = size / UNITS_PER_WORD;
2072
2073 /* C6 - C9, though the sign- and zero-extension semantics are
2074 handled elsewhere. This is the case where the argument fits
2075 entirely in general registers. */
2076 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2077 {
2078 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2079
2080 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2081
2082 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2083 rounded up to the next even number. */
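/* For illustration (an example, not text from the AAPCS64 itself): an
   __int128 argument arriving when NGRN == 1 has 16-byte alignment, so
   NGRN is bumped to 2 and the argument is allocated X2 and X3, leaving
   X1 unused for argument passing.  */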
2084 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2085 {
2086 ++ncrn;
2087 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2088 }
2089 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2090 A reg is still generated for it, but the caller should be smart
2091 enough not to use it. */
2092 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2093 {
2094 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2095 }
2096 else
2097 {
2098 rtx par;
2099 int i;
2100
2101 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2102 for (i = 0; i < nregs; i++)
2103 {
2104 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2105 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2106 GEN_INT (i * UNITS_PER_WORD));
2107 XVECEXP (par, 0, i) = tmp;
2108 }
2109 pcum->aapcs_reg = par;
2110 }
2111
2112 pcum->aapcs_nextncrn = ncrn + nregs;
2113 return;
2114 }
2115
2116 /* C.11 */
2117 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2118
2119 /* The argument is passed on the stack; record the needed number of words for
2120 this argument and align the total size if necessary. */
2121 on_stack:
2122 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2123 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2124 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2125 16 / UNITS_PER_WORD);
2126 return;
2127 }
2128
2129 /* Implement TARGET_FUNCTION_ARG. */
2130
2131 static rtx
2132 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2133 const_tree type, bool named)
2134 {
2135 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2136 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2137
2138 if (mode == VOIDmode)
2139 return NULL_RTX;
2140
2141 aarch64_layout_arg (pcum_v, mode, type, named);
2142 return pcum->aapcs_reg;
2143 }
2144
2145 void
2146 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2147 const_tree fntype ATTRIBUTE_UNUSED,
2148 rtx libname ATTRIBUTE_UNUSED,
2149 const_tree fndecl ATTRIBUTE_UNUSED,
2150 unsigned n_named ATTRIBUTE_UNUSED)
2151 {
2152 pcum->aapcs_ncrn = 0;
2153 pcum->aapcs_nvrn = 0;
2154 pcum->aapcs_nextncrn = 0;
2155 pcum->aapcs_nextnvrn = 0;
2156 pcum->pcs_variant = ARM_PCS_AAPCS64;
2157 pcum->aapcs_reg = NULL_RTX;
2158 pcum->aapcs_arg_processed = false;
2159 pcum->aapcs_stack_words = 0;
2160 pcum->aapcs_stack_size = 0;
2161
2162 if (!TARGET_FLOAT
2163 && fndecl && TREE_PUBLIC (fndecl)
2164 && fntype && fntype != error_mark_node)
2165 {
2166 const_tree type = TREE_TYPE (fntype);
2167 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2168 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2169 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2170 &mode, &nregs, NULL))
2171 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2172 }
2173 return;
2174 }
2175
2176 static void
2177 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2178 machine_mode mode,
2179 const_tree type,
2180 bool named)
2181 {
2182 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2183 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2184 {
2185 aarch64_layout_arg (pcum_v, mode, type, named);
2186 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2187 != (pcum->aapcs_stack_words != 0));
2188 pcum->aapcs_arg_processed = false;
2189 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2190 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2191 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2192 pcum->aapcs_stack_words = 0;
2193 pcum->aapcs_reg = NULL_RTX;
2194 }
2195 }
2196
2197 bool
2198 aarch64_function_arg_regno_p (unsigned regno)
2199 {
2200 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2201 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2202 }
2203
2204 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2205 PARM_BOUNDARY bits of alignment, but will be given anything up
2206 to STACK_BOUNDARY bits if the type requires it. This makes sure
2207 that both before and after the layout of each argument, the Next
2208 Stacked Argument Address (NSAA) will have a minimum alignment of
2209 8 bytes. */
2210
2211 static unsigned int
2212 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2213 {
2214 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2215
2216 if (alignment < PARM_BOUNDARY)
2217 alignment = PARM_BOUNDARY;
2218 if (alignment > STACK_BOUNDARY)
2219 alignment = STACK_BOUNDARY;
2220 return alignment;
2221 }
2222
2223 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2224
2225 Return true if an argument passed on the stack should be padded upwards,
2226 i.e. if the least-significant byte of the stack slot has useful data.
2227
2228 Small aggregate types are placed in the lowest memory address.
2229
2230 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
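/* A rough illustration (not part of the AAPCS64 text): on a big-endian
   target an 'int' stack argument is padded downward, so its value occupies
   the highest-addressed bytes of its 8-byte slot, whereas a 3-byte struct
   is padded upward and starts at the slot's lowest address.  On
   little-endian targets everything starts at the lowest address.  */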
2231
2232 bool
2233 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2234 {
2235 /* On little-endian targets, the least significant byte of every stack
2236 argument is passed at the lowest byte address of the stack slot. */
2237 if (!BYTES_BIG_ENDIAN)
2238 return true;
2239
2240 /* Otherwise, integral, floating-point and pointer types are padded downward:
2241 the least significant byte of a stack argument is passed at the highest
2242 byte address of the stack slot. */
2243 if (type
2244 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2245 || POINTER_TYPE_P (type))
2246 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2247 return false;
2248
2249 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2250 return true;
2251 }
2252
2253 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2254
2255 It specifies the padding for the last (and possibly only)
2256 element of a block move between registers and memory. Viewing
2257 the block as it sits in memory, padding upward means that the
2258 last element is padded after its most significant byte, while
2259 with downward padding the last element is padded on its least
2260 significant byte side.
2261
2262 Small aggregates and small complex types are always padded
2263 upwards.
2264
2265 We don't need to worry about homogeneous floating-point or
2266 short-vector aggregates; their move is not affected by the
2267 padding direction determined here. Regardless of endianness,
2268 each element of such an aggregate is put in the least
2269 significant bits of a fp/simd register.
2270
2271 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2272 register has useful data, and return the opposite if the most
2273 significant byte does. */
2274
2275 bool
2276 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2277 bool first ATTRIBUTE_UNUSED)
2278 {
2279
2280 /* Small composite types are always padded upward. */
2281 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2282 {
2283 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2284 : GET_MODE_SIZE (mode));
2285 if (size < 2 * UNITS_PER_WORD)
2286 return true;
2287 }
2288
2289 /* Otherwise, use the default padding. */
2290 return !BYTES_BIG_ENDIAN;
2291 }
2292
2293 static machine_mode
2294 aarch64_libgcc_cmp_return_mode (void)
2295 {
2296 return SImode;
2297 }
2298
2299 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2300
2301 /* We use the 12-bit shifted immediate arithmetic instructions so values
2302 must be a multiple of (1 << 12), i.e. 4096. */
2303 #define ARITH_FACTOR 4096
2304
2305 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2306 #error Cannot use simple address calculation for stack probing
2307 #endif
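/* Worked example, assuming the default STACK_CHECK_PROBE_INTERVAL_EXP of 12:
   PROBE_INTERVAL is then 1 << 12 = 4096, an exact multiple of ARITH_FACTOR
   (4096), so the #error above does not trigger and the stack adjustments
   used for probing can be formed from the shifted-immediate add/sub forms.  */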
2308
2309 /* The pair of scratch registers used for stack probing. */
2310 #define PROBE_STACK_FIRST_REG 9
2311 #define PROBE_STACK_SECOND_REG 10
2312
2313 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2314 inclusive. These are offsets from the current stack pointer. */
2315
2316 static void
2317 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2318 {
2319 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2320
2321 /* See the same assertion on PROBE_INTERVAL above. */
2322 gcc_assert ((first % ARITH_FACTOR) == 0);
2323
2324 /* See if we have a constant small number of probes to generate. If so,
2325 that's the easy case. */
2326 if (size <= PROBE_INTERVAL)
2327 {
2328 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2329
2330 emit_set_insn (reg1,
2331 plus_constant (ptr_mode,
2332 stack_pointer_rtx, -(first + base)));
2333 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2334 }
2335
2336 /* The run-time loop is made up of 8 insns in the generic case while the
2337 compile-time (unrolled) sequence is made up of 4+2*(n-2) insns for n intervals. */
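  /* For instance, at n = 4 intervals the unrolled form costs
     4 + 2 * (4 - 2) = 8 insns, matching the generic run-time loop, which is
     presumably why unrolling is only used up to 4 * PROBE_INTERVAL.  */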
2338 else if (size <= 4 * PROBE_INTERVAL)
2339 {
2340 HOST_WIDE_INT i, rem;
2341
2342 emit_set_insn (reg1,
2343 plus_constant (ptr_mode,
2344 stack_pointer_rtx,
2345 -(first + PROBE_INTERVAL)));
2346 emit_stack_probe (reg1);
2347
2348 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2349 it exceeds SIZE. If only two probes are needed, this will not
2350 generate any code. Then probe at FIRST + SIZE. */
2351 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2352 {
2353 emit_set_insn (reg1,
2354 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2355 emit_stack_probe (reg1);
2356 }
2357
2358 rem = size - (i - PROBE_INTERVAL);
2359 if (rem > 256)
2360 {
2361 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2362
2363 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2364 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2365 }
2366 else
2367 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2368 }
2369
2370 /* Otherwise, do the same as above, but in a loop. Note that we must be
2371 extra careful with variables wrapping around because we might be at
2372 the very top (or the very bottom) of the address space and we have
2373 to be able to handle this case properly; in particular, we use an
2374 equality test for the loop condition. */
2375 else
2376 {
2377 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2378
2379 /* Step 1: round SIZE to the previous multiple of the interval. */
2380
2381 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2382
2383
2384 /* Step 2: compute initial and final value of the loop counter. */
2385
2386 /* TEST_ADDR = SP + FIRST. */
2387 emit_set_insn (reg1,
2388 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2389
2390 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2391 emit_set_insn (reg2,
2392 plus_constant (ptr_mode, stack_pointer_rtx,
2393 -(first + rounded_size)));
2394
2395
2396 /* Step 3: the loop
2397
2398 do
2399 {
2400 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2401 probe at TEST_ADDR
2402 }
2403 while (TEST_ADDR != LAST_ADDR)
2404
2405 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2406 until it is equal to ROUNDED_SIZE. */
2407
2408 if (ptr_mode == DImode)
2409 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2410 else
2411 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2412
2413
2414 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2415 that SIZE is equal to ROUNDED_SIZE. */
2416
2417 if (size != rounded_size)
2418 {
2419 HOST_WIDE_INT rem = size - rounded_size;
2420
2421 if (rem > 256)
2422 {
2423 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2424
2425 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2426 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2427 }
2428 else
2429 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2430 }
2431 }
2432
2433 /* Make sure nothing is scheduled before we are done. */
2434 emit_insn (gen_blockage ());
2435 }
2436
2437 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2438 absolute addresses. */
2439
2440 const char *
2441 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2442 {
2443 static int labelno = 0;
2444 char loop_lab[32];
2445 rtx xops[2];
2446
2447 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2448
2449 /* Loop. */
2450 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2451
2452 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2453 xops[0] = reg1;
2454 xops[1] = GEN_INT (PROBE_INTERVAL);
2455 output_asm_insn ("sub\t%0, %0, %1", xops);
2456
2457 /* Probe at TEST_ADDR. */
2458 output_asm_insn ("str\txzr, [%0]", xops);
2459
2460 /* Test if TEST_ADDR == LAST_ADDR. */
2461 xops[1] = reg2;
2462 output_asm_insn ("cmp\t%0, %1", xops);
2463
2464 /* Branch. */
2465 fputs ("\tb.ne\t", asm_out_file);
2466 assemble_name_raw (asm_out_file, loop_lab);
2467 fputc ('\n', asm_out_file);
2468
2469 return "";
2470 }
2471
2472 static bool
2473 aarch64_frame_pointer_required (void)
2474 {
2475 /* In aarch64_override_options_after_change
2476 flag_omit_leaf_frame_pointer turns off the frame pointer by
2477 default. Turn it back on now if we've not got a leaf
2478 function. */
2479 if (flag_omit_leaf_frame_pointer
2480 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2481 return true;
2482
2483 return false;
2484 }
2485
2486 /* Mark the registers that need to be saved by the callee and calculate
2487 the size of the callee-saved registers area and frame record (both FP
2488 and LR may be omitted). */
2489 static void
2490 aarch64_layout_frame (void)
2491 {
2492 HOST_WIDE_INT offset = 0;
2493 int regno;
2494
2495 if (reload_completed && cfun->machine->frame.laid_out)
2496 return;
2497
2498 #define SLOT_NOT_REQUIRED (-2)
2499 #define SLOT_REQUIRED (-1)
2500
2501 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2502 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2503
2504 /* First mark all the registers that really need to be saved... */
2505 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2506 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2507
2508 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2509 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2510
2511 /* ... that includes the eh data registers (if needed)... */
2512 if (crtl->calls_eh_return)
2513 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2514 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2515 = SLOT_REQUIRED;
2516
2517 /* ... and any callee saved register that dataflow says is live. */
2518 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2519 if (df_regs_ever_live_p (regno)
2520 && (regno == R30_REGNUM
2521 || !call_used_regs[regno]))
2522 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2523
2524 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2525 if (df_regs_ever_live_p (regno)
2526 && !call_used_regs[regno])
2527 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2528
2529 if (frame_pointer_needed)
2530 {
2531 /* FP and LR are placed in the linkage record. */
2532 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2533 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2534 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2535 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2536 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2537 offset += 2 * UNITS_PER_WORD;
2538 }
2539
2540 /* Now assign stack slots for them. */
2541 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2542 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2543 {
2544 cfun->machine->frame.reg_offset[regno] = offset;
2545 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2546 cfun->machine->frame.wb_candidate1 = regno;
2547 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2548 cfun->machine->frame.wb_candidate2 = regno;
2549 offset += UNITS_PER_WORD;
2550 }
2551
2552 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2553 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2554 {
2555 cfun->machine->frame.reg_offset[regno] = offset;
2556 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2557 cfun->machine->frame.wb_candidate1 = regno;
2558 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2559 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2560 cfun->machine->frame.wb_candidate2 = regno;
2561 offset += UNITS_PER_WORD;
2562 }
2563
2564 cfun->machine->frame.padding0 =
2565 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2566 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2567
2568 cfun->machine->frame.saved_regs_size = offset;
2569
2570 cfun->machine->frame.hard_fp_offset
2571 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2572 + get_frame_size ()
2573 + cfun->machine->frame.saved_regs_size,
2574 STACK_BOUNDARY / BITS_PER_UNIT);
2575
2576 cfun->machine->frame.frame_size
2577 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2578 + crtl->outgoing_args_size,
2579 STACK_BOUNDARY / BITS_PER_UNIT);
2580
2581 cfun->machine->frame.laid_out = true;
2582 }
2583
2584 static bool
2585 aarch64_register_saved_on_entry (int regno)
2586 {
2587 return cfun->machine->frame.reg_offset[regno] >= 0;
2588 }
2589
2590 static unsigned
2591 aarch64_next_callee_save (unsigned regno, unsigned limit)
2592 {
2593 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2594 regno ++;
2595 return regno;
2596 }
2597
2598 static void
2599 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2600 HOST_WIDE_INT adjustment)
2601 {
2602 rtx base_rtx = stack_pointer_rtx;
2603 rtx insn, reg, mem;
2604
2605 reg = gen_rtx_REG (mode, regno);
2606 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2607 plus_constant (Pmode, base_rtx, -adjustment));
2608 mem = gen_rtx_MEM (mode, mem);
2609
2610 insn = emit_move_insn (mem, reg);
2611 RTX_FRAME_RELATED_P (insn) = 1;
2612 }
2613
2614 static rtx
2615 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2616 HOST_WIDE_INT adjustment)
2617 {
2618 switch (mode)
2619 {
2620 case DImode:
2621 return gen_storewb_pairdi_di (base, base, reg, reg2,
2622 GEN_INT (-adjustment),
2623 GEN_INT (UNITS_PER_WORD - adjustment));
2624 case DFmode:
2625 return gen_storewb_pairdf_di (base, base, reg, reg2,
2626 GEN_INT (-adjustment),
2627 GEN_INT (UNITS_PER_WORD - adjustment));
2628 default:
2629 gcc_unreachable ();
2630 }
2631 }
2632
2633 static void
2634 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2635 unsigned regno2, HOST_WIDE_INT adjustment)
2636 {
2637 rtx_insn *insn;
2638 rtx reg1 = gen_rtx_REG (mode, regno1);
2639 rtx reg2 = gen_rtx_REG (mode, regno2);
2640
2641 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2642 reg2, adjustment));
2643 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2644 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2645 RTX_FRAME_RELATED_P (insn) = 1;
2646 }
2647
2648 static rtx
2649 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2650 HOST_WIDE_INT adjustment)
2651 {
2652 switch (mode)
2653 {
2654 case DImode:
2655 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2656 GEN_INT (UNITS_PER_WORD));
2657 case DFmode:
2658 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2659 GEN_INT (UNITS_PER_WORD));
2660 default:
2661 gcc_unreachable ();
2662 }
2663 }
2664
2665 static rtx
2666 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2667 rtx reg2)
2668 {
2669 switch (mode)
2670 {
2671 case DImode:
2672 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2673
2674 case DFmode:
2675 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2676
2677 default:
2678 gcc_unreachable ();
2679 }
2680 }
2681
2682 static rtx
2683 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2684 rtx mem2)
2685 {
2686 switch (mode)
2687 {
2688 case DImode:
2689 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2690
2691 case DFmode:
2692 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2693
2694 default:
2695 gcc_unreachable ();
2696 }
2697 }
2698
2699
2700 static void
2701 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2702 unsigned start, unsigned limit, bool skip_wb)
2703 {
2704 rtx_insn *insn;
2705 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2706 ? gen_frame_mem : gen_rtx_MEM);
2707 unsigned regno;
2708 unsigned regno2;
2709
2710 for (regno = aarch64_next_callee_save (start, limit);
2711 regno <= limit;
2712 regno = aarch64_next_callee_save (regno + 1, limit))
2713 {
2714 rtx reg, mem;
2715 HOST_WIDE_INT offset;
2716
2717 if (skip_wb
2718 && (regno == cfun->machine->frame.wb_candidate1
2719 || regno == cfun->machine->frame.wb_candidate2))
2720 continue;
2721
2722 reg = gen_rtx_REG (mode, regno);
2723 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2724 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2725 offset));
2726
2727 regno2 = aarch64_next_callee_save (regno + 1, limit);
2728
2729 if (regno2 <= limit
2730 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2731 == cfun->machine->frame.reg_offset[regno2]))
2732
2733 {
2734 rtx reg2 = gen_rtx_REG (mode, regno2);
2735 rtx mem2;
2736
2737 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2738 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2739 offset));
2740 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2741 reg2));
2742
2743 /* The first part of a frame-related parallel insn is
2744 always assumed to be relevant to the frame
2745 calculations; subsequent parts are only
2746 frame-related if explicitly marked. */
2747 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2748 regno = regno2;
2749 }
2750 else
2751 insn = emit_move_insn (mem, reg);
2752
2753 RTX_FRAME_RELATED_P (insn) = 1;
2754 }
2755 }
2756
2757 static void
2758 aarch64_restore_callee_saves (machine_mode mode,
2759 HOST_WIDE_INT start_offset, unsigned start,
2760 unsigned limit, bool skip_wb, rtx *cfi_ops)
2761 {
2762 rtx base_rtx = stack_pointer_rtx;
2763 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2764 ? gen_frame_mem : gen_rtx_MEM);
2765 unsigned regno;
2766 unsigned regno2;
2767 HOST_WIDE_INT offset;
2768
2769 for (regno = aarch64_next_callee_save (start, limit);
2770 regno <= limit;
2771 regno = aarch64_next_callee_save (regno + 1, limit))
2772 {
2773 rtx reg, mem;
2774
2775 if (skip_wb
2776 && (regno == cfun->machine->frame.wb_candidate1
2777 || regno == cfun->machine->frame.wb_candidate2))
2778 continue;
2779
2780 reg = gen_rtx_REG (mode, regno);
2781 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2782 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2783
2784 regno2 = aarch64_next_callee_save (regno + 1, limit);
2785
2786 if (regno2 <= limit
2787 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2788 == cfun->machine->frame.reg_offset[regno2]))
2789 {
2790 rtx reg2 = gen_rtx_REG (mode, regno2);
2791 rtx mem2;
2792
2793 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2794 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2795 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2796
2797 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2798 regno = regno2;
2799 }
2800 else
2801 emit_move_insn (reg, mem);
2802 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2803 }
2804 }
2805
2806 /* AArch64 stack frames generated by this compiler look like:
2807
2808 +-------------------------------+
2809 | |
2810 | incoming stack arguments |
2811 | |
2812 +-------------------------------+
2813 | | <-- incoming stack pointer (aligned)
2814 | callee-allocated save area |
2815 | for register varargs |
2816 | |
2817 +-------------------------------+
2818 | local variables | <-- frame_pointer_rtx
2819 | |
2820 +-------------------------------+
2821 | padding0 | \
2822 +-------------------------------+ |
2823 | callee-saved registers | | frame.saved_regs_size
2824 +-------------------------------+ |
2825 | LR' | |
2826 +-------------------------------+ |
2827 | FP' | / <- hard_frame_pointer_rtx (aligned)
2828 +-------------------------------+
2829 | dynamic allocation |
2830 +-------------------------------+
2831 | padding |
2832 +-------------------------------+
2833 | outgoing stack arguments | <-- arg_pointer
2834 | |
2835 +-------------------------------+
2836 | | <-- stack_pointer_rtx (aligned)
2837
2838 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2839 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2840 unchanged. */
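/* A minimal worked example (assuming the usual 16-byte STACK_BOUNDARY): a
   function that needs a frame pointer and saves only FP, LR, X19 and X20,
   with no locals, no varargs save area and no outgoing arguments, gets
   reg_offset[] values of 0, 8, 16 and 24 respectively, so
   saved_regs_size = 32, hard_fp_offset = 32 and frame_size = 32.  */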
2841
2842 /* Generate the prologue instructions for entry into a function.
2843 Establish the stack frame by decreasing the stack pointer with a
2844 properly calculated size and, if necessary, create a frame record
2845 filled with the values of LR and previous frame pointer. The
2846 current FP is also set up if it is in use. */
2847
2848 void
2849 aarch64_expand_prologue (void)
2850 {
2851 /* sub sp, sp, #<frame_size>
2852 stp {fp, lr}, [sp, #<frame_size> - 16]
2853 add fp, sp, #<frame_size> - hardfp_offset
2854 stp {cs_reg}, [fp, #-16] etc.
2855
2856 sub sp, sp, <final_adjustment_if_any>
2857 */
2858 HOST_WIDE_INT frame_size, offset;
2859 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2860 HOST_WIDE_INT hard_fp_offset;
2861 rtx_insn *insn;
2862
2863 aarch64_layout_frame ();
2864
2865 offset = frame_size = cfun->machine->frame.frame_size;
2866 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2867 fp_offset = frame_size - hard_fp_offset;
2868
2869 if (flag_stack_usage_info)
2870 current_function_static_stack_size = frame_size;
2871
2872 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
2873 {
2874 if (crtl->is_leaf && !cfun->calls_alloca)
2875 {
2876 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
2877 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
2878 frame_size - STACK_CHECK_PROTECT);
2879 }
2880 else if (frame_size > 0)
2881 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
2882 }
2883
2884 /* Store pairs and load pairs have a range of only -512 to 504. */
2885 if (offset >= 512)
2886 {
2887 /* When the frame has a large size, the stack pointer is first decreased
2888 to step over the callee-allocated save area for
2889 register varargs, the local variable area and/or the callee-saved
2890 register area. This will allow the pre-index write-back
2891 store pair instructions to be used for setting up the stack frame
2892 efficiently. */
2893 offset = hard_fp_offset;
2894 if (offset >= 512)
2895 offset = cfun->machine->frame.saved_regs_size;
2896
2897 frame_size -= (offset + crtl->outgoing_args_size);
2898 fp_offset = 0;
2899
2900 if (frame_size >= 0x1000000)
2901 {
2902 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2903 emit_move_insn (op0, GEN_INT (-frame_size));
2904 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2905
2906 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2907 gen_rtx_SET (stack_pointer_rtx,
2908 plus_constant (Pmode, stack_pointer_rtx,
2909 -frame_size)));
2910 RTX_FRAME_RELATED_P (insn) = 1;
2911 }
2912 else if (frame_size > 0)
2913 {
2914 int hi_ofs = frame_size & 0xfff000;
2915 int lo_ofs = frame_size & 0x000fff;
2916
2917 if (hi_ofs)
2918 {
2919 insn = emit_insn (gen_add2_insn
2920 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2921 RTX_FRAME_RELATED_P (insn) = 1;
2922 }
2923 if (lo_ofs)
2924 {
2925 insn = emit_insn (gen_add2_insn
2926 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2927 RTX_FRAME_RELATED_P (insn) = 1;
2928 }
2929 }
2930 }
2931 else
2932 frame_size = -1;
2933
2934 if (offset > 0)
2935 {
2936 bool skip_wb = false;
2937
2938 if (frame_pointer_needed)
2939 {
2940 skip_wb = true;
2941
2942 if (fp_offset)
2943 {
2944 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2945 GEN_INT (-offset)));
2946 RTX_FRAME_RELATED_P (insn) = 1;
2947
2948 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2949 R30_REGNUM, false);
2950 }
2951 else
2952 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2953
2954 /* Set up frame pointer to point to the location of the
2955 previous frame pointer on the stack. */
2956 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2957 stack_pointer_rtx,
2958 GEN_INT (fp_offset)));
2959 RTX_FRAME_RELATED_P (insn) = 1;
2960 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2961 }
2962 else
2963 {
2964 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2965 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2966
2967 if (fp_offset
2968 || reg1 == FIRST_PSEUDO_REGISTER
2969 || (reg2 == FIRST_PSEUDO_REGISTER
2970 && offset >= 256))
2971 {
2972 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2973 GEN_INT (-offset)));
2974 RTX_FRAME_RELATED_P (insn) = 1;
2975 }
2976 else
2977 {
2978 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2979
2980 skip_wb = true;
2981
2982 if (reg2 == FIRST_PSEUDO_REGISTER)
2983 aarch64_pushwb_single_reg (mode1, reg1, offset);
2984 else
2985 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2986 }
2987 }
2988
2989 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2990 skip_wb);
2991 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2992 skip_wb);
2993 }
2994
2995 /* when offset >= 512,
2996 sub sp, sp, #<outgoing_args_size> */
2997 if (frame_size > -1)
2998 {
2999 if (crtl->outgoing_args_size > 0)
3000 {
3001 insn = emit_insn (gen_add2_insn
3002 (stack_pointer_rtx,
3003 GEN_INT (- crtl->outgoing_args_size)));
3004 RTX_FRAME_RELATED_P (insn) = 1;
3005 }
3006 }
3007 }
3008
3009 /* Return TRUE if we can use a simple_return insn.
3010
3011 This function checks whether the callee-saved stack is empty, which
3012 means no restore actions are needed. The pro_and_epilogue pass uses
3013 this to check whether the shrink-wrapping optimization is feasible. */
3014
3015 bool
3016 aarch64_use_return_insn_p (void)
3017 {
3018 if (!reload_completed)
3019 return false;
3020
3021 if (crtl->profile)
3022 return false;
3023
3024 aarch64_layout_frame ();
3025
3026 return cfun->machine->frame.frame_size == 0;
3027 }
3028
3029 /* Generate the epilogue instructions for returning from a function. */
3030 void
3031 aarch64_expand_epilogue (bool for_sibcall)
3032 {
3033 HOST_WIDE_INT frame_size, offset;
3034 HOST_WIDE_INT fp_offset;
3035 HOST_WIDE_INT hard_fp_offset;
3036 rtx_insn *insn;
3037 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3038 bool need_barrier_p = (get_frame_size () != 0
3039 || cfun->machine->frame.saved_varargs_size);
3040
3041 aarch64_layout_frame ();
3042
3043 offset = frame_size = cfun->machine->frame.frame_size;
3044 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
3045 fp_offset = frame_size - hard_fp_offset;
3046
3047 /* Store pairs and load pairs have a range of only -512 to 504. */
3048 if (offset >= 512)
3049 {
3050 offset = hard_fp_offset;
3051 if (offset >= 512)
3052 offset = cfun->machine->frame.saved_regs_size;
3053
3054 frame_size -= (offset + crtl->outgoing_args_size);
3055 fp_offset = 0;
3056 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
3057 {
3058 insn = emit_insn (gen_add2_insn
3059 (stack_pointer_rtx,
3060 GEN_INT (crtl->outgoing_args_size)));
3061 RTX_FRAME_RELATED_P (insn) = 1;
3062 }
3063 }
3064 else
3065 frame_size = -1;
3066
3067 /* If there were outgoing arguments or we've done dynamic stack
3068 allocation, then restore the stack pointer from the frame
3069 pointer. This is at most one insn and more efficient than using
3070 GCC's internal mechanism. */
3071 if (frame_pointer_needed
3072 && (crtl->outgoing_args_size || cfun->calls_alloca))
3073 {
3074 if (cfun->calls_alloca)
3075 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3076
3077 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3078 hard_frame_pointer_rtx,
3079 GEN_INT (0)));
3080 offset = offset - fp_offset;
3081 }
3082
3083 if (offset > 0)
3084 {
3085 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3086 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3087 bool skip_wb = true;
3088 rtx cfi_ops = NULL;
3089
3090 if (frame_pointer_needed)
3091 fp_offset = 0;
3092 else if (fp_offset
3093 || reg1 == FIRST_PSEUDO_REGISTER
3094 || (reg2 == FIRST_PSEUDO_REGISTER
3095 && offset >= 256))
3096 skip_wb = false;
3097
3098 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
3099 skip_wb, &cfi_ops);
3100 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
3101 skip_wb, &cfi_ops);
3102
3103 if (need_barrier_p)
3104 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3105
3106 if (skip_wb)
3107 {
3108 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
3109 rtx rreg1 = gen_rtx_REG (mode1, reg1);
3110
3111 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
3112 if (reg2 == FIRST_PSEUDO_REGISTER)
3113 {
3114 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
3115 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3116 mem = gen_rtx_MEM (mode1, mem);
3117 insn = emit_move_insn (rreg1, mem);
3118 }
3119 else
3120 {
3121 rtx rreg2 = gen_rtx_REG (mode1, reg2);
3122
3123 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
3124 insn = emit_insn (aarch64_gen_loadwb_pair
3125 (mode1, stack_pointer_rtx, rreg1,
3126 rreg2, offset));
3127 }
3128 }
3129 else
3130 {
3131 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
3132 GEN_INT (offset)));
3133 }
3134
3135 /* Reset the CFA to be SP + FRAME_SIZE. */
3136 rtx new_cfa = stack_pointer_rtx;
3137 if (frame_size > 0)
3138 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
3139 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3140 REG_NOTES (insn) = cfi_ops;
3141 RTX_FRAME_RELATED_P (insn) = 1;
3142 }
3143
3144 if (frame_size > 0)
3145 {
3146 if (need_barrier_p)
3147 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3148
3149 if (frame_size >= 0x1000000)
3150 {
3151 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3152 emit_move_insn (op0, GEN_INT (frame_size));
3153 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
3154 }
3155 else
3156 {
3157 int hi_ofs = frame_size & 0xfff000;
3158 int lo_ofs = frame_size & 0x000fff;
3159
3160 if (hi_ofs && lo_ofs)
3161 {
3162 insn = emit_insn (gen_add2_insn
3163 (stack_pointer_rtx, GEN_INT (hi_ofs)));
3164 RTX_FRAME_RELATED_P (insn) = 1;
3165 frame_size = lo_ofs;
3166 }
3167 insn = emit_insn (gen_add2_insn
3168 (stack_pointer_rtx, GEN_INT (frame_size)));
3169 }
3170
3171 /* Reset the CFA to be SP + 0. */
3172 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
3173 RTX_FRAME_RELATED_P (insn) = 1;
3174 }
3175
3176 /* Stack adjustment for exception handler. */
3177 if (crtl->calls_eh_return)
3178 {
3179 /* We need to unwind the stack by the offset computed by
3180 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3181 to be SP; letting the CFA move during this adjustment
3182 is just as correct as retaining the CFA from the body
3183 of the function. Therefore, do nothing special. */
3184 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3185 }
3186
3187 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3188 if (!for_sibcall)
3189 emit_jump_insn (ret_rtx);
3190 }
3191
3192 /* Return the place to copy the exception unwinding return address to.
3193 This will probably be a stack slot, but could (in theory) be the
3194 return register. */
3195 rtx
3196 aarch64_final_eh_return_addr (void)
3197 {
3198 HOST_WIDE_INT fp_offset;
3199
3200 aarch64_layout_frame ();
3201
3202 fp_offset = cfun->machine->frame.frame_size
3203 - cfun->machine->frame.hard_fp_offset;
3204
3205 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3206 return gen_rtx_REG (DImode, LR_REGNUM);
3207
3208 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3209 result in a store to save LR introduced by builtin_eh_return () being
3210 incorrectly deleted because the alias is not detected.
3211 So in the calculation of the address to copy the exception unwinding
3212 return address to, we note 2 cases.
3213 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3214 we return a SP-relative location since all the addresses are SP-relative
3215 in this case. This prevents the store from being optimized away.
3216 If the fp_offset is not 0, then the addresses will be FP-relative and
3217 therefore we return a FP-relative location. */
3218
3219 if (frame_pointer_needed)
3220 {
3221 if (fp_offset)
3222 return gen_frame_mem (DImode,
3223 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3224 else
3225 return gen_frame_mem (DImode,
3226 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3227 }
3228
3229 /* If FP is not needed, we calculate the location of LR, which would be
3230 at the top of the saved registers block. */
3231
3232 return gen_frame_mem (DImode,
3233 plus_constant (Pmode,
3234 stack_pointer_rtx,
3235 fp_offset
3236 + cfun->machine->frame.saved_regs_size
3237 - 2 * UNITS_PER_WORD));
3238 }
3239
3240 /* Possibly output code to build up a constant in a register. For
3241 the benefit of the costs infrastructure, returns the number of
3242 instructions which would be emitted. GENERATE inhibits or
3243 enables code generation. */
3244
3245 static int
3246 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
3247 {
3248 int insns = 0;
3249
3250 if (aarch64_bitmask_imm (val, DImode))
3251 {
3252 if (generate)
3253 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
3254 insns = 1;
3255 }
3256 else
3257 {
3258 int i;
3259 int ncount = 0;
3260 int zcount = 0;
3261 HOST_WIDE_INT valp = val >> 16;
3262 HOST_WIDE_INT valm;
3263 HOST_WIDE_INT tval;
3264
3265 for (i = 16; i < 64; i += 16)
3266 {
3267 valm = (valp & 0xffff);
3268
3269 if (valm != 0)
3270 ++ zcount;
3271
3272 if (valm != 0xffff)
3273 ++ ncount;
3274
3275 valp >>= 16;
3276 }
3277
3278 /* zcount contains the number of additional MOVK instructions
3279 required if the constant is built up with an initial MOVZ instruction,
3280 while ncount is the number of MOVK instructions required if starting
3281 with a MOVN instruction. Choose the sequence that yields the fewest
3282 instructions, preferring MOVZ instructions when both counts are
3283 the same. */
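      /* For illustration (values chosen arbitrarily): for
	 0xffffffffffff1234 the three upper 16-bit chunks are all 0xffff,
	 so ncount == 0 and zcount == 3 and the single initial MOVN-style
	 move suffices (insns == 1); for 0x1234 the upper chunks are all
	 zero, so zcount == 0 and ncount == 3 and a single MOVZ-style move
	 is emitted instead.  */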
3284 if (ncount < zcount)
3285 {
3286 if (generate)
3287 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3288 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3289 tval = 0xffff;
3290 insns++;
3291 }
3292 else
3293 {
3294 if (generate)
3295 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3296 GEN_INT (val & 0xffff));
3297 tval = 0;
3298 insns++;
3299 }
3300
3301 val >>= 16;
3302
3303 for (i = 16; i < 64; i += 16)
3304 {
3305 if ((val & 0xffff) != tval)
3306 {
3307 if (generate)
3308 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3309 GEN_INT (i),
3310 GEN_INT (val & 0xffff)));
3311 insns++;
3312 }
3313 val >>= 16;
3314 }
3315 }
3316 return insns;
3317 }
3318
3319 static void
3320 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3321 {
3322 HOST_WIDE_INT mdelta = delta;
3323 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3324 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3325
3326 if (mdelta < 0)
3327 mdelta = -mdelta;
3328
3329 if (mdelta >= 4096 * 4096)
3330 {
3331 (void) aarch64_build_constant (scratchreg, delta, true);
3332 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3333 }
3334 else if (mdelta > 0)
3335 {
3336 if (mdelta >= 4096)
3337 {
3338 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3339 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3340 if (delta < 0)
3341 emit_insn (gen_rtx_SET (this_rtx,
3342 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3343 else
3344 emit_insn (gen_rtx_SET (this_rtx,
3345 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3346 }
3347 if (mdelta % 4096 != 0)
3348 {
3349 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3350 emit_insn (gen_rtx_SET (this_rtx,
3351 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3352 }
3353 }
3354 }
3355
3356 /* Output code to add DELTA to the first argument, and then jump
3357 to FUNCTION. Used for C++ multiple inheritance. */
3358 static void
3359 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3360 HOST_WIDE_INT delta,
3361 HOST_WIDE_INT vcall_offset,
3362 tree function)
3363 {
3364 /* The this pointer is always in x0. Note that this differs from
3365 Arm where the this pointer may be bumped to r1 if r0 is required
3366 to return a pointer to an aggregate. On AArch64 a result value
3367 pointer will be in x8. */
3368 int this_regno = R0_REGNUM;
3369 rtx this_rtx, temp0, temp1, addr, funexp;
3370 rtx_insn *insn;
3371
3372 reload_completed = 1;
3373 emit_note (NOTE_INSN_PROLOGUE_END);
3374
3375 if (vcall_offset == 0)
3376 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3377 else
3378 {
3379 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3380
3381 this_rtx = gen_rtx_REG (Pmode, this_regno);
3382 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3383 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3384
3385 addr = this_rtx;
3386 if (delta != 0)
3387 {
3388 if (delta >= -256 && delta < 256)
3389 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3390 plus_constant (Pmode, this_rtx, delta));
3391 else
3392 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3393 }
3394
3395 if (Pmode == ptr_mode)
3396 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3397 else
3398 aarch64_emit_move (temp0,
3399 gen_rtx_ZERO_EXTEND (Pmode,
3400 gen_rtx_MEM (ptr_mode, addr)));
3401
3402 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3403 addr = plus_constant (Pmode, temp0, vcall_offset);
3404 else
3405 {
3406 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3407 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3408 }
3409
3410 if (Pmode == ptr_mode)
3411 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3412 else
3413 aarch64_emit_move (temp1,
3414 gen_rtx_SIGN_EXTEND (Pmode,
3415 gen_rtx_MEM (ptr_mode, addr)));
3416
3417 emit_insn (gen_add2_insn (this_rtx, temp1));
3418 }
3419
3420 /* Generate a tail call to the target function. */
3421 if (!TREE_USED (function))
3422 {
3423 assemble_external (function);
3424 TREE_USED (function) = 1;
3425 }
3426 funexp = XEXP (DECL_RTL (function), 0);
3427 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3428 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3429 SIBLING_CALL_P (insn) = 1;
3430
3431 insn = get_insns ();
3432 shorten_branches (insn);
3433 final_start_function (insn, file, 1);
3434 final (insn, file, 1);
3435 final_end_function ();
3436
3437 /* Stop pretending to be a post-reload pass. */
3438 reload_completed = 0;
3439 }
3440
3441 static bool
3442 aarch64_tls_referenced_p (rtx x)
3443 {
3444 if (!TARGET_HAVE_TLS)
3445 return false;
3446 subrtx_iterator::array_type array;
3447 FOR_EACH_SUBRTX (iter, array, x, ALL)
3448 {
3449 const_rtx x = *iter;
3450 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3451 return true;
3452 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3453 TLS offsets, not real symbol references. */
3454 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3455 iter.skip_subrtxes ();
3456 }
3457 return false;
3458 }
3459
3460
3461 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3462 a left shift of 0 or 12 bits. */
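/* For example (illustrative values): 0x123 and 0x123000 both satisfy this
   (shift 0 and shift 12 respectively), while 0x123400 does not, because its
   set bits span both halves.  */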
3463 bool
3464 aarch64_uimm12_shift (HOST_WIDE_INT val)
3465 {
3466 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3467 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3468 );
3469 }
3470
3471
3472 /* Return true if val is an immediate that can be loaded into a
3473 register by a MOVZ instruction. */
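/* For example (illustrative values): 0x12340000 is accepted for DImode,
   since its only non-zero 16-bit chunk sits at bit 16, whereas 0x12340001
   is rejected because two chunks would have to be set.  */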
3474 static bool
3475 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3476 {
3477 if (GET_MODE_SIZE (mode) > 4)
3478 {
3479 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3480 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3481 return 1;
3482 }
3483 else
3484 {
3485 /* Ignore sign extension. */
3486 val &= (HOST_WIDE_INT) 0xffffffff;
3487 }
3488 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3489 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3490 }
3491
3492 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3493
3494 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3495 {
3496 0x0000000100000001ull,
3497 0x0001000100010001ull,
3498 0x0101010101010101ull,
3499 0x1111111111111111ull,
3500 0x5555555555555555ull,
3501 };
3502
3503
3504 /* Return true if val is a valid bitmask immediate. */
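/* For example (illustrative DImode values): 0x00ff00ff00ff00ff is valid,
   being a run of 8 ones replicated every 16 bits, and so is
   0x0ffffffffffffff0 (a single contiguous run), whereas 0x1234 is not,
   since its set bits do not form a repeated contiguous run.  */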
3505
3506 bool
3507 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3508 {
3509 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3510 int bits;
3511
3512 /* Check for a single sequence of one bits and return quickly if so.
3513 The special cases of all ones and all zeroes return false. */
3514 val = (unsigned HOST_WIDE_INT) val_in;
3515 tmp = val + (val & -val);
3516
3517 if (tmp == (tmp & -tmp))
3518 return (val + 1) > 1;
3519
3520 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3521 if (mode == SImode)
3522 val = (val << 32) | (val & 0xffffffff);
3523
3524 /* Invert if the immediate doesn't start with a zero bit - this means we
3525 only need to search for sequences of one bits. */
3526 if (val & 1)
3527 val = ~val;
3528
3529 /* Find the first set bit and set tmp to val with the first sequence of one
3530 bits removed. Return success if there is a single sequence of ones. */
3531 first_one = val & -val;
3532 tmp = val & (val + first_one);
3533
3534 if (tmp == 0)
3535 return true;
3536
3537 /* Find the next set bit and compute the difference in bit position. */
3538 next_one = tmp & -tmp;
3539 bits = clz_hwi (first_one) - clz_hwi (next_one);
3540 mask = val ^ tmp;
3541
3542 /* Check that the bit position difference is a power of 2, and that the first
3543 sequence of one bits fits within 'bits' bits. */
3544 if ((mask >> bits) != 0 || bits != (bits & -bits))
3545 return false;
3546
3547 /* Check the sequence of one bits is repeated 64/bits times. */
3548 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3549 }
3550
3551
3552 /* Return true if val is an immediate that can be loaded into a
3553 register in a single instruction. */
3554 bool
3555 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3556 {
3557 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3558 return 1;
3559 return aarch64_bitmask_imm (val, mode);
3560 }
3561
3562 static bool
3563 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3564 {
3565 rtx base, offset;
3566
3567 if (GET_CODE (x) == HIGH)
3568 return true;
3569
3570 split_const (x, &base, &offset);
3571 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3572 {
3573 if (aarch64_classify_symbol (base, offset)
3574 != SYMBOL_FORCE_TO_MEM)
3575 return true;
3576 else
3577 /* Avoid generating a 64-bit relocation in ILP32; leave it
3578 to aarch64_expand_mov_immediate to handle properly. */
3579 return mode != ptr_mode;
3580 }
3581
3582 return aarch64_tls_referenced_p (x);
3583 }
3584
3585 /* Implement TARGET_CASE_VALUES_THRESHOLD. */
3586
3587 static unsigned int
3588 aarch64_case_values_threshold (void)
3589 {
3590 /* Use the specified limit for the number of cases before using jump
3591 tables at higher optimization levels. */
3592 if (optimize > 2
3593 && selected_cpu->tune->max_case_values != 0)
3594 return selected_cpu->tune->max_case_values;
3595 else
3596 return default_case_values_threshold ();
3597 }
3598
3599 /* Return true if register REGNO is a valid index register.
3600 STRICT_P is true if REG_OK_STRICT is in effect. */
3601
3602 bool
3603 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3604 {
3605 if (!HARD_REGISTER_NUM_P (regno))
3606 {
3607 if (!strict_p)
3608 return true;
3609
3610 if (!reg_renumber)
3611 return false;
3612
3613 regno = reg_renumber[regno];
3614 }
3615 return GP_REGNUM_P (regno);
3616 }
3617
3618 /* Return true if register REGNO is a valid base register.
3619 STRICT_P is true if REG_OK_STRICT is in effect. */
3620
3621 bool
3622 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3623 {
3624 if (!HARD_REGISTER_NUM_P (regno))
3625 {
3626 if (!strict_p)
3627 return true;
3628
3629 if (!reg_renumber)
3630 return false;
3631
3632 regno = reg_renumber[regno];
3633 }
3634
3635 /* The fake registers will be eliminated to either the stack or
3636 hard frame pointer, both of which are usually valid base registers.
3637 Reload deals with the cases where the eliminated form isn't valid. */
3638 return (GP_REGNUM_P (regno)
3639 || regno == SP_REGNUM
3640 || regno == FRAME_POINTER_REGNUM
3641 || regno == ARG_POINTER_REGNUM);
3642 }
3643
3644 /* Return true if X is a valid base register.
3645 STRICT_P is true if REG_OK_STRICT is in effect. */
3646
3647 static bool
3648 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3649 {
3650 if (!strict_p && GET_CODE (x) == SUBREG)
3651 x = SUBREG_REG (x);
3652
3653 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3654 }
3655
3656 /* Return true if address offset is a valid index. If it is, fill in INFO
3657 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
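/* For instance (an illustrative case, not an exhaustive list): for a DImode
   access, an index of the form (mult:DI (reg:DI) (const_int 8)), i.e. the
   "[Xn, Xm, lsl #3]" addressing form, is classified as ADDRESS_REG_REG
   with shift 3.  */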
3658
3659 static bool
3660 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3661 machine_mode mode, bool strict_p)
3662 {
3663 enum aarch64_address_type type;
3664 rtx index;
3665 int shift;
3666
3667 /* (reg:P) */
3668 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3669 && GET_MODE (x) == Pmode)
3670 {
3671 type = ADDRESS_REG_REG;
3672 index = x;
3673 shift = 0;
3674 }
3675 /* (sign_extend:DI (reg:SI)) */
3676 else if ((GET_CODE (x) == SIGN_EXTEND
3677 || GET_CODE (x) == ZERO_EXTEND)
3678 && GET_MODE (x) == DImode
3679 && GET_MODE (XEXP (x, 0)) == SImode)
3680 {
3681 type = (GET_CODE (x) == SIGN_EXTEND)
3682 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3683 index = XEXP (x, 0);
3684 shift = 0;
3685 }
3686 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3687 else if (GET_CODE (x) == MULT
3688 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3689 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3690 && GET_MODE (XEXP (x, 0)) == DImode
3691 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3692 && CONST_INT_P (XEXP (x, 1)))
3693 {
3694 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3695 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3696 index = XEXP (XEXP (x, 0), 0);
3697 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3698 }
3699 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3700 else if (GET_CODE (x) == ASHIFT
3701 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3702 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3703 && GET_MODE (XEXP (x, 0)) == DImode
3704 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3705 && CONST_INT_P (XEXP (x, 1)))
3706 {
3707 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3708 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3709 index = XEXP (XEXP (x, 0), 0);
3710 shift = INTVAL (XEXP (x, 1));
3711 }
3712 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3713 else if ((GET_CODE (x) == SIGN_EXTRACT
3714 || GET_CODE (x) == ZERO_EXTRACT)
3715 && GET_MODE (x) == DImode
3716 && GET_CODE (XEXP (x, 0)) == MULT
3717 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3718 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3719 {
3720 type = (GET_CODE (x) == SIGN_EXTRACT)
3721 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3722 index = XEXP (XEXP (x, 0), 0);
3723 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3724 if (INTVAL (XEXP (x, 1)) != 32 + shift
3725 || INTVAL (XEXP (x, 2)) != 0)
3726 shift = -1;
3727 }
3728 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3729 (const_int 0xffffffff<<shift)) */
3730 else if (GET_CODE (x) == AND
3731 && GET_MODE (x) == DImode
3732 && GET_CODE (XEXP (x, 0)) == MULT
3733 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3734 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3735 && CONST_INT_P (XEXP (x, 1)))
3736 {
3737 type = ADDRESS_REG_UXTW;
3738 index = XEXP (XEXP (x, 0), 0);
3739 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3740 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3741 shift = -1;
3742 }
3743 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3744 else if ((GET_CODE (x) == SIGN_EXTRACT
3745 || GET_CODE (x) == ZERO_EXTRACT)
3746 && GET_MODE (x) == DImode
3747 && GET_CODE (XEXP (x, 0)) == ASHIFT
3748 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3749 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3750 {
3751 type = (GET_CODE (x) == SIGN_EXTRACT)
3752 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3753 index = XEXP (XEXP (x, 0), 0);
3754 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3755 if (INTVAL (XEXP (x, 1)) != 32 + shift
3756 || INTVAL (XEXP (x, 2)) != 0)
3757 shift = -1;
3758 }
3759 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3760 (const_int 0xffffffff<<shift)) */
3761 else if (GET_CODE (x) == AND
3762 && GET_MODE (x) == DImode
3763 && GET_CODE (XEXP (x, 0)) == ASHIFT
3764 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3765 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3766 && CONST_INT_P (XEXP (x, 1)))
3767 {
3768 type = ADDRESS_REG_UXTW;
3769 index = XEXP (XEXP (x, 0), 0);
3770 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3771 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3772 shift = -1;
3773 }
3774 /* (mult:P (reg:P) (const_int scale)) */
3775 else if (GET_CODE (x) == MULT
3776 && GET_MODE (x) == Pmode
3777 && GET_MODE (XEXP (x, 0)) == Pmode
3778 && CONST_INT_P (XEXP (x, 1)))
3779 {
3780 type = ADDRESS_REG_REG;
3781 index = XEXP (x, 0);
3782 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3783 }
3784 /* (ashift:P (reg:P) (const_int shift)) */
3785 else if (GET_CODE (x) == ASHIFT
3786 && GET_MODE (x) == Pmode
3787 && GET_MODE (XEXP (x, 0)) == Pmode
3788 && CONST_INT_P (XEXP (x, 1)))
3789 {
3790 type = ADDRESS_REG_REG;
3791 index = XEXP (x, 0);
3792 shift = INTVAL (XEXP (x, 1));
3793 }
3794 else
3795 return false;
3796
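  /* For example, (ashift:DI (sign_extend:DI (reg:SI)) (const_int 2)) is
     classified above as ADDRESS_REG_SXTW with a shift of 2; the checks below
     then accept it only for a 4-byte access, since (1 << shift) must equal
     GET_MODE_SIZE (mode).  */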
3797 if (GET_CODE (index) == SUBREG)
3798 index = SUBREG_REG (index);
3799
3800 if ((shift == 0 ||
3801 (shift > 0 && shift <= 3
3802 && (1 << shift) == GET_MODE_SIZE (mode)))
3803 && REG_P (index)
3804 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3805 {
3806 info->type = type;
3807 info->offset = index;
3808 info->shift = shift;
3809 return true;
3810 }
3811
3812 return false;
3813 }
3814
3815 bool
3816 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3817 {
3818 return (offset >= -64 * GET_MODE_SIZE (mode)
3819 && offset < 64 * GET_MODE_SIZE (mode)
3820 && offset % GET_MODE_SIZE (mode) == 0);
3821 }
3822
3823 static inline bool
3824 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3825 HOST_WIDE_INT offset)
3826 {
3827 return offset >= -256 && offset < 256;
3828 }
3829
3830 static inline bool
3831 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3832 {
3833 return (offset >= 0
3834 && offset < 4096 * GET_MODE_SIZE (mode)
3835 && offset % GET_MODE_SIZE (mode) == 0);
3836 }
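
/* Worked examples of the three offset predicates above, for a DImode
   access (GET_MODE_SIZE == 8):
     7-bit signed scaled:     multiples of 8 in [-512, 504]
     9-bit signed unscaled:   any offset in [-256, 255]
     12-bit unsigned scaled:  multiples of 8 in [0, 32760].  */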
3837
3838 /* Return true if MODE is one of the modes for which we
3839 support LDP/STP operations. */
3840
3841 static bool
3842 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3843 {
3844 return mode == SImode || mode == DImode
3845 || mode == SFmode || mode == DFmode
3846 || (aarch64_vector_mode_supported_p (mode)
3847 && GET_MODE_SIZE (mode) == 8);
3848 }
3849
3850 /* Return true if X is a valid address for machine mode MODE. If it is,
3851 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3852 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3853
3854 static bool
3855 aarch64_classify_address (struct aarch64_address_info *info,
3856 rtx x, machine_mode mode,
3857 RTX_CODE outer_code, bool strict_p)
3858 {
3859 enum rtx_code code = GET_CODE (x);
3860 rtx op0, op1;
3861
3862 /* On BE, we use load/store pair for all large int mode load/stores. */
3863 bool load_store_pair_p = (outer_code == PARALLEL
3864 || (BYTES_BIG_ENDIAN
3865 && aarch64_vect_struct_mode_p (mode)));
3866
3867 bool allow_reg_index_p =
3868 !load_store_pair_p
3869 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3870 && !aarch64_vect_struct_mode_p (mode);
3871
3872 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3873 REG addressing. */
3874 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3875 && (code != POST_INC && code != REG))
3876 return false;
3877
3878 switch (code)
3879 {
3880 case REG:
3881 case SUBREG:
3882 info->type = ADDRESS_REG_IMM;
3883 info->base = x;
3884 info->offset = const0_rtx;
3885 return aarch64_base_register_rtx_p (x, strict_p);
3886
3887 case PLUS:
3888 op0 = XEXP (x, 0);
3889 op1 = XEXP (x, 1);
3890
3891 if (! strict_p
3892 && REG_P (op0)
3893 && (op0 == virtual_stack_vars_rtx
3894 || op0 == frame_pointer_rtx
3895 || op0 == arg_pointer_rtx)
3896 && CONST_INT_P (op1))
3897 {
3898 info->type = ADDRESS_REG_IMM;
3899 info->base = op0;
3900 info->offset = op1;
3901
3902 return true;
3903 }
3904
3905 if (GET_MODE_SIZE (mode) != 0
3906 && CONST_INT_P (op1)
3907 && aarch64_base_register_rtx_p (op0, strict_p))
3908 {
3909 HOST_WIDE_INT offset = INTVAL (op1);
3910
3911 info->type = ADDRESS_REG_IMM;
3912 info->base = op0;
3913 info->offset = op1;
3914
3915 /* TImode and TFmode values are allowed in both pairs of X
3916 registers and individual Q registers. The available
3917 address modes are:
3918 X,X: 7-bit signed scaled offset
3919 Q: 9-bit signed offset
3920 We conservatively require an offset representable in both modes.
3921 */
3922 if (mode == TImode || mode == TFmode)
3923 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3924 && offset_9bit_signed_unscaled_p (mode, offset));
3925
3926 /* A 7-bit offset check because OImode will emit an ldp/stp
3927 instruction (only big endian will get here).
3928 For ldp/stp instructions, the offset is scaled for the size of a
3929 single element of the pair. */
3930 if (mode == OImode)
3931 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3932
3933 /* Three 9/12-bit offset checks because CImode will emit three
3934 ldr/str instructions (only big endian will get here). */
3935 if (mode == CImode)
3936 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3937 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3938 || offset_12bit_unsigned_scaled_p (V16QImode,
3939 offset + 32)));
3940
3941 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3942 instructions (only big endian will get here). */
3943 if (mode == XImode)
3944 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3945 && aarch64_offset_7bit_signed_scaled_p (TImode,
3946 offset + 32));
3947
3948 if (load_store_pair_p)
3949 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3950 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3951 else
3952 return (offset_9bit_signed_unscaled_p (mode, offset)
3953 || offset_12bit_unsigned_scaled_p (mode, offset));
3954 }
3955
3956 if (allow_reg_index_p)
3957 {
3958 /* Look for base + (scaled/extended) index register. */
3959 if (aarch64_base_register_rtx_p (op0, strict_p)
3960 && aarch64_classify_index (info, op1, mode, strict_p))
3961 {
3962 info->base = op0;
3963 return true;
3964 }
3965 if (aarch64_base_register_rtx_p (op1, strict_p)
3966 && aarch64_classify_index (info, op0, mode, strict_p))
3967 {
3968 info->base = op1;
3969 return true;
3970 }
3971 }
3972
3973 return false;
3974
3975 case POST_INC:
3976 case POST_DEC:
3977 case PRE_INC:
3978 case PRE_DEC:
3979 info->type = ADDRESS_REG_WB;
3980 info->base = XEXP (x, 0);
3981 info->offset = NULL_RTX;
3982 return aarch64_base_register_rtx_p (info->base, strict_p);
3983
3984 case POST_MODIFY:
3985 case PRE_MODIFY:
3986 info->type = ADDRESS_REG_WB;
3987 info->base = XEXP (x, 0);
3988 if (GET_CODE (XEXP (x, 1)) == PLUS
3989 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3990 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3991 && aarch64_base_register_rtx_p (info->base, strict_p))
3992 {
3993 HOST_WIDE_INT offset;
3994 info->offset = XEXP (XEXP (x, 1), 1);
3995 offset = INTVAL (info->offset);
3996
3997 /* TImode and TFmode values are allowed in both pairs of X
3998 registers and individual Q registers. The available
3999 address modes are:
4000 X,X: 7-bit signed scaled offset
4001 Q: 9-bit signed offset
4002 We conservatively require an offset representable in both modes.
4003 */
4004 if (mode == TImode || mode == TFmode)
4005 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4006 && offset_9bit_signed_unscaled_p (mode, offset));
4007
4008 if (load_store_pair_p)
4009 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4010 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4011 else
4012 return offset_9bit_signed_unscaled_p (mode, offset);
4013 }
4014 return false;
4015
4016 case CONST:
4017 case SYMBOL_REF:
4018 case LABEL_REF:
4019 /* load literal: pc-relative constant pool entry. Only supported
4020 for SI mode or larger. */
4021 info->type = ADDRESS_SYMBOLIC;
4022
4023 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4024 {
4025 rtx sym, addend;
4026
4027 split_const (x, &sym, &addend);
4028 return ((GET_CODE (sym) == LABEL_REF
4029 || (GET_CODE (sym) == SYMBOL_REF
4030 && CONSTANT_POOL_ADDRESS_P (sym)
4031 && !aarch64_nopcrelative_literal_loads)));
4032 }
4033 return false;
4034
4035 case LO_SUM:
4036 info->type = ADDRESS_LO_SUM;
4037 info->base = XEXP (x, 0);
4038 info->offset = XEXP (x, 1);
4039 if (allow_reg_index_p
4040 && aarch64_base_register_rtx_p (info->base, strict_p))
4041 {
4042 rtx sym, offs;
4043 split_const (info->offset, &sym, &offs);
4044 if (GET_CODE (sym) == SYMBOL_REF
4045 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4046 {
4047 /* The symbol and offset must be aligned to the access size. */
4048 unsigned int align;
4049 unsigned int ref_size;
4050
4051 if (CONSTANT_POOL_ADDRESS_P (sym))
4052 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4053 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4054 {
4055 tree exp = SYMBOL_REF_DECL (sym);
4056 align = TYPE_ALIGN (TREE_TYPE (exp));
4057 align = CONSTANT_ALIGNMENT (exp, align);
4058 }
4059 else if (SYMBOL_REF_DECL (sym))
4060 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4061 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4062 && SYMBOL_REF_BLOCK (sym) != NULL)
4063 align = SYMBOL_REF_BLOCK (sym)->alignment;
4064 else
4065 align = BITS_PER_UNIT;
4066
4067 ref_size = GET_MODE_SIZE (mode);
4068 if (ref_size == 0)
4069 ref_size = GET_MODE_SIZE (DImode);
4070
4071 return ((INTVAL (offs) & (ref_size - 1)) == 0
4072 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4073 }
4074 }
4075 return false;
4076
4077 default:
4078 return false;
4079 }
4080 }
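
/* For instance, for a DImode access outside a load/store pair,
   (plus:DI (reg:DI) (const_int 8)) is classified above as ADDRESS_REG_IMM
   (8 satisfies both the 9-bit unscaled and the 12-bit scaled checks), while
   (plus:DI (reg:DI) (mult:DI (reg:DI) (const_int 8))) is classified as
   ADDRESS_REG_REG with a shift of 3.  */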
4081
4082 bool
4083 aarch64_symbolic_address_p (rtx x)
4084 {
4085 rtx offset;
4086
4087 split_const (x, &x, &offset);
4088 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4089 }
4090
4091 /* Classify the base of symbolic expression X. */
4092
4093 enum aarch64_symbol_type
4094 aarch64_classify_symbolic_expression (rtx x)
4095 {
4096 rtx offset;
4097
4098 split_const (x, &x, &offset);
4099 return aarch64_classify_symbol (x, offset);
4100 }
4101
4102
4103 /* Return TRUE if X is a legitimate address for accessing memory in
4104 mode MODE. */
4105 static bool
4106 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4107 {
4108 struct aarch64_address_info addr;
4109
4110 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4111 }
4112
4113 /* Return TRUE if X is a legitimate address for accessing memory in
4114 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4115 pair operation. */
4116 bool
4117 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4118 RTX_CODE outer_code, bool strict_p)
4119 {
4120 struct aarch64_address_info addr;
4121
4122 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4123 }
4124
4125 /* Return TRUE if rtx X is the immediate constant 0.0. */
4126 bool
4127 aarch64_float_const_zero_rtx_p (rtx x)
4128 {
4129 if (GET_MODE (x) == VOIDmode)
4130 return false;
4131
4132 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4133 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4134 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4135 }
4136
4137 /* Return the fixed registers used for condition codes. */
4138
4139 static bool
4140 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4141 {
4142 *p1 = CC_REGNUM;
4143 *p2 = INVALID_REGNUM;
4144 return true;
4145 }
4146
4147 /* Emit call insn with PAT and do aarch64-specific handling. */
4148
4149 void
4150 aarch64_emit_call_insn (rtx pat)
4151 {
4152 rtx insn = emit_call_insn (pat);
4153
4154 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4155 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4156 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4157 }
4158
4159 machine_mode
4160 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4161 {
4162 /* All floating point compares return CCFP if it is an equality
4163 comparison, and CCFPE otherwise. */
4164 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4165 {
4166 switch (code)
4167 {
4168 case EQ:
4169 case NE:
4170 case UNORDERED:
4171 case ORDERED:
4172 case UNLT:
4173 case UNLE:
4174 case UNGT:
4175 case UNGE:
4176 case UNEQ:
4177 case LTGT:
4178 return CCFPmode;
4179
4180 case LT:
4181 case LE:
4182 case GT:
4183 case GE:
4184 return CCFPEmode;
4185
4186 default:
4187 gcc_unreachable ();
4188 }
4189 }
4190
4191 /* Equality comparisons of short modes against zero can be performed
4192 using the TST instruction with the appropriate bitmask. */
4193 if (y == const0_rtx && REG_P (x)
4194 && (code == EQ || code == NE)
4195 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4196 return CC_NZmode;
4197
4198 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4199 && y == const0_rtx
4200 && (code == EQ || code == NE || code == LT || code == GE)
4201 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4202 || GET_CODE (x) == NEG
4203 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4204 && CONST_INT_P (XEXP (x, 2)))))
4205 return CC_NZmode;
4206
4207 /* A compare with a shifted operand. Because of canonicalization,
4208 the comparison will have to be swapped when we emit the assembly
4209 code. */
4210 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4211 && (REG_P (y) || GET_CODE (y) == SUBREG)
4212 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4213 || GET_CODE (x) == LSHIFTRT
4214 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4215 return CC_SWPmode;
4216
4217 /* Similarly for a negated operand, but we can only do this for
4218 equalities. */
4219 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4220 && (REG_P (y) || GET_CODE (y) == SUBREG)
4221 && (code == EQ || code == NE)
4222 && GET_CODE (x) == NEG)
4223 return CC_Zmode;
4224
4225 /* A compare of a mode narrower than SI mode against zero can be done
4226 by extending the value in the comparison. */
4227 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
4228 && y == const0_rtx)
4229 /* Only use sign-extension if we really need it. */
4230 return ((code == GT || code == GE || code == LE || code == LT)
4231 ? CC_SESWPmode : CC_ZESWPmode);
4232
4233 /* A test for unsigned overflow. */
4234 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4235 && code == NE
4236 && GET_CODE (x) == PLUS
4237 && GET_CODE (y) == ZERO_EXTEND)
4238 return CC_Cmode;
4239
4240 /* For everything else, return CCmode. */
4241 return CCmode;
4242 }
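
/* For example, an EQ or NE test of a QImode or HImode register against zero
   selects CC_NZmode above (so it can be implemented with TST), while a
   compare whose first operand is a shift selects CC_SWPmode, recording that
   the operands must be swapped when the assembly is emitted.  */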
4243
4244 static int
4245 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4246
4247 int
4248 aarch64_get_condition_code (rtx x)
4249 {
4250 machine_mode mode = GET_MODE (XEXP (x, 0));
4251 enum rtx_code comp_code = GET_CODE (x);
4252
4253 if (GET_MODE_CLASS (mode) != MODE_CC)
4254 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4255 return aarch64_get_condition_code_1 (mode, comp_code);
4256 }
4257
4258 static int
4259 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4260 {
4261 switch (mode)
4262 {
4263 case CCFPmode:
4264 case CCFPEmode:
4265 switch (comp_code)
4266 {
4267 case GE: return AARCH64_GE;
4268 case GT: return AARCH64_GT;
4269 case LE: return AARCH64_LS;
4270 case LT: return AARCH64_MI;
4271 case NE: return AARCH64_NE;
4272 case EQ: return AARCH64_EQ;
4273 case ORDERED: return AARCH64_VC;
4274 case UNORDERED: return AARCH64_VS;
4275 case UNLT: return AARCH64_LT;
4276 case UNLE: return AARCH64_LE;
4277 case UNGT: return AARCH64_HI;
4278 case UNGE: return AARCH64_PL;
4279 default: return -1;
4280 }
4281 break;
4282
4283 case CCmode:
4284 switch (comp_code)
4285 {
4286 case NE: return AARCH64_NE;
4287 case EQ: return AARCH64_EQ;
4288 case GE: return AARCH64_GE;
4289 case GT: return AARCH64_GT;
4290 case LE: return AARCH64_LE;
4291 case LT: return AARCH64_LT;
4292 case GEU: return AARCH64_CS;
4293 case GTU: return AARCH64_HI;
4294 case LEU: return AARCH64_LS;
4295 case LTU: return AARCH64_CC;
4296 default: return -1;
4297 }
4298 break;
4299
4300 case CC_SWPmode:
4301 case CC_ZESWPmode:
4302 case CC_SESWPmode:
4303 switch (comp_code)
4304 {
4305 case NE: return AARCH64_NE;
4306 case EQ: return AARCH64_EQ;
4307 case GE: return AARCH64_LE;
4308 case GT: return AARCH64_LT;
4309 case LE: return AARCH64_GE;
4310 case LT: return AARCH64_GT;
4311 case GEU: return AARCH64_LS;
4312 case GTU: return AARCH64_CC;
4313 case LEU: return AARCH64_CS;
4314 case LTU: return AARCH64_HI;
4315 default: return -1;
4316 }
4317 break;
4318
4319 case CC_NZmode:
4320 switch (comp_code)
4321 {
4322 case NE: return AARCH64_NE;
4323 case EQ: return AARCH64_EQ;
4324 case GE: return AARCH64_PL;
4325 case LT: return AARCH64_MI;
4326 default: return -1;
4327 }
4328 break;
4329
4330 case CC_Zmode:
4331 switch (comp_code)
4332 {
4333 case NE: return AARCH64_NE;
4334 case EQ: return AARCH64_EQ;
4335 default: return -1;
4336 }
4337 break;
4338
4339 case CC_Cmode:
4340 switch (comp_code)
4341 {
4342 case NE: return AARCH64_CS;
4343 case EQ: return AARCH64_CC;
4344 default: return -1;
4345 }
4346 break;
4347
4348 default:
4349 return -1;
4350 break;
4351 }
4352
4353 return -1;
4354 }
4355
4356 bool
4357 aarch64_const_vec_all_same_in_range_p (rtx x,
4358 HOST_WIDE_INT minval,
4359 HOST_WIDE_INT maxval)
4360 {
4361 HOST_WIDE_INT firstval;
4362 int count, i;
4363
4364 if (GET_CODE (x) != CONST_VECTOR
4365 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4366 return false;
4367
4368 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4369 if (firstval < minval || firstval > maxval)
4370 return false;
4371
4372 count = CONST_VECTOR_NUNITS (x);
4373 for (i = 1; i < count; i++)
4374 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4375 return false;
4376
4377 return true;
4378 }
4379
4380 bool
4381 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4382 {
4383 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4384 }
4385
4386
4387 /* N Z C V. */
4388 #define AARCH64_CC_V 1
4389 #define AARCH64_CC_C (1 << 1)
4390 #define AARCH64_CC_Z (1 << 2)
4391 #define AARCH64_CC_N (1 << 3)
4392
4393 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4394 static const int aarch64_nzcv_codes[] =
4395 {
4396 0, /* EQ, Z == 1. */
4397 AARCH64_CC_Z, /* NE, Z == 0. */
4398 0, /* CS, C == 1. */
4399 AARCH64_CC_C, /* CC, C == 0. */
4400 0, /* MI, N == 1. */
4401 AARCH64_CC_N, /* PL, N == 0. */
4402 0, /* VS, V == 1. */
4403 AARCH64_CC_V, /* VC, V == 0. */
4404 0, /* HI, C == 1 && Z == 0. */
4405 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4406 AARCH64_CC_V, /* GE, N == V. */
4407 0, /* LT, N != V. */
4408 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4409 0, /* LE, !(Z == 0 && N == V). */
4410 0, /* AL, Any. */
4411 0 /* NV, Any. */
4412 };
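
/* As an example of the table above, aarch64_nzcv_codes[AARCH64_EQ] is 0 and
   aarch64_nzcv_codes[AARCH64_NE] is AARCH64_CC_Z (4); these are the values
   the '%k' operand modifier below prints as the nzcv immediate of a
   conditional compare.  */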
4413
4414 static void
4415 aarch64_print_operand (FILE *f, rtx x, int code)
4416 {
4417 switch (code)
4418 {
4419 /* An integer or symbol address without a preceding # sign. */
4420 case 'c':
4421 switch (GET_CODE (x))
4422 {
4423 case CONST_INT:
4424 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4425 break;
4426
4427 case SYMBOL_REF:
4428 output_addr_const (f, x);
4429 break;
4430
4431 case CONST:
4432 if (GET_CODE (XEXP (x, 0)) == PLUS
4433 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4434 {
4435 output_addr_const (f, x);
4436 break;
4437 }
4438 /* Fall through. */
4439
4440 default:
4441 output_operand_lossage ("Unsupported operand for code '%c'", code);
4442 }
4443 break;
4444
4445 case 'e':
4446 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4447 {
4448 int n;
4449
4450 if (!CONST_INT_P (x)
4451 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4452 {
4453 output_operand_lossage ("invalid operand for '%%%c'", code);
4454 return;
4455 }
4456
4457 switch (n)
4458 {
4459 case 3:
4460 fputc ('b', f);
4461 break;
4462 case 4:
4463 fputc ('h', f);
4464 break;
4465 case 5:
4466 fputc ('w', f);
4467 break;
4468 default:
4469 output_operand_lossage ("invalid operand for '%%%c'", code);
4470 return;
4471 }
4472 }
4473 break;
4474
4475 case 'p':
4476 {
4477 int n;
4478
4479 /* Print N such that 2^N == X. */
4480 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4481 {
4482 output_operand_lossage ("invalid operand for '%%%c'", code);
4483 return;
4484 }
4485
4486 asm_fprintf (f, "%d", n);
4487 }
4488 break;
4489
4490 case 'P':
4491 /* Print the number of non-zero bits in X (a const_int). */
4492 if (!CONST_INT_P (x))
4493 {
4494 output_operand_lossage ("invalid operand for '%%%c'", code);
4495 return;
4496 }
4497
4498 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4499 break;
4500
4501 case 'H':
4502 /* Print the higher numbered register of a pair (TImode) of regs. */
4503 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4504 {
4505 output_operand_lossage ("invalid operand for '%%%c'", code);
4506 return;
4507 }
4508
4509 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4510 break;
4511
4512 case 'M':
4513 case 'm':
4514 {
4515 int cond_code;
4516 /* Print a condition (eq, ne, etc) or its inverse. */
4517
4518 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4519 if (x == const_true_rtx)
4520 {
4521 if (code == 'M')
4522 fputs ("nv", f);
4523 return;
4524 }
4525
4526 if (!COMPARISON_P (x))
4527 {
4528 output_operand_lossage ("invalid operand for '%%%c'", code);
4529 return;
4530 }
4531
4532 cond_code = aarch64_get_condition_code (x);
4533 gcc_assert (cond_code >= 0);
4534 if (code == 'M')
4535 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4536 fputs (aarch64_condition_codes[cond_code], f);
4537 }
4538 break;
4539
4540 case 'b':
4541 case 'h':
4542 case 's':
4543 case 'd':
4544 case 'q':
4545 /* Print a scalar FP/SIMD register name. */
4546 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4547 {
4548 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4549 return;
4550 }
4551 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4552 break;
4553
4554 case 'S':
4555 case 'T':
4556 case 'U':
4557 case 'V':
4558 /* Print the first FP/SIMD register name in a list. */
4559 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4560 {
4561 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4562 return;
4563 }
4564 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4565 break;
4566
4567 case 'R':
4568 /* Print a scalar FP/SIMD register name + 1. */
4569 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4570 {
4571 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4572 return;
4573 }
4574 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4575 break;
4576
4577 case 'X':
4578 /* Print bottom 16 bits of integer constant in hex. */
4579 if (!CONST_INT_P (x))
4580 {
4581 output_operand_lossage ("invalid operand for '%%%c'", code);
4582 return;
4583 }
4584 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4585 break;
4586
4587 case 'w':
4588 case 'x':
4589 /* Print a general register name or the zero register (32-bit or
4590 64-bit). */
4591 if (x == const0_rtx
4592 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4593 {
4594 asm_fprintf (f, "%czr", code);
4595 break;
4596 }
4597
4598 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4599 {
4600 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4601 break;
4602 }
4603
4604 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4605 {
4606 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4607 break;
4608 }
4609
4610 /* Fall through */
4611
4612 case 0:
4613 /* Print a normal operand, if it's a general register, then we
4614 assume DImode. */
4615 if (x == NULL)
4616 {
4617 output_operand_lossage ("missing operand");
4618 return;
4619 }
4620
4621 switch (GET_CODE (x))
4622 {
4623 case REG:
4624 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4625 break;
4626
4627 case MEM:
4628 output_address (GET_MODE (x), XEXP (x, 0));
4629 break;
4630
4631 case CONST:
4632 case LABEL_REF:
4633 case SYMBOL_REF:
4634 output_addr_const (asm_out_file, x);
4635 break;
4636
4637 case CONST_INT:
4638 asm_fprintf (f, "%wd", INTVAL (x));
4639 break;
4640
4641 case CONST_VECTOR:
4642 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4643 {
4644 gcc_assert (
4645 aarch64_const_vec_all_same_in_range_p (x,
4646 HOST_WIDE_INT_MIN,
4647 HOST_WIDE_INT_MAX));
4648 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4649 }
4650 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4651 {
4652 fputc ('0', f);
4653 }
4654 else
4655 gcc_unreachable ();
4656 break;
4657
4658 case CONST_DOUBLE:
4659 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4660 be getting CONST_DOUBLEs holding integers. */
4661 gcc_assert (GET_MODE (x) != VOIDmode);
4662 if (aarch64_float_const_zero_rtx_p (x))
4663 {
4664 fputc ('0', f);
4665 break;
4666 }
4667 else if (aarch64_float_const_representable_p (x))
4668 {
4669 #define buf_size 20
4670 char float_buf[buf_size] = {'\0'};
4671 real_to_decimal_for_mode (float_buf,
4672 CONST_DOUBLE_REAL_VALUE (x),
4673 buf_size, buf_size,
4674 1, GET_MODE (x));
4675 asm_fprintf (asm_out_file, "%s", float_buf);
4676 break;
4677 #undef buf_size
4678 }
4679 output_operand_lossage ("invalid constant");
4680 return;
4681 default:
4682 output_operand_lossage ("invalid operand");
4683 return;
4684 }
4685 break;
4686
4687 case 'A':
4688 if (GET_CODE (x) == HIGH)
4689 x = XEXP (x, 0);
4690
4691 switch (aarch64_classify_symbolic_expression (x))
4692 {
4693 case SYMBOL_SMALL_GOT_4G:
4694 asm_fprintf (asm_out_file, ":got:");
4695 break;
4696
4697 case SYMBOL_SMALL_TLSGD:
4698 asm_fprintf (asm_out_file, ":tlsgd:");
4699 break;
4700
4701 case SYMBOL_SMALL_TLSDESC:
4702 asm_fprintf (asm_out_file, ":tlsdesc:");
4703 break;
4704
4705 case SYMBOL_SMALL_TLSIE:
4706 asm_fprintf (asm_out_file, ":gottprel:");
4707 break;
4708
4709 case SYMBOL_TLSLE24:
4710 asm_fprintf (asm_out_file, ":tprel:");
4711 break;
4712
4713 case SYMBOL_TINY_GOT:
4714 gcc_unreachable ();
4715 break;
4716
4717 default:
4718 break;
4719 }
4720 output_addr_const (asm_out_file, x);
4721 break;
4722
4723 case 'L':
4724 switch (aarch64_classify_symbolic_expression (x))
4725 {
4726 case SYMBOL_SMALL_GOT_4G:
4727 asm_fprintf (asm_out_file, ":lo12:");
4728 break;
4729
4730 case SYMBOL_SMALL_TLSGD:
4731 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4732 break;
4733
4734 case SYMBOL_SMALL_TLSDESC:
4735 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4736 break;
4737
4738 case SYMBOL_SMALL_TLSIE:
4739 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4740 break;
4741
4742 case SYMBOL_TLSLE12:
4743 asm_fprintf (asm_out_file, ":tprel_lo12:");
4744 break;
4745
4746 case SYMBOL_TLSLE24:
4747 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4748 break;
4749
4750 case SYMBOL_TINY_GOT:
4751 asm_fprintf (asm_out_file, ":got:");
4752 break;
4753
4754 case SYMBOL_TINY_TLSIE:
4755 asm_fprintf (asm_out_file, ":gottprel:");
4756 break;
4757
4758 default:
4759 break;
4760 }
4761 output_addr_const (asm_out_file, x);
4762 break;
4763
4764 case 'G':
4765
4766 switch (aarch64_classify_symbolic_expression (x))
4767 {
4768 case SYMBOL_TLSLE24:
4769 asm_fprintf (asm_out_file, ":tprel_hi12:");
4770 break;
4771 default:
4772 break;
4773 }
4774 output_addr_const (asm_out_file, x);
4775 break;
4776
4777 case 'k':
4778 {
4779 HOST_WIDE_INT cond_code;
4780 /* Print nzcv. */
4781
4782 if (!CONST_INT_P (x))
4783 {
4784 output_operand_lossage ("invalid operand for '%%%c'", code);
4785 return;
4786 }
4787
4788 cond_code = INTVAL (x);
4789 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4790 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4791 }
4792 break;
4793
4794 default:
4795 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4796 return;
4797 }
4798 }
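
/* A few examples of the modifiers handled above: '%w0' and '%x0' print a
   general register as "w<n>"/"x<n>" (or "wzr"/"xzr" for a zero constant and
   "wsp"/"sp" for the stack pointer); '%d0' prints an FP/SIMD register as
   "d<n>"; '%X0' prints the low 16 bits of a constant in hex; '%m0' and '%M0'
   print a condition or its inverse, e.g. "eq" or "ne".  */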
4799
4800 static void
4801 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4802 {
4803 struct aarch64_address_info addr;
4804
4805 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4806 switch (addr.type)
4807 {
4808 case ADDRESS_REG_IMM:
4809 if (addr.offset == const0_rtx)
4810 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4811 else
4812 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4813 INTVAL (addr.offset));
4814 return;
4815
4816 case ADDRESS_REG_REG:
4817 if (addr.shift == 0)
4818 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4819 reg_names [REGNO (addr.offset)]);
4820 else
4821 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4822 reg_names [REGNO (addr.offset)], addr.shift);
4823 return;
4824
4825 case ADDRESS_REG_UXTW:
4826 if (addr.shift == 0)
4827 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4828 REGNO (addr.offset) - R0_REGNUM);
4829 else
4830 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4831 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4832 return;
4833
4834 case ADDRESS_REG_SXTW:
4835 if (addr.shift == 0)
4836 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4837 REGNO (addr.offset) - R0_REGNUM);
4838 else
4839 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4840 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4841 return;
4842
4843 case ADDRESS_REG_WB:
4844 switch (GET_CODE (x))
4845 {
4846 case PRE_INC:
4847 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4848 GET_MODE_SIZE (mode));
4849 return;
4850 case POST_INC:
4851 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4852 GET_MODE_SIZE (mode));
4853 return;
4854 case PRE_DEC:
4855 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4856 GET_MODE_SIZE (mode));
4857 return;
4858 case POST_DEC:
4859 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4860 GET_MODE_SIZE (mode));
4861 return;
4862 case PRE_MODIFY:
4863 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4864 INTVAL (addr.offset));
4865 return;
4866 case POST_MODIFY:
4867 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4868 INTVAL (addr.offset));
4869 return;
4870 default:
4871 break;
4872 }
4873 break;
4874
4875 case ADDRESS_LO_SUM:
4876 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4877 output_addr_const (f, addr.offset);
4878 asm_fprintf (f, "]");
4879 return;
4880
4881 case ADDRESS_SYMBOLIC:
4882 break;
4883 }
4884
4885 output_addr_const (f, x);
4886 }
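
/* Examples of the address syntax produced above: "[x0]" and "[x0, 16]" for
   ADDRESS_REG_IMM, "[x0, x1, lsl 3]" for a scaled ADDRESS_REG_REG,
   "[x0, w1, sxtw 2]" for ADDRESS_REG_SXTW, "[x0, 8]!" and "[x0], 8" for the
   pre/post writeback forms, and "[x0, #:lo12:sym]" for ADDRESS_LO_SUM
   (register numbers and the symbol are illustrative).  */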
4887
4888 bool
4889 aarch64_label_mentioned_p (rtx x)
4890 {
4891 const char *fmt;
4892 int i;
4893
4894 if (GET_CODE (x) == LABEL_REF)
4895 return true;
4896
4897 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4898 referencing instruction, but they are constant offsets, not
4899 symbols. */
4900 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4901 return false;
4902
4903 fmt = GET_RTX_FORMAT (GET_CODE (x));
4904 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4905 {
4906 if (fmt[i] == 'E')
4907 {
4908 int j;
4909
4910 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4911 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4912 return 1;
4913 }
4914 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4915 return 1;
4916 }
4917
4918 return 0;
4919 }
4920
4921 /* Implement REGNO_REG_CLASS. */
4922
4923 enum reg_class
4924 aarch64_regno_regclass (unsigned regno)
4925 {
4926 if (GP_REGNUM_P (regno))
4927 return GENERAL_REGS;
4928
4929 if (regno == SP_REGNUM)
4930 return STACK_REG;
4931
4932 if (regno == FRAME_POINTER_REGNUM
4933 || regno == ARG_POINTER_REGNUM)
4934 return POINTER_REGS;
4935
4936 if (FP_REGNUM_P (regno))
4937 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4938
4939 return NO_REGS;
4940 }
4941
4942 static rtx
4943 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4944 {
4945 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4946 where mask is selected by alignment and size of the offset.
4947 We try to pick as large a range for the offset as possible to
4948 maximize the chance of a CSE. However, for aligned addresses
4949 we limit the range to 4k so that structures with different sized
4950 elements are likely to use the same base. We need to be careful
4951 not to split a CONST for some forms of address expression, otherwise
4952 it will generate sub-optimal code. */
4953
4954 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4955 {
4956 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4957 HOST_WIDE_INT base_offset;
4958
4959 if (GET_CODE (XEXP (x, 0)) == PLUS)
4960 {
4961 rtx op0 = XEXP (XEXP (x, 0), 0);
4962 rtx op1 = XEXP (XEXP (x, 0), 1);
4963
4964 /* Address expressions of the form Ra + Rb + CONST.
4965
4966 If CONST is within the range supported by the addressing
4967 mode "reg+offset", do not split CONST and use the
4968 sequence
4969 Rt = Ra + Rb;
4970 addr = Rt + CONST. */
4971 if (REG_P (op0) && REG_P (op1))
4972 {
4973 machine_mode addr_mode = GET_MODE (x);
4974 rtx base = gen_reg_rtx (addr_mode);
4975 rtx addr = plus_constant (addr_mode, base, offset);
4976
4977 if (aarch64_legitimate_address_hook_p (mode, addr, false))
4978 {
4979 emit_insn (gen_adddi3 (base, op0, op1));
4980 return addr;
4981 }
4982 }
4983 /* Address expressions of the form Ra + Rb<<SCALE + CONST.
4984
4985 If Reg + Rb<<SCALE is a valid address expression, do not
4986 split CONST and use the sequence
4987 Rc = CONST;
4988 Rt = Ra + Rc;
4989 addr = Rt + Rb<<SCALE.
4990
4991 TODO: We really should split CONST out of memory reference
4992 because:
4993 a) We depend on GIMPLE optimizers to pick up common sub
4994 expression involving the scaling operation.
4995 b) The index Rb is likely a loop iv, it's better to split
4996 the CONST so that computation of new base Rt is a loop
4997 invariant and can be moved out of loop. This is more
4998 important when the original base Ra is sfp related.
4999
5000 Unfortunately, GIMPLE optimizers (e.g., SLSR) cannot handle this
5001 kind of CSE opportunity at the time of this change, so for now we
5002 have to force the register scaling expression out of the memory ref. */
5003 else if (REG_P (op0) || REG_P (op1))
5004 {
5005 machine_mode addr_mode = GET_MODE (x);
5006 rtx base = gen_reg_rtx (addr_mode);
5007
5008 /* Switch to make sure that register is in op0. */
5009 if (REG_P (op1))
5010 std::swap (op0, op1);
5011
5012 rtx addr = plus_constant (addr_mode, base, offset);
5013
5014 if (aarch64_legitimate_address_hook_p (mode, addr, false))
5015 {
5016 base = force_operand (gen_rtx_PLUS (addr_mode, op1, op0),
5017 NULL_RTX);
5018 return plus_constant (addr_mode, base, offset);
5019 }
5020 }
5021 }
5022
5023 /* Does it look like we'll need a load/store-pair operation? */
5024 if (GET_MODE_SIZE (mode) > 16
5025 || mode == TImode)
5026 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5027 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5028 /* For offsets that aren't a multiple of the access size, the limit is
5029 -256...255. */
5030 else if (offset & (GET_MODE_SIZE (mode) - 1))
5031 base_offset = (offset + 0x100) & ~0x1ff;
5032 else
5033 base_offset = offset & ~0xfff;
5034
5035 if (base_offset == 0)
5036 return x;
5037
5038 offset -= base_offset;
5039 rtx base_reg = gen_reg_rtx (Pmode);
5040 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
5041 NULL_RTX);
5042 emit_move_insn (base_reg, val);
5043 x = plus_constant (Pmode, base_reg, offset);
5044 }
5045
5046 return x;
5047 }
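
/* As a worked example of the splitting above: an SImode access at
   (plus (reg) (const_int 0x12344)) has an offset that is a multiple of the
   access size, so base_offset is 0x12344 & ~0xfff == 0x12000; the anchor
   reg + 0x12000 is forced into a new base register and the memory reference
   keeps the residual offset 0x344, which fits the 12-bit scaled form.  */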
5048
5049 /* Try a machine-dependent way of reloading an illegitimate address
5050 operand. If we find one, push the reload and return the new rtx. */
5051
5052 rtx
5053 aarch64_legitimize_reload_address (rtx *x_p,
5054 machine_mode mode,
5055 int opnum, int type,
5056 int ind_levels ATTRIBUTE_UNUSED)
5057 {
5058 rtx x = *x_p;
5059
5060 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
5061 if (aarch64_vect_struct_mode_p (mode)
5062 && GET_CODE (x) == PLUS
5063 && REG_P (XEXP (x, 0))
5064 && CONST_INT_P (XEXP (x, 1)))
5065 {
5066 rtx orig_rtx = x;
5067 x = copy_rtx (x);
5068 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
5069 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5070 opnum, (enum reload_type) type);
5071 return x;
5072 }
5073
5074 /* We must recognize output that we have already generated ourselves. */
5075 if (GET_CODE (x) == PLUS
5076 && GET_CODE (XEXP (x, 0)) == PLUS
5077 && REG_P (XEXP (XEXP (x, 0), 0))
5078 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5079 && CONST_INT_P (XEXP (x, 1)))
5080 {
5081 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5082 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5083 opnum, (enum reload_type) type);
5084 return x;
5085 }
5086
5087 /* We wish to handle large displacements off a base register by splitting
5088 the addend across an add and the mem insn. This can cut the number of
5089 extra insns needed from 3 to 1. It is only useful for load/store of a
5090 single register with 12 bit offset field. */
5091 if (GET_CODE (x) == PLUS
5092 && REG_P (XEXP (x, 0))
5093 && CONST_INT_P (XEXP (x, 1))
5094 && HARD_REGISTER_P (XEXP (x, 0))
5095 && mode != TImode
5096 && mode != TFmode
5097 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
5098 {
5099 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
5100 HOST_WIDE_INT low = val & 0xfff;
5101 HOST_WIDE_INT high = val - low;
5102 HOST_WIDE_INT offs;
5103 rtx cst;
5104 machine_mode xmode = GET_MODE (x);
5105
5106 /* In ILP32, xmode can be either DImode or SImode. */
5107 gcc_assert (xmode == DImode || xmode == SImode);
5108
5109 /* Do not handle BLKmode offsets here, since we cannot ascertain
5110 BLKmode alignment. */
5111 if (GET_MODE_SIZE (mode) == 0)
5112 return NULL_RTX;
5113
5114 offs = low % GET_MODE_SIZE (mode);
5115
5116 /* Align misaligned offset by adjusting high part to compensate. */
5117 if (offs != 0)
5118 {
5119 if (aarch64_uimm12_shift (high + offs))
5120 {
5121 /* Align down. */
5122 low = low - offs;
5123 high = high + offs;
5124 }
5125 else
5126 {
5127 /* Align up. */
5128 offs = GET_MODE_SIZE (mode) - offs;
5129 low = low + offs;
5130 high = high + (low & 0x1000) - offs;
5131 low &= 0xfff;
5132 }
5133 }
5134
5135 /* Check for overflow. */
5136 if (high + low != val)
5137 return NULL_RTX;
5138
5139 cst = GEN_INT (high);
5140 if (!aarch64_uimm12_shift (high))
5141 cst = force_const_mem (xmode, cst);
5142
5143 /* Reload high part into base reg, leaving the low part
5144 in the mem instruction.
5145 Note that replacing this gen_rtx_PLUS with plus_constant is
5146 wrong in this case because we rely on the
5147 (plus (plus reg c1) c2) structure being preserved so that
5148 XEXP (*p, 0) in push_reload below uses the correct term. */
5149 x = gen_rtx_PLUS (xmode,
5150 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
5151 GEN_INT (low));
5152
5153 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5154 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
5155 opnum, (enum reload_type) type);
5156 return x;
5157 }
5158
5159 return NULL_RTX;
5160 }
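
/* For instance, an SImode access at (plus (reg) (const_int 0x12344)) is
   split above into low == 0x344 and high == 0x12000; high is a valid
   shifted 12-bit immediate, so the inner PLUS is reloaded into a base
   register while the memory reference keeps the offset 0x344.  */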
5161
5162
5163 /* Return the reload icode required for a constant pool in mode. */
5164 static enum insn_code
5165 aarch64_constant_pool_reload_icode (machine_mode mode)
5166 {
5167 switch (mode)
5168 {
5169 case SFmode:
5170 return CODE_FOR_aarch64_reload_movcpsfdi;
5171
5172 case DFmode:
5173 return CODE_FOR_aarch64_reload_movcpdfdi;
5174
5175 case TFmode:
5176 return CODE_FOR_aarch64_reload_movcptfdi;
5177
5178 case V8QImode:
5179 return CODE_FOR_aarch64_reload_movcpv8qidi;
5180
5181 case V16QImode:
5182 return CODE_FOR_aarch64_reload_movcpv16qidi;
5183
5184 case V4HImode:
5185 return CODE_FOR_aarch64_reload_movcpv4hidi;
5186
5187 case V8HImode:
5188 return CODE_FOR_aarch64_reload_movcpv8hidi;
5189
5190 case V2SImode:
5191 return CODE_FOR_aarch64_reload_movcpv2sidi;
5192
5193 case V4SImode:
5194 return CODE_FOR_aarch64_reload_movcpv4sidi;
5195
5196 case V2DImode:
5197 return CODE_FOR_aarch64_reload_movcpv2didi;
5198
5199 case V2DFmode:
5200 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5201
5202 default:
5203 gcc_unreachable ();
5204 }
5205
5206 gcc_unreachable ();
5207 }
5208 static reg_class_t
5209 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5210 reg_class_t rclass,
5211 machine_mode mode,
5212 secondary_reload_info *sri)
5213 {
5214
5215 /* If we have to disable direct literal pool loads and stores because the
5216 function is too big, then we need a scratch register. */
5217 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5218 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5219 || targetm.vector_mode_supported_p (GET_MODE (x)))
5220 && aarch64_nopcrelative_literal_loads)
5221 {
5222 sri->icode = aarch64_constant_pool_reload_icode (mode);
5223 return NO_REGS;
5224 }
5225
5226 /* Without the TARGET_SIMD instructions we cannot move a Q register
5227 to a Q register directly. We need a scratch. */
5228 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5229 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5230 && reg_class_subset_p (rclass, FP_REGS))
5231 {
5232 if (mode == TFmode)
5233 sri->icode = CODE_FOR_aarch64_reload_movtf;
5234 else if (mode == TImode)
5235 sri->icode = CODE_FOR_aarch64_reload_movti;
5236 return NO_REGS;
5237 }
5238
5239 /* A TFmode or TImode memory access should be handled via FP_REGS
5240 because AArch64 has richer addressing modes for LDR/STR instructions
5241 than LDP/STP instructions. */
5242 if (TARGET_FLOAT && rclass == GENERAL_REGS
5243 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5244 return FP_REGS;
5245
5246 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5247 return GENERAL_REGS;
5248
5249 return NO_REGS;
5250 }
5251
5252 static bool
5253 aarch64_can_eliminate (const int from, const int to)
5254 {
5255 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5256 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5257
5258 if (frame_pointer_needed)
5259 {
5260 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5261 return true;
5262 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5263 return false;
5264 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5265 && !cfun->calls_alloca)
5266 return true;
5267 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5268 return true;
5269
5270 return false;
5271 }
5272 else
5273 {
5274 /* If we decided that we didn't need a leaf frame pointer but then used
5275 LR in the function, then we'll want a frame pointer after all, so
5276 prevent this elimination to ensure a frame pointer is used. */
5277 if (to == STACK_POINTER_REGNUM
5278 && flag_omit_leaf_frame_pointer
5279 && df_regs_ever_live_p (LR_REGNUM))
5280 return false;
5281 }
5282
5283 return true;
5284 }
5285
5286 HOST_WIDE_INT
5287 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5288 {
5289 aarch64_layout_frame ();
5290
5291 if (to == HARD_FRAME_POINTER_REGNUM)
5292 {
5293 if (from == ARG_POINTER_REGNUM)
5294 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5295
5296 if (from == FRAME_POINTER_REGNUM)
5297 return (cfun->machine->frame.hard_fp_offset
5298 - cfun->machine->frame.saved_varargs_size);
5299 }
5300
5301 if (to == STACK_POINTER_REGNUM)
5302 {
5303 if (from == FRAME_POINTER_REGNUM)
5304 return (cfun->machine->frame.frame_size
5305 - cfun->machine->frame.saved_varargs_size);
5306 }
5307
5308 return cfun->machine->frame.frame_size;
5309 }
5310
5311 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5312 previous frame. */
5313
5314 rtx
5315 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5316 {
5317 if (count != 0)
5318 return const0_rtx;
5319 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5320 }
5321
5322
5323 static void
5324 aarch64_asm_trampoline_template (FILE *f)
5325 {
5326 if (TARGET_ILP32)
5327 {
5328 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5329 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5330 }
5331 else
5332 {
5333 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5334 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5335 }
5336 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5337 assemble_aligned_integer (4, const0_rtx);
5338 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5339 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5340 }
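
/* The LP64 template above therefore assembles to roughly the following,
   with IP1 and the static chain register being x17 and x18 in the default
   configuration:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding so the data slots start at offset 16
	.dword	0		// overwritten with the function address
	.dword	0		// overwritten with the chain value

   aarch64_trampoline_init below fills in the two trailing pointer slots.  */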
5341
5342 static void
5343 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5344 {
5345 rtx fnaddr, mem, a_tramp;
5346 const int tramp_code_sz = 16;
5347
5348 /* Don't need to copy the trailing D-words, we fill those in below. */
5349 emit_block_move (m_tramp, assemble_trampoline_template (),
5350 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5351 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5352 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5353 if (GET_MODE (fnaddr) != ptr_mode)
5354 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5355 emit_move_insn (mem, fnaddr);
5356
5357 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5358 emit_move_insn (mem, chain_value);
5359
5360 /* XXX We should really define a "clear_cache" pattern and use
5361 gen_clear_cache(). */
5362 a_tramp = XEXP (m_tramp, 0);
5363 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5364 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5365 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5366 ptr_mode);
5367 }
5368
5369 static unsigned char
5370 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5371 {
5372 switch (regclass)
5373 {
5374 case CALLER_SAVE_REGS:
5375 case POINTER_REGS:
5376 case GENERAL_REGS:
5377 case ALL_REGS:
5378 case FP_REGS:
5379 case FP_LO_REGS:
5380 return
5381 aarch64_vector_mode_p (mode)
5382 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5383 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5384 case STACK_REG:
5385 return 1;
5386
5387 case NO_REGS:
5388 return 0;
5389
5390 default:
5391 break;
5392 }
5393 gcc_unreachable ();
5394 }
5395
5396 static reg_class_t
5397 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5398 {
5399 if (regclass == POINTER_REGS)
5400 return GENERAL_REGS;
5401
5402 if (regclass == STACK_REG)
5403 {
5404 if (REG_P(x)
5405 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5406 return regclass;
5407
5408 return NO_REGS;
5409 }
5410
5411 /* If it's an integer immediate that MOVI can't handle, then
5412 FP_REGS is not an option, so we return NO_REGS instead. */
5413 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5414 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5415 return NO_REGS;
5416
5417 /* Register elimination can result in a request for
5418 SP+constant->FP_REGS. We cannot support such operations, which
5419 use SP as source and an FP_REG as destination, so reject them
5420 outright. */
5421 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5422 {
5423 rtx lhs = XEXP (x, 0);
5424
5425 /* Look through a possible SUBREG introduced by ILP32. */
5426 if (GET_CODE (lhs) == SUBREG)
5427 lhs = SUBREG_REG (lhs);
5428
5429 gcc_assert (REG_P (lhs));
5430 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5431 POINTER_REGS));
5432 return NO_REGS;
5433 }
5434
5435 return regclass;
5436 }
5437
5438 void
5439 aarch64_asm_output_labelref (FILE* f, const char *name)
5440 {
5441 asm_fprintf (f, "%U%s", name);
5442 }
5443
5444 static void
5445 aarch64_elf_asm_constructor (rtx symbol, int priority)
5446 {
5447 if (priority == DEFAULT_INIT_PRIORITY)
5448 default_ctor_section_asm_out_constructor (symbol, priority);
5449 else
5450 {
5451 section *s;
5452 char buf[18];
5453 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5454 s = get_section (buf, SECTION_WRITE, NULL);
5455 switch_to_section (s);
5456 assemble_align (POINTER_SIZE);
5457 assemble_aligned_integer (POINTER_BYTES, symbol);
5458 }
5459 }
5460
5461 static void
5462 aarch64_elf_asm_destructor (rtx symbol, int priority)
5463 {
5464 if (priority == DEFAULT_INIT_PRIORITY)
5465 default_dtor_section_asm_out_destructor (symbol, priority);
5466 else
5467 {
5468 section *s;
5469 char buf[18];
5470 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5471 s = get_section (buf, SECTION_WRITE, NULL);
5472 switch_to_section (s);
5473 assemble_align (POINTER_SIZE);
5474 assemble_aligned_integer (POINTER_BYTES, symbol);
5475 }
5476 }
5477
5478 const char*
5479 aarch64_output_casesi (rtx *operands)
5480 {
5481 char buf[100];
5482 char label[100];
5483 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5484 int index;
5485 static const char *const patterns[4][2] =
5486 {
5487 {
5488 "ldrb\t%w3, [%0,%w1,uxtw]",
5489 "add\t%3, %4, %w3, sxtb #2"
5490 },
5491 {
5492 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5493 "add\t%3, %4, %w3, sxth #2"
5494 },
5495 {
5496 "ldr\t%w3, [%0,%w1,uxtw #2]",
5497 "add\t%3, %4, %w3, sxtw #2"
5498 },
5499 /* We assume that DImode is only generated when not optimizing and
5500 that we don't really need 64-bit address offsets. That would
5501 imply an object file with 8GB of code in a single function! */
5502 {
5503 "ldr\t%w3, [%0,%w1,uxtw #2]",
5504 "add\t%3, %4, %w3, sxtw #2"
5505 }
5506 };
5507
5508 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5509
5510 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5511
5512 gcc_assert (index >= 0 && index <= 3);
5513
5514 /* Need to implement table size reduction, by changing the code below. */
5515 output_asm_insn (patterns[index][0], operands);
5516 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5517 snprintf (buf, sizeof (buf),
5518 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5519 output_asm_insn (buf, operands);
5520 output_asm_insn (patterns[index][1], operands);
5521 output_asm_insn ("br\t%3", operands);
5522 assemble_label (asm_out_file, label);
5523 return "";
5524 }
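
/* For a HImode dispatch table (index 1 above), and with operands 0, 1, 3
   and 4 in x0, w1, x3 and x4, the emitted sequence is roughly:

	ldrh	w3, [x0,w1,uxtw #1]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the label emitted below
	add	x3, x4, w3, sxth #2	// entries are scaled label differences
	br	x3
   .Lrtx<N>:  */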
5525
5526
5527 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5528 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5529 operator. */
5530
5531 int
5532 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5533 {
5534 if (shift >= 0 && shift <= 3)
5535 {
5536 int size;
5537 for (size = 8; size <= 32; size *= 2)
5538 {
5539 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5540 if (mask == bits << shift)
5541 return size;
5542 }
5543 }
5544 return 0;
5545 }
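
/* For example, aarch64_uxt_size (2, 0x3fc) returns 8 (0xff shifted left by
   2, i.e. a UXTB operand), aarch64_uxt_size (0, 0xffff) returns 16 (UXTH),
   and aarch64_uxt_size (1, 0xff) returns 0 because the mask does not line
   up with the shift.  */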
5546
5547 /* Constant pools are per-function only when PC-relative
5548 literal loads are enabled or we are in the large memory
5549 model. */
5550
5551 static inline bool
5552 aarch64_can_use_per_function_literal_pools_p (void)
5553 {
5554 return (!aarch64_nopcrelative_literal_loads
5555 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5556 }
5557
5558 static bool
5559 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5560 {
5561 /* FIXME: In an ideal world this would work similarly
5562 to the logic in aarch64_select_rtx_section, but that
5563 breaks the gccgo bootstrap. For now we work around
5564 this by returning false here. */
5565 return false;
5566 }
5567
5568 /* Select appropriate section for constants depending
5569 on where we place literal pools. */
5570
5571 static section *
5572 aarch64_select_rtx_section (machine_mode mode,
5573 rtx x,
5574 unsigned HOST_WIDE_INT align)
5575 {
5576 if (aarch64_can_use_per_function_literal_pools_p ())
5577 return function_section (current_function_decl);
5578
5579 return default_elf_select_rtx_section (mode, x, align);
5580 }
5581
5582 /* Costs. */
5583
5584 /* Helper function for rtx cost calculation. Strip a shift expression
5585 from X. Returns the inner operand if successful, or the original
5586 expression on failure. */
5587 static rtx
5588 aarch64_strip_shift (rtx x)
5589 {
5590 rtx op = x;
5591
5592 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5593 we can convert both to ROR during final output. */
5594 if ((GET_CODE (op) == ASHIFT
5595 || GET_CODE (op) == ASHIFTRT
5596 || GET_CODE (op) == LSHIFTRT
5597 || GET_CODE (op) == ROTATERT
5598 || GET_CODE (op) == ROTATE)
5599 && CONST_INT_P (XEXP (op, 1)))
5600 return XEXP (op, 0);
5601
5602 if (GET_CODE (op) == MULT
5603 && CONST_INT_P (XEXP (op, 1))
5604 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5605 return XEXP (op, 0);
5606
5607 return x;
5608 }
5609
5610 /* Helper function for rtx cost calculation. Strip an extend
5611 expression from X. Returns the inner operand if successful, or the
5612 original expression on failure. We deal with a number of possible
5613 canonicalization variations here. */
5614 static rtx
5615 aarch64_strip_extend (rtx x)
5616 {
5617 rtx op = x;
5618
5619 /* Zero and sign extraction of a widened value. */
5620 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5621 && XEXP (op, 2) == const0_rtx
5622 && GET_CODE (XEXP (op, 0)) == MULT
5623 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5624 XEXP (op, 1)))
5625 return XEXP (XEXP (op, 0), 0);
5626
5627 /* It can also be represented (for zero-extend) as an AND with an
5628 immediate. */
5629 if (GET_CODE (op) == AND
5630 && GET_CODE (XEXP (op, 0)) == MULT
5631 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5632 && CONST_INT_P (XEXP (op, 1))
5633 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5634 INTVAL (XEXP (op, 1))) != 0)
5635 return XEXP (XEXP (op, 0), 0);
5636
5637 /* Now handle extended register, as this may also have an optional
5638 left shift by 1..4. */
5639 if (GET_CODE (op) == ASHIFT
5640 && CONST_INT_P (XEXP (op, 1))
5641 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5642 op = XEXP (op, 0);
5643
5644 if (GET_CODE (op) == ZERO_EXTEND
5645 || GET_CODE (op) == SIGN_EXTEND)
5646 op = XEXP (op, 0);
5647
5648 if (op != x)
5649 return op;
5650
5651 return x;
5652 }
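
/* For example, both (and:DI (mult:DI (reg:DI) (const_int 4))
   (const_int 0x3fffffffc)) and (ashift:DI (zero_extend:DI (reg:SI))
   (const_int 2)) strip down to the inner register here.  */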
5653
5654 /* Return true iff CODE is a shift supported in combination
5655 with arithmetic instructions. */
5656
5657 static bool
5658 aarch64_shift_p (enum rtx_code code)
5659 {
5660 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5661 }
5662
5663 /* Helper function for rtx cost calculation. Calculate the cost of
5664 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5665 Return the calculated cost of the expression, recursing manually in to
5666 operands where needed. */
5667
5668 static int
5669 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5670 {
5671 rtx op0, op1;
5672 const struct cpu_cost_table *extra_cost
5673 = aarch64_tune_params.insn_extra_cost;
5674 int cost = 0;
5675 bool compound_p = (outer == PLUS || outer == MINUS);
5676 machine_mode mode = GET_MODE (x);
5677
5678 gcc_checking_assert (code == MULT);
5679
5680 op0 = XEXP (x, 0);
5681 op1 = XEXP (x, 1);
5682
5683 if (VECTOR_MODE_P (mode))
5684 mode = GET_MODE_INNER (mode);
5685
5686 /* Integer multiply/fma. */
5687 if (GET_MODE_CLASS (mode) == MODE_INT)
5688 {
5689 /* The multiply will be canonicalized as a shift, cost it as such. */
5690 if (aarch64_shift_p (GET_CODE (x))
5691 || (CONST_INT_P (op1)
5692 && exact_log2 (INTVAL (op1)) > 0))
5693 {
5694 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5695 || GET_CODE (op0) == SIGN_EXTEND;
5696 if (speed)
5697 {
5698 if (compound_p)
5699 {
5700 if (REG_P (op1))
5701 /* ARITH + shift-by-register. */
5702 cost += extra_cost->alu.arith_shift_reg;
5703 else if (is_extend)
5704 /* ARITH + extended register. We don't have a cost field
5705 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5706 cost += extra_cost->alu.extend_arith;
5707 else
5708 /* ARITH + shift-by-immediate. */
5709 cost += extra_cost->alu.arith_shift;
5710 }
5711 else
5712 /* LSL (immediate). */
5713 cost += extra_cost->alu.shift;
5714
5715 }
5716 /* Strip extends as we will have costed them in the case above. */
5717 if (is_extend)
5718 op0 = aarch64_strip_extend (op0);
5719
5720 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5721
5722 return cost;
5723 }
5724
5725 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5726 compound and let the below cases handle it. After all, MNEG is a
5727 special-case alias of MSUB. */
5728 if (GET_CODE (op0) == NEG)
5729 {
5730 op0 = XEXP (op0, 0);
5731 compound_p = true;
5732 }
5733
5734 /* Integer multiplies or FMAs have zero/sign extending variants. */
5735 if ((GET_CODE (op0) == ZERO_EXTEND
5736 && GET_CODE (op1) == ZERO_EXTEND)
5737 || (GET_CODE (op0) == SIGN_EXTEND
5738 && GET_CODE (op1) == SIGN_EXTEND))
5739 {
5740 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5741 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5742
5743 if (speed)
5744 {
5745 if (compound_p)
5746 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5747 cost += extra_cost->mult[0].extend_add;
5748 else
5749 /* MUL/SMULL/UMULL. */
5750 cost += extra_cost->mult[0].extend;
5751 }
5752
5753 return cost;
5754 }
5755
5756 /* This is either an integer multiply or a MADD. In both cases
5757 we want to recurse and cost the operands. */
5758 cost += rtx_cost (op0, mode, MULT, 0, speed);
5759 cost += rtx_cost (op1, mode, MULT, 1, speed);
5760
5761 if (speed)
5762 {
5763 if (compound_p)
5764 /* MADD/MSUB. */
5765 cost += extra_cost->mult[mode == DImode].add;
5766 else
5767 /* MUL. */
5768 cost += extra_cost->mult[mode == DImode].simple;
5769 }
5770
5771 return cost;
5772 }
5773 else
5774 {
5775 if (speed)
5776 {
5777 /* Floating-point FMA/FMUL can also support negations of the
5778 operands, unless the rounding mode is upward or downward, in
5779 which case FNMUL differs from FMUL with operand negation. */
5780 bool neg0 = GET_CODE (op0) == NEG;
5781 bool neg1 = GET_CODE (op1) == NEG;
5782 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5783 {
5784 if (neg0)
5785 op0 = XEXP (op0, 0);
5786 if (neg1)
5787 op1 = XEXP (op1, 0);
5788 }
5789
5790 if (compound_p)
5791 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5792 cost += extra_cost->fp[mode == DFmode].fma;
5793 else
5794 /* FMUL/FNMUL. */
5795 cost += extra_cost->fp[mode == DFmode].mult;
5796 }
5797
5798 cost += rtx_cost (op0, mode, MULT, 0, speed);
5799 cost += rtx_cost (op1, mode, MULT, 1, speed);
5800 return cost;
5801 }
5802 }
5803
5804 static int
5805 aarch64_address_cost (rtx x,
5806 machine_mode mode,
5807 addr_space_t as ATTRIBUTE_UNUSED,
5808 bool speed)
5809 {
5810 enum rtx_code c = GET_CODE (x);
5811 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5812 struct aarch64_address_info info;
5813 int cost = 0;
5814 info.shift = 0;
5815
5816 if (!aarch64_classify_address (&info, x, mode, c, false))
5817 {
5818 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5819 {
5820 /* This is a CONST or SYMBOL ref which will be split
5821 in a different way depending on the code model in use.
5822 Cost it through the generic infrastructure. */
5823 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5824 /* Divide through by the cost of one instruction to
5825 bring it to the same units as the address costs. */
5826 cost_symbol_ref /= COSTS_N_INSNS (1);
5827 /* The cost is then the cost of preparing the address,
5828 followed by an immediate (possibly 0) offset. */
5829 return cost_symbol_ref + addr_cost->imm_offset;
5830 }
5831 else
5832 {
5833 /* This is most likely a jump table from a case
5834 statement. */
5835 return addr_cost->register_offset;
5836 }
5837 }
5838
5839 switch (info.type)
5840 {
5841 case ADDRESS_LO_SUM:
5842 case ADDRESS_SYMBOLIC:
5843 case ADDRESS_REG_IMM:
5844 cost += addr_cost->imm_offset;
5845 break;
5846
5847 case ADDRESS_REG_WB:
5848 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5849 cost += addr_cost->pre_modify;
5850 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5851 cost += addr_cost->post_modify;
5852 else
5853 gcc_unreachable ();
5854
5855 break;
5856
5857 case ADDRESS_REG_REG:
5858 cost += addr_cost->register_offset;
5859 break;
5860
5861 case ADDRESS_REG_SXTW:
5862 cost += addr_cost->register_sextend;
5863 break;
5864
5865 case ADDRESS_REG_UXTW:
5866 cost += addr_cost->register_zextend;
5867 break;
5868
5869 default:
5870 gcc_unreachable ();
5871 }
5872
5873
5874 if (info.shift > 0)
5875 {
5876 /* For the sake of calculating the cost of the shifted register
5877 component, we can treat same sized modes in the same way. */
5878 switch (GET_MODE_BITSIZE (mode))
5879 {
5880 case 16:
5881 cost += addr_cost->addr_scale_costs.hi;
5882 break;
5883
5884 case 32:
5885 cost += addr_cost->addr_scale_costs.si;
5886 break;
5887
5888 case 64:
5889 cost += addr_cost->addr_scale_costs.di;
5890 break;
5891
5892 /* We can't tell, or this is a 128-bit vector. */
5893 default:
5894 cost += addr_cost->addr_scale_costs.ti;
5895 break;
5896 }
5897 }
5898
5899 return cost;
5900 }
5901
5902 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5903 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
5904 to be well predicted. */
5905
5906 int
5907 aarch64_branch_cost (bool speed_p, bool predictable_p)
5908 {
5909 /* When optimizing for speed, use the cost of unpredictable branches. */
5910 const struct cpu_branch_cost *branch_costs =
5911 aarch64_tune_params.branch_costs;
5912
5913 if (!speed_p || predictable_p)
5914 return branch_costs->predictable;
5915 else
5916 return branch_costs->unpredictable;
5917 }
5918
5919 /* Return true if the RTX X in mode MODE is a zero or sign extract
5920 usable in an ADD or SUB (extended register) instruction. */
5921 static bool
5922 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5923 {
5924 /* Catch add with a sign extract.
5925 This is add_<optab><mode>_multp2. */
5926 if (GET_CODE (x) == SIGN_EXTRACT
5927 || GET_CODE (x) == ZERO_EXTRACT)
5928 {
5929 rtx op0 = XEXP (x, 0);
5930 rtx op1 = XEXP (x, 1);
5931 rtx op2 = XEXP (x, 2);
5932
5933 if (GET_CODE (op0) == MULT
5934 && CONST_INT_P (op1)
5935 && op2 == const0_rtx
5936 && CONST_INT_P (XEXP (op0, 1))
5937 && aarch64_is_extend_from_extract (mode,
5938 XEXP (op0, 1),
5939 op1))
5940 {
5941 return true;
5942 }
5943 }
5944 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5945 No shift. */
5946 else if (GET_CODE (x) == SIGN_EXTEND
5947 || GET_CODE (x) == ZERO_EXTEND)
5948 return REG_P (XEXP (x, 0));
5949
5950 return false;
5951 }
5952
5953 static bool
5954 aarch64_frint_unspec_p (unsigned int u)
5955 {
5956 switch (u)
5957 {
5958 case UNSPEC_FRINTZ:
5959 case UNSPEC_FRINTP:
5960 case UNSPEC_FRINTM:
5961 case UNSPEC_FRINTA:
5962 case UNSPEC_FRINTN:
5963 case UNSPEC_FRINTX:
5964 case UNSPEC_FRINTI:
5965 return true;
5966
5967 default:
5968 return false;
5969 }
5970 }
5971
5972 /* Return true iff X is an rtx that will match an extr instruction
5973 i.e. as described in the *extr<mode>5_insn family of patterns.
5974 OP0 and OP1 will be set to the operands of the shifts involved
5975 on success and will be NULL_RTX otherwise. */
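/* For instance (illustrative only), in DImode the RTX
   (ior:DI (ashift:DI (reg:DI a) (const_int 48))
           (lshiftrt:DI (reg:DI b) (const_int 16)))
   has shift amounts summing to 64 bits and therefore corresponds to an
   EXTR instruction extracting from the concatenation of the two
   registers, so both shifted operands are returned.  */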
5976
5977 static bool
5978 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5979 {
5980 rtx op0, op1;
5981 machine_mode mode = GET_MODE (x);
5982
5983 *res_op0 = NULL_RTX;
5984 *res_op1 = NULL_RTX;
5985
5986 if (GET_CODE (x) != IOR)
5987 return false;
5988
5989 op0 = XEXP (x, 0);
5990 op1 = XEXP (x, 1);
5991
5992 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5993 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5994 {
5995 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5996 if (GET_CODE (op1) == ASHIFT)
5997 std::swap (op0, op1);
5998
5999 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6000 return false;
6001
6002 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6003 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6004
6005 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6006 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6007 {
6008 *res_op0 = XEXP (op0, 0);
6009 *res_op1 = XEXP (op1, 0);
6010 return true;
6011 }
6012 }
6013
6014 return false;
6015 }
6016
6017 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6018 storing it in *COST. Result is true if the total cost of the operation
6019 has now been calculated. */
6020 static bool
6021 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6022 {
6023 rtx inner;
6024 rtx comparator;
6025 enum rtx_code cmpcode;
6026
6027 if (COMPARISON_P (op0))
6028 {
6029 inner = XEXP (op0, 0);
6030 comparator = XEXP (op0, 1);
6031 cmpcode = GET_CODE (op0);
6032 }
6033 else
6034 {
6035 inner = op0;
6036 comparator = const0_rtx;
6037 cmpcode = NE;
6038 }
6039
6040 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6041 {
6042 /* Conditional branch. */
6043 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6044 return true;
6045 else
6046 {
6047 if (cmpcode == NE || cmpcode == EQ)
6048 {
6049 if (comparator == const0_rtx)
6050 {
6051 /* TBZ/TBNZ/CBZ/CBNZ. */
6052 if (GET_CODE (inner) == ZERO_EXTRACT)
6053 /* TBZ/TBNZ. */
6054 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6055 ZERO_EXTRACT, 0, speed);
6056 else
6057 /* CBZ/CBNZ. */
6058 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6059
6060 return true;
6061 }
6062 }
6063 else if (cmpcode == LT || cmpcode == GE)
6064 {
6065 /* TBZ/TBNZ. */
6066 if (comparator == const0_rtx)
6067 return true;
6068 }
6069 }
6070 }
6071 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6072 {
6073 /* CCMP. */
6074 if (GET_CODE (op1) == COMPARE)
6075 {
6076 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6077 if (XEXP (op1, 1) == const0_rtx)
6078 *cost += 1;
6079 if (speed)
6080 {
6081 machine_mode mode = GET_MODE (XEXP (op1, 0));
6082 const struct cpu_cost_table *extra_cost
6083 = aarch64_tune_params.insn_extra_cost;
6084
6085 if (GET_MODE_CLASS (mode) == MODE_INT)
6086 *cost += extra_cost->alu.arith;
6087 else
6088 *cost += extra_cost->fp[mode == DFmode].compare;
6089 }
6090 return true;
6091 }
6092
6093 /* It's a conditional operation based on the status flags,
6094 so it must be some flavor of CSEL. */
6095
6096 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6097 if (GET_CODE (op1) == NEG
6098 || GET_CODE (op1) == NOT
6099 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6100 op1 = XEXP (op1, 0);
6101 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6102 {
6103 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6104 op1 = XEXP (op1, 0);
6105 op2 = XEXP (op2, 0);
6106 }
6107
6108 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6109 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6110 return true;
6111 }
6112
6113 /* We don't know what this is, cost all operands. */
6114 return false;
6115 }
6116
6117 /* Check whether X is a bitfield operation of the form shift + extend that
6118 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6119 operand to which the bitfield operation is applied. Otherwise return
6120 NULL_RTX. */
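/* For example (illustrative only),
   (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3)))
   can be done with a single UBFX of the underlying register, and
   (zero_extend:SI (ashift:HI (reg:HI r) (const_int 3)))
   with a single UBFIZ, so the callers need only cost the shifted
   operand plus one BFX-class instruction.  */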
6121
6122 static rtx
6123 aarch64_extend_bitfield_pattern_p (rtx x)
6124 {
6125 rtx_code outer_code = GET_CODE (x);
6126 machine_mode outer_mode = GET_MODE (x);
6127
6128 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6129 && outer_mode != SImode && outer_mode != DImode)
6130 return NULL_RTX;
6131
6132 rtx inner = XEXP (x, 0);
6133 rtx_code inner_code = GET_CODE (inner);
6134 machine_mode inner_mode = GET_MODE (inner);
6135 rtx op = NULL_RTX;
6136
6137 switch (inner_code)
6138 {
6139 case ASHIFT:
6140 if (CONST_INT_P (XEXP (inner, 1))
6141 && (inner_mode == QImode || inner_mode == HImode))
6142 op = XEXP (inner, 0);
6143 break;
6144 case LSHIFTRT:
6145 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6146 && (inner_mode == QImode || inner_mode == HImode))
6147 op = XEXP (inner, 0);
6148 break;
6149 case ASHIFTRT:
6150 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6151 && (inner_mode == QImode || inner_mode == HImode))
6152 op = XEXP (inner, 0);
6153 break;
6154 default:
6155 break;
6156 }
6157
6158 return op;
6159 }
6160
6161 /* Calculate the cost of calculating X, storing it in *COST. Result
6162 is true if the total cost of the operation has now been calculated. */
6163 static bool
6164 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6165 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6166 {
6167 rtx op0, op1, op2;
6168 const struct cpu_cost_table *extra_cost
6169 = aarch64_tune_params.insn_extra_cost;
6170 int code = GET_CODE (x);
6171
6172 /* By default, assume that everything has equivalent cost to the
6173 cheapest instruction. Any additional costs are applied as a delta
6174 above this default. */
6175 *cost = COSTS_N_INSNS (1);
6176
6177 switch (code)
6178 {
6179 case SET:
6180 /* The cost depends entirely on the operands to SET. */
6181 *cost = 0;
6182 op0 = SET_DEST (x);
6183 op1 = SET_SRC (x);
6184
6185 switch (GET_CODE (op0))
6186 {
6187 case MEM:
6188 if (speed)
6189 {
6190 rtx address = XEXP (op0, 0);
6191 if (VECTOR_MODE_P (mode))
6192 *cost += extra_cost->ldst.storev;
6193 else if (GET_MODE_CLASS (mode) == MODE_INT)
6194 *cost += extra_cost->ldst.store;
6195 else if (mode == SFmode)
6196 *cost += extra_cost->ldst.storef;
6197 else if (mode == DFmode)
6198 *cost += extra_cost->ldst.stored;
6199
6200 *cost +=
6201 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6202 0, speed));
6203 }
6204
6205 *cost += rtx_cost (op1, mode, SET, 1, speed);
6206 return true;
6207
6208 case SUBREG:
6209 if (! REG_P (SUBREG_REG (op0)))
6210 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6211
6212 /* Fall through. */
6213 case REG:
6214 /* The cost is one per vector-register copied. */
6215 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6216 {
6217 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6218 / GET_MODE_SIZE (V4SImode);
6219 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6220 }
6221 /* const0_rtx is in general free, but we will use an
6222 instruction to set a register to 0. */
6223 else if (REG_P (op1) || op1 == const0_rtx)
6224 {
6225 /* The cost is 1 per register copied. */
6226 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6227 / UNITS_PER_WORD;
6228 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6229 }
6230 else
6231 /* Cost is just the cost of the RHS of the set. */
6232 *cost += rtx_cost (op1, mode, SET, 1, speed);
6233 return true;
6234
6235 case ZERO_EXTRACT:
6236 case SIGN_EXTRACT:
6237 /* Bit-field insertion. Strip any redundant widening of
6238 the RHS to meet the width of the target. */
6239 if (GET_CODE (op1) == SUBREG)
6240 op1 = SUBREG_REG (op1);
6241 if ((GET_CODE (op1) == ZERO_EXTEND
6242 || GET_CODE (op1) == SIGN_EXTEND)
6243 && CONST_INT_P (XEXP (op0, 1))
6244 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6245 >= INTVAL (XEXP (op0, 1))))
6246 op1 = XEXP (op1, 0);
6247
6248 if (CONST_INT_P (op1))
6249 {
6250 /* MOV immediate is assumed to always be cheap. */
6251 *cost = COSTS_N_INSNS (1);
6252 }
6253 else
6254 {
6255 /* BFM. */
6256 if (speed)
6257 *cost += extra_cost->alu.bfi;
6258 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6259 }
6260
6261 return true;
6262
6263 default:
6264 /* We can't make sense of this, assume default cost. */
6265 *cost = COSTS_N_INSNS (1);
6266 return false;
6267 }
6268 return false;
6269
6270 case CONST_INT:
6271 /* If an instruction can incorporate a constant within the
6272 instruction, the instruction's expression avoids calling
6273 rtx_cost() on the constant. If rtx_cost() is called on a
6274 constant, then it is usually because the constant must be
6275 moved into a register by one or more instructions.
6276
6277 The exception is constant 0, which can be expressed
6278 as XZR/WZR and is therefore free. The one case where that does not
6279 hold is (set (reg) (const0_rtx)), where we must cost the
6280 move. However, we catch that when we cost the SET, so
6281 we don't need to consider it here. */
6282 if (x == const0_rtx)
6283 *cost = 0;
6284 else
6285 {
6286 /* To an approximation, building any other constant is
6287 proportionally expensive to the number of instructions
6288 required to build that constant. This is true whether we
6289 are compiling for SPEED or otherwise. */
6290 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6291 (NULL_RTX, x, false, mode));
6292 }
6293 return true;
6294
6295 case CONST_DOUBLE:
6296 if (speed)
6297 {
6298 /* mov[df,sf]_aarch64. */
6299 if (aarch64_float_const_representable_p (x))
6300 /* FMOV (scalar immediate). */
6301 *cost += extra_cost->fp[mode == DFmode].fpconst;
6302 else if (!aarch64_float_const_zero_rtx_p (x))
6303 {
6304 /* This will be a load from memory. */
6305 if (mode == DFmode)
6306 *cost += extra_cost->ldst.loadd;
6307 else
6308 *cost += extra_cost->ldst.loadf;
6309 }
6310 else
6311 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6312 or MOV v0.s[0], wzr - neither of which are modeled by the
6313 cost tables. Just use the default cost. */
6314 {
6315 }
6316 }
6317
6318 return true;
6319
6320 case MEM:
6321 if (speed)
6322 {
6323 /* For loads we want the base cost of a load, plus an
6324 approximation for the additional cost of the addressing
6325 mode. */
6326 rtx address = XEXP (x, 0);
6327 if (VECTOR_MODE_P (mode))
6328 *cost += extra_cost->ldst.loadv;
6329 else if (GET_MODE_CLASS (mode) == MODE_INT)
6330 *cost += extra_cost->ldst.load;
6331 else if (mode == SFmode)
6332 *cost += extra_cost->ldst.loadf;
6333 else if (mode == DFmode)
6334 *cost += extra_cost->ldst.loadd;
6335
6336 *cost +=
6337 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6338 0, speed));
6339 }
6340
6341 return true;
6342
6343 case NEG:
6344 op0 = XEXP (x, 0);
6345
6346 if (VECTOR_MODE_P (mode))
6347 {
6348 if (speed)
6349 {
6350 /* FNEG. */
6351 *cost += extra_cost->vect.alu;
6352 }
6353 return false;
6354 }
6355
6356 if (GET_MODE_CLASS (mode) == MODE_INT)
6357 {
6358 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6359 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6360 {
6361 /* CSETM. */
6362 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6363 return true;
6364 }
6365
6366 /* Cost this as SUB wzr, X. */
6367 op0 = CONST0_RTX (mode);
6368 op1 = XEXP (x, 0);
6369 goto cost_minus;
6370 }
6371
6372 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6373 {
6374 /* Support (neg(fma...)) as a single instruction only if
6375 sign of zeros is unimportant. This matches the decision
6376 making in aarch64.md. */
6377 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6378 {
6379 /* FNMADD. */
6380 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6381 return true;
6382 }
6383 if (GET_CODE (op0) == MULT)
6384 {
6385 /* FNMUL. */
6386 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6387 return true;
6388 }
6389 if (speed)
6390 /* FNEG. */
6391 *cost += extra_cost->fp[mode == DFmode].neg;
6392 return false;
6393 }
6394
6395 return false;
6396
6397 case CLRSB:
6398 case CLZ:
6399 if (speed)
6400 {
6401 if (VECTOR_MODE_P (mode))
6402 *cost += extra_cost->vect.alu;
6403 else
6404 *cost += extra_cost->alu.clz;
6405 }
6406
6407 return false;
6408
6409 case COMPARE:
6410 op0 = XEXP (x, 0);
6411 op1 = XEXP (x, 1);
6412
6413 if (op1 == const0_rtx
6414 && GET_CODE (op0) == AND)
6415 {
6416 x = op0;
6417 mode = GET_MODE (op0);
6418 goto cost_logic;
6419 }
6420
6421 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6422 {
6423 /* TODO: A write to the CC flags possibly costs extra, this
6424 needs encoding in the cost tables. */
6425
6426 /* CC_ZESWPmode supports zero extend for free. */
6427 if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6428 op0 = XEXP (op0, 0);
6429
6430 mode = GET_MODE (op0);
6431 /* ANDS. */
6432 if (GET_CODE (op0) == AND)
6433 {
6434 x = op0;
6435 goto cost_logic;
6436 }
6437
6438 if (GET_CODE (op0) == PLUS)
6439 {
6440 /* ADDS (and CMN alias). */
6441 x = op0;
6442 goto cost_plus;
6443 }
6444
6445 if (GET_CODE (op0) == MINUS)
6446 {
6447 /* SUBS. */
6448 x = op0;
6449 goto cost_minus;
6450 }
6451
6452 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6453 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6454 && CONST_INT_P (XEXP (op0, 2)))
6455 {
6456 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6457 Handle it here directly rather than going to cost_logic
6458 since we know the immediate generated for the TST is valid
6459 so we can avoid creating an intermediate rtx for it only
6460 for costing purposes. */
6461 if (speed)
6462 *cost += extra_cost->alu.logical;
6463
6464 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6465 ZERO_EXTRACT, 0, speed);
6466 return true;
6467 }
6468
6469 if (GET_CODE (op1) == NEG)
6470 {
6471 /* CMN. */
6472 if (speed)
6473 *cost += extra_cost->alu.arith;
6474
6475 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6476 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6477 return true;
6478 }
6479
6480 /* CMP.
6481
6482 Compare can freely swap the order of operands, and
6483 canonicalization puts the more complex operation first.
6484 But the integer MINUS logic expects the shift/extend
6485 operation in op1. */
6486 if (! (REG_P (op0)
6487 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6488 {
6489 op0 = XEXP (x, 1);
6490 op1 = XEXP (x, 0);
6491 }
6492 goto cost_minus;
6493 }
6494
6495 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6496 {
6497 /* FCMP. */
6498 if (speed)
6499 *cost += extra_cost->fp[mode == DFmode].compare;
6500
6501 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6502 {
6503 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6504 /* FCMP supports constant 0.0 for no extra cost. */
6505 return true;
6506 }
6507 return false;
6508 }
6509
6510 if (VECTOR_MODE_P (mode))
6511 {
6512 /* Vector compare. */
6513 if (speed)
6514 *cost += extra_cost->vect.alu;
6515
6516 if (aarch64_float_const_zero_rtx_p (op1))
6517 {
6518 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6519 cost. */
6520 return true;
6521 }
6522 return false;
6523 }
6524 return false;
6525
6526 case MINUS:
6527 {
6528 op0 = XEXP (x, 0);
6529 op1 = XEXP (x, 1);
6530
6531 cost_minus:
6532 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6533
6534 /* Detect valid immediates. */
6535 if ((GET_MODE_CLASS (mode) == MODE_INT
6536 || (GET_MODE_CLASS (mode) == MODE_CC
6537 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6538 && CONST_INT_P (op1)
6539 && aarch64_uimm12_shift (INTVAL (op1)))
6540 {
6541 if (speed)
6542 /* SUB(S) (immediate). */
6543 *cost += extra_cost->alu.arith;
6544 return true;
6545 }
6546
6547 /* Look for SUB (extended register). */
6548 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6549 {
6550 if (speed)
6551 *cost += extra_cost->alu.extend_arith;
6552
6553 op1 = aarch64_strip_extend (op1);
6554 *cost += rtx_cost (op1, VOIDmode,
6555 (enum rtx_code) GET_CODE (op1), 0, speed);
6556 return true;
6557 }
6558
6559 rtx new_op1 = aarch64_strip_extend (op1);
6560
6561 /* Cost this as an FMA-alike operation. */
6562 if ((GET_CODE (new_op1) == MULT
6563 || aarch64_shift_p (GET_CODE (new_op1)))
6564 && code != COMPARE)
6565 {
6566 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6567 (enum rtx_code) code,
6568 speed);
6569 return true;
6570 }
6571
6572 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6573
6574 if (speed)
6575 {
6576 if (VECTOR_MODE_P (mode))
6577 {
6578 /* Vector SUB. */
6579 *cost += extra_cost->vect.alu;
6580 }
6581 else if (GET_MODE_CLASS (mode) == MODE_INT)
6582 {
6583 /* SUB(S). */
6584 *cost += extra_cost->alu.arith;
6585 }
6586 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6587 {
6588 /* FSUB. */
6589 *cost += extra_cost->fp[mode == DFmode].addsub;
6590 }
6591 }
6592 return true;
6593 }
6594
6595 case PLUS:
6596 {
6597 rtx new_op0;
6598
6599 op0 = XEXP (x, 0);
6600 op1 = XEXP (x, 1);
6601
6602 cost_plus:
6603 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6604 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6605 {
6606 /* CSINC. */
6607 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6608 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6609 return true;
6610 }
6611
6612 if (GET_MODE_CLASS (mode) == MODE_INT
6613 && CONST_INT_P (op1)
6614 && aarch64_uimm12_shift (INTVAL (op1)))
6615 {
6616 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6617
6618 if (speed)
6619 /* ADD (immediate). */
6620 *cost += extra_cost->alu.arith;
6621 return true;
6622 }
6623
6624 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6625
6626 /* Look for ADD (extended register). */
6627 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6628 {
6629 if (speed)
6630 *cost += extra_cost->alu.extend_arith;
6631
6632 op0 = aarch64_strip_extend (op0);
6633 *cost += rtx_cost (op0, VOIDmode,
6634 (enum rtx_code) GET_CODE (op0), 0, speed);
6635 return true;
6636 }
6637
6638 /* Strip any extend, leave shifts behind as we will
6639 cost them through mult_cost. */
6640 new_op0 = aarch64_strip_extend (op0);
6641
6642 if (GET_CODE (new_op0) == MULT
6643 || aarch64_shift_p (GET_CODE (new_op0)))
6644 {
6645 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6646 speed);
6647 return true;
6648 }
6649
6650 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6651
6652 if (speed)
6653 {
6654 if (VECTOR_MODE_P (mode))
6655 {
6656 /* Vector ADD. */
6657 *cost += extra_cost->vect.alu;
6658 }
6659 else if (GET_MODE_CLASS (mode) == MODE_INT)
6660 {
6661 /* ADD. */
6662 *cost += extra_cost->alu.arith;
6663 }
6664 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6665 {
6666 /* FADD. */
6667 *cost += extra_cost->fp[mode == DFmode].addsub;
6668 }
6669 }
6670 return true;
6671 }
6672
6673 case BSWAP:
6674 *cost = COSTS_N_INSNS (1);
6675
6676 if (speed)
6677 {
6678 if (VECTOR_MODE_P (mode))
6679 *cost += extra_cost->vect.alu;
6680 else
6681 *cost += extra_cost->alu.rev;
6682 }
6683 return false;
6684
6685 case IOR:
6686 if (aarch_rev16_p (x))
6687 {
6688 *cost = COSTS_N_INSNS (1);
6689
6690 if (speed)
6691 {
6692 if (VECTOR_MODE_P (mode))
6693 *cost += extra_cost->vect.alu;
6694 else
6695 *cost += extra_cost->alu.rev;
6696 }
6697 return true;
6698 }
6699
6700 if (aarch64_extr_rtx_p (x, &op0, &op1))
6701 {
6702 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6703 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6704 if (speed)
6705 *cost += extra_cost->alu.shift;
6706
6707 return true;
6708 }
6709 /* Fall through. */
6710 case XOR:
6711 case AND:
6712 cost_logic:
6713 op0 = XEXP (x, 0);
6714 op1 = XEXP (x, 1);
6715
6716 if (VECTOR_MODE_P (mode))
6717 {
6718 if (speed)
6719 *cost += extra_cost->vect.alu;
6720 return true;
6721 }
6722
6723 if (code == AND
6724 && GET_CODE (op0) == MULT
6725 && CONST_INT_P (XEXP (op0, 1))
6726 && CONST_INT_P (op1)
6727 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6728 INTVAL (op1)) != 0)
6729 {
6730 /* This is a UBFM/SBFM. */
6731 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6732 if (speed)
6733 *cost += extra_cost->alu.bfx;
6734 return true;
6735 }
6736
6737 if (GET_MODE_CLASS (mode) == MODE_INT)
6738 {
6739 /* We possibly get the immediate for free, this is not
6740 modelled. */
6741 if (CONST_INT_P (op1)
6742 && aarch64_bitmask_imm (INTVAL (op1), mode))
6743 {
6744 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6745
6746 if (speed)
6747 *cost += extra_cost->alu.logical;
6748
6749 return true;
6750 }
6751 else
6752 {
6753 rtx new_op0 = op0;
6754
6755 /* Handle ORN, EON, or BIC. */
6756 if (GET_CODE (op0) == NOT)
6757 op0 = XEXP (op0, 0);
6758
6759 new_op0 = aarch64_strip_shift (op0);
6760
6761 /* If we had a shift on op0 then this is a logical-shift-
6762 by-register/immediate operation. Otherwise, this is just
6763 a logical operation. */
6764 if (speed)
6765 {
6766 if (new_op0 != op0)
6767 {
6768 /* Shift by immediate. */
6769 if (CONST_INT_P (XEXP (op0, 1)))
6770 *cost += extra_cost->alu.log_shift;
6771 else
6772 *cost += extra_cost->alu.log_shift_reg;
6773 }
6774 else
6775 *cost += extra_cost->alu.logical;
6776 }
6777
6778 /* In both cases we want to cost both operands. */
6779 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6780 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6781
6782 return true;
6783 }
6784 }
6785 return false;
6786
6787 case NOT:
6788 x = XEXP (x, 0);
6789 op0 = aarch64_strip_shift (x);
6790
6791 if (VECTOR_MODE_P (mode))
6792 {
6793 /* Vector NOT. */
6794 *cost += extra_cost->vect.alu;
6795 return false;
6796 }
6797
6798 /* MVN-shifted-reg. */
6799 if (op0 != x)
6800 {
6801 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6802
6803 if (speed)
6804 *cost += extra_cost->alu.log_shift;
6805
6806 return true;
6807 }
6808 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
6809 Handle the second form here, taking care that 'a' in the above can
6810 be a shift. */
6811 else if (GET_CODE (op0) == XOR)
6812 {
6813 rtx newop0 = XEXP (op0, 0);
6814 rtx newop1 = XEXP (op0, 1);
6815 rtx op0_stripped = aarch64_strip_shift (newop0);
6816
6817 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6818 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6819
6820 if (speed)
6821 {
6822 if (op0_stripped != newop0)
6823 *cost += extra_cost->alu.log_shift;
6824 else
6825 *cost += extra_cost->alu.logical;
6826 }
6827
6828 return true;
6829 }
6830 /* MVN. */
6831 if (speed)
6832 *cost += extra_cost->alu.logical;
6833
6834 return false;
6835
6836 case ZERO_EXTEND:
6837
6838 op0 = XEXP (x, 0);
6839 /* If a value is written in SI mode, then zero extended to DI
6840 mode, the operation will in general be free as a write to
6841 a 'w' register implicitly zeroes the upper bits of an 'x'
6842 register. However, if this is
6843
6844 (set (reg) (zero_extend (reg)))
6845
6846 we must cost the explicit register move. */
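/* For example (illustrative only), "add w0, w1, w2" already zeroes
   bits 63:32 of x0, so a following zero_extend:DI of that SImode
   result needs no extra instruction, whereas a bare register-to-register
   zero_extend still needs the MOV costed below.  */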
6847 if (mode == DImode
6848 && GET_MODE (op0) == SImode
6849 && outer == SET)
6850 {
6851 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6852
6853 if (!op_cost && speed)
6854 /* MOV. */
6855 *cost += extra_cost->alu.extend;
6856 else
6857 /* Free, the cost is that of the SI mode operation. */
6858 *cost = op_cost;
6859
6860 return true;
6861 }
6862 else if (MEM_P (op0))
6863 {
6864 /* All loads can zero extend to any size for free. */
6865 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6866 return true;
6867 }
6868
6869 op0 = aarch64_extend_bitfield_pattern_p (x);
6870 if (op0)
6871 {
6872 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6873 if (speed)
6874 *cost += extra_cost->alu.bfx;
6875 return true;
6876 }
6877
6878 if (speed)
6879 {
6880 if (VECTOR_MODE_P (mode))
6881 {
6882 /* UMOV. */
6883 *cost += extra_cost->vect.alu;
6884 }
6885 else
6886 {
6887 /* UXTB/UXTH. */
6888 *cost += extra_cost->alu.extend;
6889 }
6890 }
6891 return false;
6892
6893 case SIGN_EXTEND:
6894 if (MEM_P (XEXP (x, 0)))
6895 {
6896 /* LDRSH. */
6897 if (speed)
6898 {
6899 rtx address = XEXP (XEXP (x, 0), 0);
6900 *cost += extra_cost->ldst.load_sign_extend;
6901
6902 *cost +=
6903 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6904 0, speed));
6905 }
6906 return true;
6907 }
6908
6909 op0 = aarch64_extend_bitfield_pattern_p (x);
6910 if (op0)
6911 {
6912 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6913 if (speed)
6914 *cost += extra_cost->alu.bfx;
6915 return true;
6916 }
6917
6918 if (speed)
6919 {
6920 if (VECTOR_MODE_P (mode))
6921 *cost += extra_cost->vect.alu;
6922 else
6923 *cost += extra_cost->alu.extend;
6924 }
6925 return false;
6926
6927 case ASHIFT:
6928 op0 = XEXP (x, 0);
6929 op1 = XEXP (x, 1);
6930
6931 if (CONST_INT_P (op1))
6932 {
6933 if (speed)
6934 {
6935 if (VECTOR_MODE_P (mode))
6936 {
6937 /* Vector shift (immediate). */
6938 *cost += extra_cost->vect.alu;
6939 }
6940 else
6941 {
6942 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6943 aliases. */
6944 *cost += extra_cost->alu.shift;
6945 }
6946 }
6947
6948 /* We can incorporate zero/sign extend for free. */
6949 if (GET_CODE (op0) == ZERO_EXTEND
6950 || GET_CODE (op0) == SIGN_EXTEND)
6951 op0 = XEXP (op0, 0);
6952
6953 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6954 return true;
6955 }
6956 else
6957 {
6958 if (speed)
6959 {
6960 if (VECTOR_MODE_P (mode))
6961 {
6962 /* Vector shift (register). */
6963 *cost += extra_cost->vect.alu;
6964 }
6965 else
6966 {
6967 /* LSLV. */
6968 *cost += extra_cost->alu.shift_reg;
6969 }
6970 }
6971 return false; /* All arguments need to be in registers. */
6972 }
6973
6974 case ROTATE:
6975 case ROTATERT:
6976 case LSHIFTRT:
6977 case ASHIFTRT:
6978 op0 = XEXP (x, 0);
6979 op1 = XEXP (x, 1);
6980
6981 if (CONST_INT_P (op1))
6982 {
6983 /* ASR (immediate) and friends. */
6984 if (speed)
6985 {
6986 if (VECTOR_MODE_P (mode))
6987 *cost += extra_cost->vect.alu;
6988 else
6989 *cost += extra_cost->alu.shift;
6990 }
6991
6992 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6993 return true;
6994 }
6995 else
6996 {
6997
6998 /* ASR (register) and friends. */
6999 if (speed)
7000 {
7001 if (VECTOR_MODE_P (mode))
7002 *cost += extra_cost->vect.alu;
7003 else
7004 *cost += extra_cost->alu.shift_reg;
7005 }
7006 return false; /* All arguments need to be in registers. */
7007 }
7008
7009 case SYMBOL_REF:
7010
7011 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7012 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7013 {
7014 /* LDR. */
7015 if (speed)
7016 *cost += extra_cost->ldst.load;
7017 }
7018 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7019 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7020 {
7021 /* ADRP, followed by ADD. */
7022 *cost += COSTS_N_INSNS (1);
7023 if (speed)
7024 *cost += 2 * extra_cost->alu.arith;
7025 }
7026 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7027 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7028 {
7029 /* ADR. */
7030 if (speed)
7031 *cost += extra_cost->alu.arith;
7032 }
7033
7034 if (flag_pic)
7035 {
7036 /* One extra load instruction, after accessing the GOT. */
7037 *cost += COSTS_N_INSNS (1);
7038 if (speed)
7039 *cost += extra_cost->ldst.load;
7040 }
7041 return true;
7042
7043 case HIGH:
7044 case LO_SUM:
7045 /* ADRP/ADD (immediate). */
7046 if (speed)
7047 *cost += extra_cost->alu.arith;
7048 return true;
7049
7050 case ZERO_EXTRACT:
7051 case SIGN_EXTRACT:
7052 /* UBFX/SBFX. */
7053 if (speed)
7054 {
7055 if (VECTOR_MODE_P (mode))
7056 *cost += extra_cost->vect.alu;
7057 else
7058 *cost += extra_cost->alu.bfx;
7059 }
7060
7061 /* We can trust that the immediates used will be correct (there
7062 are no by-register forms), so we need only cost op0. */
7063 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7064 return true;
7065
7066 case MULT:
7067 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7068 /* aarch64_rtx_mult_cost always handles recursion to its
7069 operands. */
7070 return true;
7071
7072 case MOD:
7073 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7074 ANDs and a CSNEG. Assume here that CSNEG costs the same as
7075 an unconditional negate. This case should only ever be reached through
7076 the set_smod_pow2_cheap check in expmed.c. */
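/* As an illustration of the expansion described above, for SImode
   "x % 4" the expected sequence is roughly:
   negs  w1, w0
   and   w0, w0, #3
   and   w1, w1, #3
   csneg w0, w0, w1, mi
   hence the baseline of four instructions set below.  */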
7077 if (CONST_INT_P (XEXP (x, 1))
7078 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7079 && (mode == SImode || mode == DImode))
7080 {
7081 /* We expand to 4 instructions. Reset the baseline. */
7082 *cost = COSTS_N_INSNS (4);
7083
7084 if (speed)
7085 *cost += 2 * extra_cost->alu.logical
7086 + 2 * extra_cost->alu.arith;
7087
7088 return true;
7089 }
7090
7091 /* Fall-through. */
7092 case UMOD:
7093 if (speed)
7094 {
7095 if (VECTOR_MODE_P (mode))
7096 *cost += extra_cost->vect.alu;
7097 else if (GET_MODE_CLASS (mode) == MODE_INT)
7098 *cost += (extra_cost->mult[mode == DImode].add
7099 + extra_cost->mult[mode == DImode].idiv);
7100 else if (mode == DFmode)
7101 *cost += (extra_cost->fp[1].mult
7102 + extra_cost->fp[1].div);
7103 else if (mode == SFmode)
7104 *cost += (extra_cost->fp[0].mult
7105 + extra_cost->fp[0].div);
7106 }
7107 return false; /* All arguments need to be in registers. */
7108
7109 case DIV:
7110 case UDIV:
7111 case SQRT:
7112 if (speed)
7113 {
7114 if (VECTOR_MODE_P (mode))
7115 *cost += extra_cost->vect.alu;
7116 else if (GET_MODE_CLASS (mode) == MODE_INT)
7117 /* There is no integer SQRT, so only DIV and UDIV can get
7118 here. */
7119 *cost += extra_cost->mult[mode == DImode].idiv;
7120 else
7121 *cost += extra_cost->fp[mode == DFmode].div;
7122 }
7123 return false; /* All arguments need to be in registers. */
7124
7125 case IF_THEN_ELSE:
7126 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7127 XEXP (x, 2), cost, speed);
7128
7129 case EQ:
7130 case NE:
7131 case GT:
7132 case GTU:
7133 case LT:
7134 case LTU:
7135 case GE:
7136 case GEU:
7137 case LE:
7138 case LEU:
7139
7140 return false; /* All arguments must be in registers. */
7141
7142 case FMA:
7143 op0 = XEXP (x, 0);
7144 op1 = XEXP (x, 1);
7145 op2 = XEXP (x, 2);
7146
7147 if (speed)
7148 {
7149 if (VECTOR_MODE_P (mode))
7150 *cost += extra_cost->vect.alu;
7151 else
7152 *cost += extra_cost->fp[mode == DFmode].fma;
7153 }
7154
7155 /* FMSUB, FNMADD, and FNMSUB are free. */
7156 if (GET_CODE (op0) == NEG)
7157 op0 = XEXP (op0, 0);
7158
7159 if (GET_CODE (op2) == NEG)
7160 op2 = XEXP (op2, 0);
7161
7162 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7163 and the by-element operand as operand 0. */
7164 if (GET_CODE (op1) == NEG)
7165 op1 = XEXP (op1, 0);
7166
7167 /* Catch vector-by-element operations. The by-element operand can
7168 either be (vec_duplicate (vec_select (x))) or just
7169 (vec_select (x)), depending on whether we are multiplying by
7170 a vector or a scalar.
7171
7172 Canonicalization is not very good in these cases, FMA4 will put the
7173 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7174 if (GET_CODE (op0) == VEC_DUPLICATE)
7175 op0 = XEXP (op0, 0);
7176 else if (GET_CODE (op1) == VEC_DUPLICATE)
7177 op1 = XEXP (op1, 0);
7178
7179 if (GET_CODE (op0) == VEC_SELECT)
7180 op0 = XEXP (op0, 0);
7181 else if (GET_CODE (op1) == VEC_SELECT)
7182 op1 = XEXP (op1, 0);
7183
7184 /* If the remaining parameters are not registers,
7185 get the cost to put them into registers. */
7186 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7187 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7188 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7189 return true;
7190
7191 case FLOAT:
7192 case UNSIGNED_FLOAT:
7193 if (speed)
7194 *cost += extra_cost->fp[mode == DFmode].fromint;
7195 return false;
7196
7197 case FLOAT_EXTEND:
7198 if (speed)
7199 {
7200 if (VECTOR_MODE_P (mode))
7201 {
7202 /* Vector widening conversion. */
7203 *cost += extra_cost->vect.alu;
7204 }
7205 else
7206 *cost += extra_cost->fp[mode == DFmode].widen;
7207 }
7208 return false;
7209
7210 case FLOAT_TRUNCATE:
7211 if (speed)
7212 {
7213 if (VECTOR_MODE_P (mode))
7214 {
7215 /* Vector narrowing conversion. */
7216 *cost += extra_cost->vect.alu;
7217 }
7218 else
7219 *cost += extra_cost->fp[mode == DFmode].narrow;
7220 }
7221 return false;
7222
7223 case FIX:
7224 case UNSIGNED_FIX:
7225 x = XEXP (x, 0);
7226 /* Strip the rounding part. They will all be implemented
7227 by the fcvt* family of instructions anyway. */
7228 if (GET_CODE (x) == UNSPEC)
7229 {
7230 unsigned int uns_code = XINT (x, 1);
7231
7232 if (uns_code == UNSPEC_FRINTA
7233 || uns_code == UNSPEC_FRINTM
7234 || uns_code == UNSPEC_FRINTN
7235 || uns_code == UNSPEC_FRINTP
7236 || uns_code == UNSPEC_FRINTZ)
7237 x = XVECEXP (x, 0, 0);
7238 }
7239
7240 if (speed)
7241 {
7242 if (VECTOR_MODE_P (mode))
7243 *cost += extra_cost->vect.alu;
7244 else
7245 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7246 }
7247
7248 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7249 fixed-point fcvt. */
7250 if (GET_CODE (x) == MULT
7251 && ((VECTOR_MODE_P (mode)
7252 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7253 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7254 {
7255 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7256 0, speed);
7257 return true;
7258 }
7259
7260 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7261 return true;
7262
7263 case ABS:
7264 if (VECTOR_MODE_P (mode))
7265 {
7266 /* ABS (vector). */
7267 if (speed)
7268 *cost += extra_cost->vect.alu;
7269 }
7270 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7271 {
7272 op0 = XEXP (x, 0);
7273
7274 /* FABD, which is analogous to FADD. */
7275 if (GET_CODE (op0) == MINUS)
7276 {
7277 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7278 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7279 if (speed)
7280 *cost += extra_cost->fp[mode == DFmode].addsub;
7281
7282 return true;
7283 }
7284 /* Simple FABS is analogous to FNEG. */
7285 if (speed)
7286 *cost += extra_cost->fp[mode == DFmode].neg;
7287 }
7288 else
7289 {
7290 /* Integer ABS will either be split to
7291 two arithmetic instructions, or will be an ABS
7292 (scalar), which we don't model. */
7293 *cost = COSTS_N_INSNS (2);
7294 if (speed)
7295 *cost += 2 * extra_cost->alu.arith;
7296 }
7297 return false;
7298
7299 case SMAX:
7300 case SMIN:
7301 if (speed)
7302 {
7303 if (VECTOR_MODE_P (mode))
7304 *cost += extra_cost->vect.alu;
7305 else
7306 {
7307 /* FMAXNM/FMINNM/FMAX/FMIN.
7308 TODO: This may not be accurate for all implementations, but
7309 we do not model this in the cost tables. */
7310 *cost += extra_cost->fp[mode == DFmode].addsub;
7311 }
7312 }
7313 return false;
7314
7315 case UNSPEC:
7316 /* The floating point round to integer frint* instructions. */
7317 if (aarch64_frint_unspec_p (XINT (x, 1)))
7318 {
7319 if (speed)
7320 *cost += extra_cost->fp[mode == DFmode].roundint;
7321
7322 return false;
7323 }
7324
7325 if (XINT (x, 1) == UNSPEC_RBIT)
7326 {
7327 if (speed)
7328 *cost += extra_cost->alu.rev;
7329
7330 return false;
7331 }
7332 break;
7333
7334 case TRUNCATE:
7335
7336 /* Decompose <su>muldi3_highpart. */
7337 if (/* (truncate:DI */
7338 mode == DImode
7339 /* (lshiftrt:TI */
7340 && GET_MODE (XEXP (x, 0)) == TImode
7341 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7342 /* (mult:TI */
7343 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7344 /* (ANY_EXTEND:TI (reg:DI))
7345 (ANY_EXTEND:TI (reg:DI))) */
7346 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7347 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7348 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7349 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7350 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7351 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7352 /* (const_int 64) */
7353 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7354 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7355 {
7356 /* UMULH/SMULH. */
7357 if (speed)
7358 *cost += extra_cost->mult[mode == DImode].extend;
7359 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7360 mode, MULT, 0, speed);
7361 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7362 mode, MULT, 1, speed);
7363 return true;
7364 }
7365
7366 /* Fall through. */
7367 default:
7368 break;
7369 }
7370
7371 if (dump_file && (dump_flags & TDF_DETAILS))
7372 fprintf (dump_file,
7373 "\nFailed to cost RTX. Assuming default cost.\n");
7374
7375 return true;
7376 }
7377
7378 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7379 calculated for X. This cost is stored in *COST. Returns true
7380 if the total cost of X was calculated. */
7381 static bool
7382 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7383 int param, int *cost, bool speed)
7384 {
7385 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7386
7387 if (dump_file && (dump_flags & TDF_DETAILS))
7388 {
7389 print_rtl_single (dump_file, x);
7390 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7391 speed ? "Hot" : "Cold",
7392 *cost, result ? "final" : "partial");
7393 }
7394
7395 return result;
7396 }
7397
7398 static int
7399 aarch64_register_move_cost (machine_mode mode,
7400 reg_class_t from_i, reg_class_t to_i)
7401 {
7402 enum reg_class from = (enum reg_class) from_i;
7403 enum reg_class to = (enum reg_class) to_i;
7404 const struct cpu_regmove_cost *regmove_cost
7405 = aarch64_tune_params.regmove_cost;
7406
7407 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7408 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7409 to = GENERAL_REGS;
7410
7411 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7412 from = GENERAL_REGS;
7413
7414 /* Moving between GPR and stack cost is the same as GP2GP. */
7415 if ((from == GENERAL_REGS && to == STACK_REG)
7416 || (to == GENERAL_REGS && from == STACK_REG))
7417 return regmove_cost->GP2GP;
7418
7419 /* To/From the stack register, we move via the gprs. */
7420 if (to == STACK_REG || from == STACK_REG)
7421 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7422 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7423
7424 if (GET_MODE_SIZE (mode) == 16)
7425 {
7426 /* 128-bit operations on general registers require 2 instructions. */
7427 if (from == GENERAL_REGS && to == GENERAL_REGS)
7428 return regmove_cost->GP2GP * 2;
7429 else if (from == GENERAL_REGS)
7430 return regmove_cost->GP2FP * 2;
7431 else if (to == GENERAL_REGS)
7432 return regmove_cost->FP2GP * 2;
7433
7434 /* When AdvSIMD instructions are disabled it is not possible to move
7435 a 128-bit value directly between Q registers. This is handled in
7436 secondary reload. A general register is used as a scratch to move
7437 the upper DI value and the lower DI value is moved directly,
7438 hence the cost is the sum of three moves. */
7439 if (! TARGET_SIMD)
7440 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7441
7442 return regmove_cost->FP2FP;
7443 }
7444
7445 if (from == GENERAL_REGS && to == GENERAL_REGS)
7446 return regmove_cost->GP2GP;
7447 else if (from == GENERAL_REGS)
7448 return regmove_cost->GP2FP;
7449 else if (to == GENERAL_REGS)
7450 return regmove_cost->FP2GP;
7451
7452 return regmove_cost->FP2FP;
7453 }
7454
7455 static int
7456 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7457 reg_class_t rclass ATTRIBUTE_UNUSED,
7458 bool in ATTRIBUTE_UNUSED)
7459 {
7460 return aarch64_tune_params.memmov_cost;
7461 }
7462
7463 /* Return true if it is safe and beneficial to use the rsqrt optabs to
7464 optimize 1.0/sqrt. */
7465
7466 static bool
7467 use_rsqrt_p (void)
7468 {
7469 return (!flag_trapping_math
7470 && flag_unsafe_math_optimizations
7471 && ((aarch64_tune_params.extra_tuning_flags
7472 & AARCH64_EXTRA_TUNE_RECIP_SQRT)
7473 || flag_mrecip_low_precision_sqrt));
7474 }
7475
7476 /* Function to decide when to use
7477 reciprocal square root builtins. */
7478
7479 static tree
7480 aarch64_builtin_reciprocal (tree fndecl)
7481 {
7482 if (!use_rsqrt_p ())
7483 return NULL_TREE;
7484 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7485 }
7486
7487 typedef rtx (*rsqrte_type) (rtx, rtx);
7488
7489 /* Select reciprocal square root initial estimate
7490 insn depending on machine mode. */
7491
7492 rsqrte_type
7493 get_rsqrte_type (machine_mode mode)
7494 {
7495 switch (mode)
7496 {
7497 case DFmode: return gen_aarch64_rsqrte_df2;
7498 case SFmode: return gen_aarch64_rsqrte_sf2;
7499 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7500 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7501 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7502 default: gcc_unreachable ();
7503 }
7504 }
7505
7506 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7507
7508 /* Select reciprocal square root Newton-Raphson step
7509 insn depending on machine mode. */
7510
7511 rsqrts_type
7512 get_rsqrts_type (machine_mode mode)
7513 {
7514 switch (mode)
7515 {
7516 case DFmode: return gen_aarch64_rsqrts_df3;
7517 case SFmode: return gen_aarch64_rsqrts_sf3;
7518 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7519 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7520 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7521 default: gcc_unreachable ();
7522 }
7523 }
7524
7525 /* Emit instruction sequence to compute
7526 reciprocal square root. Use two Newton-Raphson steps
7527 for single precision and three for double precision. */
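/* Background note: each step below uses the FRSQRTS instruction, which
   computes (3.0 - a * b) / 2.0, so with x0 = frsqrte (src) a single
   iteration produces
   x1 = x0 * frsqrts (src, x0 * x0)
      = x0 * (3.0 - src * x0 * x0) / 2.0,
   the standard Newton-Raphson refinement of 1/sqrt (src).  */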
7528
7529 void
7530 aarch64_emit_swrsqrt (rtx dst, rtx src)
7531 {
7532 machine_mode mode = GET_MODE (src);
7533 gcc_assert (
7534 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7535 || mode == DFmode || mode == V2DFmode);
7536
7537 rtx xsrc = gen_reg_rtx (mode);
7538 emit_move_insn (xsrc, src);
7539 rtx x0 = gen_reg_rtx (mode);
7540
7541 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7542
7543 bool double_mode = (mode == DFmode || mode == V2DFmode);
7544
7545 int iterations = double_mode ? 3 : 2;
7546
7547 if (flag_mrecip_low_precision_sqrt)
7548 iterations--;
7549
7550 for (int i = 0; i < iterations; ++i)
7551 {
7552 rtx x1 = gen_reg_rtx (mode);
7553 rtx x2 = gen_reg_rtx (mode);
7554 rtx x3 = gen_reg_rtx (mode);
7555 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7556
7557 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7558
7559 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7560 x0 = x1;
7561 }
7562
7563 emit_move_insn (dst, x0);
7564 }
7565
7566 /* Return the number of instructions that can be issued per cycle. */
7567 static int
7568 aarch64_sched_issue_rate (void)
7569 {
7570 return aarch64_tune_params.issue_rate;
7571 }
7572
7573 static int
7574 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7575 {
7576 int issue_rate = aarch64_sched_issue_rate ();
7577
7578 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7579 }
7580
7581
7582 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7583 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7584 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7585
7586 static int
7587 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7588 int ready_index)
7589 {
7590 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7591 }
7592
7593
7594 /* Vectorizer cost model target hooks. */
7595
7596 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7597 static int
7598 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7599 tree vectype,
7600 int misalign ATTRIBUTE_UNUSED)
7601 {
7602 unsigned elements;
7603
7604 switch (type_of_cost)
7605 {
7606 case scalar_stmt:
7607 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7608
7609 case scalar_load:
7610 return aarch64_tune_params.vec_costs->scalar_load_cost;
7611
7612 case scalar_store:
7613 return aarch64_tune_params.vec_costs->scalar_store_cost;
7614
7615 case vector_stmt:
7616 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7617
7618 case vector_load:
7619 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7620
7621 case vector_store:
7622 return aarch64_tune_params.vec_costs->vec_store_cost;
7623
7624 case vec_to_scalar:
7625 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7626
7627 case scalar_to_vec:
7628 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7629
7630 case unaligned_load:
7631 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7632
7633 case unaligned_store:
7634 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7635
7636 case cond_branch_taken:
7637 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7638
7639 case cond_branch_not_taken:
7640 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7641
7642 case vec_perm:
7643 return aarch64_tune_params.vec_costs->vec_permute_cost;
7644
7645 case vec_promote_demote:
7646 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7647
7648 case vec_construct:
7649 elements = TYPE_VECTOR_SUBPARTS (vectype);
7650 return elements / 2 + 1;
7651
7652 default:
7653 gcc_unreachable ();
7654 }
7655 }
7656
7657 /* Implement targetm.vectorize.add_stmt_cost. */
7658 static unsigned
7659 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7660 struct _stmt_vec_info *stmt_info, int misalign,
7661 enum vect_cost_model_location where)
7662 {
7663 unsigned *cost = (unsigned *) data;
7664 unsigned retval = 0;
7665
7666 if (flag_vect_cost_model)
7667 {
7668 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7669 int stmt_cost =
7670 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7671
7672 /* Statements in an inner loop relative to the loop being
7673 vectorized are weighted more heavily. The value here is
7674 arbitrary and could potentially be improved with analysis. */
7675 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7676 count *= 50; /* FIXME */
7677
7678 retval = (unsigned) (count * stmt_cost);
7679 cost[where] += retval;
7680 }
7681
7682 return retval;
7683 }
7684
7685 static void initialize_aarch64_code_model (struct gcc_options *);
7686
7687 /* Enum describing the various ways that the
7688 aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7689 This way their callers can choose what kind of error to give. */
7690
7691 enum aarch64_parse_opt_result
7692 {
7693 AARCH64_PARSE_OK, /* Parsing was successful. */
7694 AARCH64_PARSE_MISSING_ARG, /* Missing argument. */
7695 AARCH64_PARSE_INVALID_FEATURE, /* Invalid feature modifier. */
7696 AARCH64_PARSE_INVALID_ARG /* Invalid arch, tune, cpu arg. */
7697 };
7698
7699 /* Parse the architecture extension string STR and update ISA_FLAGS
7700 with the architecture features turned on or off. Return an
7701 aarch64_parse_opt_result describing the result. */
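/* For example (illustrative only), given STR == "+crc+nocrypto" the loop
   below first sets the flags for the "crc" extension in *ISA_FLAGS and
   then clears the flags for the "crypto" extension; an empty extension
   name yields AARCH64_PARSE_MISSING_ARG and an unrecognised one yields
   AARCH64_PARSE_INVALID_FEATURE.  */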
7702
7703 static enum aarch64_parse_opt_result
7704 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7705 {
7706 /* The extension string is parsed left to right. */
7707 const struct aarch64_option_extension *opt = NULL;
7708
7709 /* Flag to say whether we are adding or removing an extension. */
7710 int adding_ext = -1;
7711
7712 while (str != NULL && *str != 0)
7713 {
7714 char *ext;
7715 size_t len;
7716
7717 str++;
7718 ext = strchr (str, '+');
7719
7720 if (ext != NULL)
7721 len = ext - str;
7722 else
7723 len = strlen (str);
7724
7725 if (len >= 2 && strncmp (str, "no", 2) == 0)
7726 {
7727 adding_ext = 0;
7728 len -= 2;
7729 str += 2;
7730 }
7731 else if (len > 0)
7732 adding_ext = 1;
7733
7734 if (len == 0)
7735 return AARCH64_PARSE_MISSING_ARG;
7736
7737
7738 /* Scan over the extensions table trying to find an exact match. */
7739 for (opt = all_extensions; opt->name != NULL; opt++)
7740 {
7741 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7742 {
7743 /* Add or remove the extension. */
7744 if (adding_ext)
7745 *isa_flags |= opt->flags_on;
7746 else
7747 *isa_flags &= ~(opt->flags_off);
7748 break;
7749 }
7750 }
7751
7752 if (opt->name == NULL)
7753 {
7754 /* Extension not found in list. */
7755 return AARCH64_PARSE_INVALID_FEATURE;
7756 }
7757
7758 str = ext;
7759 }
7760
7761 return AARCH64_PARSE_OK;
7762 }
7763
7764 /* Parse the TO_PARSE string and put the architecture struct that it
7765 selects into RES and the architectural features into ISA_FLAGS.
7766 Return an aarch64_parse_opt_result describing the parse result.
7767 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7768
7769 static enum aarch64_parse_opt_result
7770 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7771 unsigned long *isa_flags)
7772 {
7773 char *ext;
7774 const struct processor *arch;
7775 char *str = (char *) alloca (strlen (to_parse) + 1);
7776 size_t len;
7777
7778 strcpy (str, to_parse);
7779
7780 ext = strchr (str, '+');
7781
7782 if (ext != NULL)
7783 len = ext - str;
7784 else
7785 len = strlen (str);
7786
7787 if (len == 0)
7788 return AARCH64_PARSE_MISSING_ARG;
7789
7790
7791 /* Loop through the list of supported ARCHes to find a match. */
7792 for (arch = all_architectures; arch->name != NULL; arch++)
7793 {
7794 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7795 {
7796 unsigned long isa_temp = arch->flags;
7797
7798 if (ext != NULL)
7799 {
7800 /* TO_PARSE string contains at least one extension. */
7801 enum aarch64_parse_opt_result ext_res
7802 = aarch64_parse_extension (ext, &isa_temp);
7803
7804 if (ext_res != AARCH64_PARSE_OK)
7805 return ext_res;
7806 }
7807 /* Extension parsing was successful. Confirm the result
7808 arch and ISA flags. */
7809 *res = arch;
7810 *isa_flags = isa_temp;
7811 return AARCH64_PARSE_OK;
7812 }
7813 }
7814
7815 /* ARCH name not found in list. */
7816 return AARCH64_PARSE_INVALID_ARG;
7817 }
7818
7819 /* Parse the TO_PARSE string and put the CPU it selects into RES and
7820 its architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
7821 describing the parse result. If there is an error parsing, RES and
7822 ISA_FLAGS are left unchanged. */
7823
7824 static enum aarch64_parse_opt_result
7825 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7826 unsigned long *isa_flags)
7827 {
7828 char *ext;
7829 const struct processor *cpu;
7830 char *str = (char *) alloca (strlen (to_parse) + 1);
7831 size_t len;
7832
7833 strcpy (str, to_parse);
7834
7835 ext = strchr (str, '+');
7836
7837 if (ext != NULL)
7838 len = ext - str;
7839 else
7840 len = strlen (str);
7841
7842 if (len == 0)
7843 return AARCH64_PARSE_MISSING_ARG;
7844
7845
7846 /* Loop through the list of supported CPUs to find a match. */
7847 for (cpu = all_cores; cpu->name != NULL; cpu++)
7848 {
7849 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7850 {
7851 unsigned long isa_temp = cpu->flags;
7852
7853
7854 if (ext != NULL)
7855 {
7856 /* TO_PARSE string contains at least one extension. */
7857 enum aarch64_parse_opt_result ext_res
7858 = aarch64_parse_extension (ext, &isa_temp);
7859
7860 if (ext_res != AARCH64_PARSE_OK)
7861 return ext_res;
7862 }
7863 /* Extension parsing was successful. Confirm the result
7864 cpu and ISA flags. */
7865 *res = cpu;
7866 *isa_flags = isa_temp;
7867 return AARCH64_PARSE_OK;
7868 }
7869 }
7870
7871 /* CPU name not found in list. */
7872 return AARCH64_PARSE_INVALID_ARG;
7873 }
7874
7875 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7876 Return an aarch64_parse_opt_result describing the parse result.
7877 If the parsing fails, RES does not change. */
7878
7879 static enum aarch64_parse_opt_result
7880 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7881 {
7882 const struct processor *cpu;
7883 char *str = (char *) alloca (strlen (to_parse) + 1);
7884
7885 strcpy (str, to_parse);
7886
7887 /* Loop through the list of supported CPUs to find a match. */
7888 for (cpu = all_cores; cpu->name != NULL; cpu++)
7889 {
7890 if (strcmp (cpu->name, str) == 0)
7891 {
7892 *res = cpu;
7893 return AARCH64_PARSE_OK;
7894 }
7895 }
7896
7897 /* CPU name not found in list. */
7898 return AARCH64_PARSE_INVALID_ARG;
7899 }
7900
7901 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7902 described in FLAG. If it is, return the index bit for that fusion type.
7903 If not, error (printing OPTION_NAME) and return zero. */
7904
7905 static unsigned int
7906 aarch64_parse_one_option_token (const char *token,
7907 size_t length,
7908 const struct aarch64_flag_desc *flag,
7909 const char *option_name)
7910 {
7911 for (; flag->name != NULL; flag++)
7912 {
7913 if (length == strlen (flag->name)
7914 && !strncmp (flag->name, token, length))
7915 return flag->flag;
7916 }
7917
7918 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7919 return 0;
7920 }
7921
7922 /* Parse OPTION which is a comma-separated list of flags to enable.
7923 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7924 default state we inherit from the CPU tuning structures. OPTION_NAME
7925 gives the top-level option we are parsing in the -moverride string,
7926 for use in error messages. */
7927
7928 static unsigned int
7929 aarch64_parse_boolean_options (const char *option,
7930 const struct aarch64_flag_desc *flags,
7931 unsigned int initial_state,
7932 const char *option_name)
7933 {
7934 const char separator = '.';
7935 const char* specs = option;
7936 const char* ntoken = option;
7937 unsigned int found_flags = initial_state;
7938
7939 while ((ntoken = strchr (specs, separator)))
7940 {
7941 size_t token_length = ntoken - specs;
7942 unsigned token_ops = aarch64_parse_one_option_token (specs,
7943 token_length,
7944 flags,
7945 option_name);
7946 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7947 in the token stream, reset the supported operations. So:
7948
7949 adrp+add.cmp+branch.none.adrp+add
7950
7951 would have the result of turning on only adrp+add fusion. */
7952 if (!token_ops)
7953 found_flags = 0;
7954
7955 found_flags |= token_ops;
7956 specs = ++ntoken;
7957 }
7958
7959 /* The string ended with a trailing separator; diagnose it as ill-formed. */
7960 if (!(*specs))
7961 {
7962 error ("%s string ill-formed\n", option_name);
7963 return 0;
7964 }
7965
7966 /* We still have one more token to parse. */
7967 size_t token_length = strlen (specs);
7968 unsigned token_ops = aarch64_parse_one_option_token (specs,
7969 token_length,
7970 flags,
7971 option_name);
7972 if (!token_ops)
7973 found_flags = 0;
7974
7975 found_flags |= token_ops;
7976 return found_flags;
7977 }
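
/* Worked example, using the string from the comment above,
   "adrp+add.cmp+branch.none.adrp+add", and assuming "none" maps to the
   zero flag while the other names map to their fusion bits:

     "adrp+add"   -> found_flags |= <adrp+add bit>
     "cmp+branch" -> found_flags |= <cmp+branch bit>
     "none"       -> token_ops == 0, so found_flags is reset to 0
     "adrp+add"   -> found_flags |= <adrp+add bit>

   leaving only adrp+add fusion enabled, as the comment above describes. */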
7978
7979 /* Support for overriding instruction fusion. */
7980
7981 static void
7982 aarch64_parse_fuse_string (const char *fuse_string,
7983 struct tune_params *tune)
7984 {
7985 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7986 aarch64_fusible_pairs,
7987 tune->fusible_ops,
7988 "fuse=");
7989 }
7990
7991 /* Support for overriding other tuning flags. */
7992
7993 static void
7994 aarch64_parse_tune_string (const char *tune_string,
7995 struct tune_params *tune)
7996 {
7997 tune->extra_tuning_flags
7998 = aarch64_parse_boolean_options (tune_string,
7999 aarch64_tuning_flags,
8000 tune->extra_tuning_flags,
8001 "tune=");
8002 }
8003
8004 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8005 we understand. If it is, extract the option string and hand off to
8006 the appropriate function. */
8007
8008 void
8009 aarch64_parse_one_override_token (const char* token,
8010 size_t length,
8011 struct tune_params *tune)
8012 {
8013 const struct aarch64_tuning_override_function *fn
8014 = aarch64_tuning_override_functions;
8015
8016 const char *option_part = strchr (token, '=');
8017 if (!option_part)
8018 {
8019 error ("tuning string missing in option (%s)", token);
8020 return;
8021 }
8022
8023 /* Get the length of the option name. */
8024 length = option_part - token;
8025 /* Skip the '=' to get to the option string. */
8026 option_part++;
8027
8028 for (; fn->name != NULL; fn++)
8029 {
8030 if (!strncmp (fn->name, token, length))
8031 {
8032 fn->parse_override (option_part, tune);
8033 return;
8034 }
8035 }
8036
8037 error ("unknown tuning option (%s)",token);
8038 return;
8039 }
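
/* For example, a TOKEN of "fuse=adrp+add" is split at the '=' into the
   option name "fuse" and the option string "adrp+add"; the name is looked
   up in aarch64_tuning_override_functions (where it presumably maps to
   aarch64_parse_fuse_string above) and the option string is handed to that
   parser together with TUNE. The flag name is illustrative only. */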
8040
8041 /* Validate and clamp the requested TLS size for the selected code model. */
8042
8043 static void
8044 initialize_aarch64_tls_size (struct gcc_options *opts)
8045 {
8046 if (aarch64_tls_size == 0)
8047 aarch64_tls_size = 24;
8048
8049 switch (opts->x_aarch64_cmodel_var)
8050 {
8051 case AARCH64_CMODEL_TINY:
8052 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8053 needs two instructions to address, so we clamp the size to 24. */
8054 if (aarch64_tls_size > 24)
8055 aarch64_tls_size = 24;
8056 break;
8057 case AARCH64_CMODEL_SMALL:
8058 /* The maximum TLS size allowed under small is 4G. */
8059 if (aarch64_tls_size > 32)
8060 aarch64_tls_size = 32;
8061 break;
8062 case AARCH64_CMODEL_LARGE:
8063 /* The maximum TLS size allowed under large is 16E.
8064 FIXME: 16E should be 64 bits; we only support a 48-bit offset now. */
8065 if (aarch64_tls_size > 48)
8066 aarch64_tls_size = 48;
8067 break;
8068 default:
8069 gcc_unreachable ();
8070 }
8071
8072 return;
8073 }
8074
8075 /* Parse STRING looking for options in the format:
8076 string :: option:string
8077 option :: name=substring
8078 name :: {a-z}
8079 substring :: defined by option. */
8080
8081 static void
8082 aarch64_parse_override_string (const char* input_string,
8083 struct tune_params* tune)
8084 {
8085 const char separator = ':';
8086 size_t string_length = strlen (input_string) + 1;
8087 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8088 char *string = string_root;
8089 strncpy (string, input_string, string_length);
8090 string[string_length - 1] = '\0';
8091
8092 char* ntoken = string;
8093
8094 while ((ntoken = strchr (string, separator)))
8095 {
8096 size_t token_length = ntoken - string;
8097 /* Make this substring look like a string. */
8098 *ntoken = '\0';
8099 aarch64_parse_one_override_token (string, token_length, tune);
8100 string = ++ntoken;
8101 }
8102
8103 /* One last option to parse. */
8104 aarch64_parse_one_override_token (string, strlen (string), tune);
8105 free (string_root);
8106 }
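
/* Putting the pieces together, an -moverride string follows the grammar in
   the comment above, e.g. (flag names purely illustrative):

       -moverride=tune=some_flag.another_flag:fuse=adrp+add

   ':' separates top-level options, '=' separates an option name from its
   value, and '.' separates the boolean flags inside a value, as handled by
   aarch64_parse_boolean_options above. */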
8107
8108
8109 static void
8110 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8111 {
8112 if (opts->x_flag_omit_frame_pointer)
8113 opts->x_flag_omit_leaf_frame_pointer = false;
8114 else if (opts->x_flag_omit_leaf_frame_pointer)
8115 opts->x_flag_omit_frame_pointer = true;
8116
8117 /* If not optimizing for size, set the default
8118 alignment to what the target wants. */
8119 if (!opts->x_optimize_size)
8120 {
8121 if (opts->x_align_loops <= 0)
8122 opts->x_align_loops = aarch64_tune_params.loop_align;
8123 if (opts->x_align_jumps <= 0)
8124 opts->x_align_jumps = aarch64_tune_params.jump_align;
8125 if (opts->x_align_functions <= 0)
8126 opts->x_align_functions = aarch64_tune_params.function_align;
8127 }
8128
8129 /* If nopcrelative_literal_loads is set on the command line, this
8130 implies that the user asked for PC relative literal loads. */
8131 if (opts->x_nopcrelative_literal_loads == 1)
8132 aarch64_nopcrelative_literal_loads = false;
8133
8134 /* If it is not set on the command line, we default to no
8135 pc relative literal loads. */
8136 if (opts->x_nopcrelative_literal_loads == 2)
8137 aarch64_nopcrelative_literal_loads = true;
8138
8139 /* In the tiny memory model it makes no sense
8140 to disallow non PC relative literal pool loads
8141 as many other things will break anyway. */
8142 if (opts->x_nopcrelative_literal_loads
8143 && (aarch64_cmodel == AARCH64_CMODEL_TINY
8144 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
8145 aarch64_nopcrelative_literal_loads = false;
8146 }
8147
8148 /* 'Unpack' the internal tuning structs and update the options
8149 in OPTS. The caller must have set up selected_tune and selected_arch
8150 as all the other target-specific codegen decisions are
8151 derived from them. */
8152
8153 void
8154 aarch64_override_options_internal (struct gcc_options *opts)
8155 {
8156 aarch64_tune_flags = selected_tune->flags;
8157 aarch64_tune = selected_tune->sched_core;
8158 /* Make a copy of the tuning parameters attached to the core, which
8159 we may later overwrite. */
8160 aarch64_tune_params = *(selected_tune->tune);
8161 aarch64_architecture_version = selected_arch->architecture_version;
8162
8163 if (opts->x_aarch64_override_tune_string)
8164 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8165 &aarch64_tune_params);
8166
8167 /* This target defaults to strict volatile bitfields. */
8168 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8169 opts->x_flag_strict_volatile_bitfields = 1;
8170
8171 initialize_aarch64_code_model (opts);
8172 initialize_aarch64_tls_size (opts);
8173
8174 int queue_depth = 0;
8175 switch (aarch64_tune_params.autoprefetcher_model)
8176 {
8177 case tune_params::AUTOPREFETCHER_OFF:
8178 queue_depth = -1;
8179 break;
8180 case tune_params::AUTOPREFETCHER_WEAK:
8181 queue_depth = 0;
8182 break;
8183 case tune_params::AUTOPREFETCHER_STRONG:
8184 queue_depth = max_insn_queue_index + 1;
8185 break;
8186 default:
8187 gcc_unreachable ();
8188 }
8189
8190 /* We don't mind passing in global_options_set here as we don't use
8191 the *options_set structs anyway. */
8192 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8193 queue_depth,
8194 opts->x_param_values,
8195 global_options_set.x_param_values);
8196
8197 /* Set the L1 cache line size. */
8198 if (selected_cpu->tune->cache_line_size != 0)
8199 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8200 selected_cpu->tune->cache_line_size,
8201 opts->x_param_values,
8202 global_options_set.x_param_values);
8203
8204 aarch64_override_options_after_change_1 (opts);
8205 }
8206
8207 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8208 specified in STR and throw errors if appropriate. Put the results,
8209 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8210 valid. */
8211
8212 static bool
8213 aarch64_validate_mcpu (const char *str, const struct processor **res,
8214 unsigned long *isa_flags)
8215 {
8216 enum aarch64_parse_opt_result parse_res
8217 = aarch64_parse_cpu (str, res, isa_flags);
8218
8219 if (parse_res == AARCH64_PARSE_OK)
8220 return true;
8221
8222 switch (parse_res)
8223 {
8224 case AARCH64_PARSE_MISSING_ARG:
8225 error ("missing cpu name in -mcpu=%qs", str);
8226 break;
8227 case AARCH64_PARSE_INVALID_ARG:
8228 error ("unknown value %qs for -mcpu", str);
8229 break;
8230 case AARCH64_PARSE_INVALID_FEATURE:
8231 error ("invalid feature modifier in -mcpu=%qs", str);
8232 break;
8233 default:
8234 gcc_unreachable ();
8235 }
8236
8237 return false;
8238 }
8239
8240 /* Validate a command-line -march option. Parse the arch and extensions
8241 (if any) specified in STR and throw errors if appropriate. Put the
8242 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8243 option is valid. */
8244
8245 static bool
8246 aarch64_validate_march (const char *str, const struct processor **res,
8247 unsigned long *isa_flags)
8248 {
8249 enum aarch64_parse_opt_result parse_res
8250 = aarch64_parse_arch (str, res, isa_flags);
8251
8252 if (parse_res == AARCH64_PARSE_OK)
8253 return true;
8254
8255 switch (parse_res)
8256 {
8257 case AARCH64_PARSE_MISSING_ARG:
8258 error ("missing arch name in -march=%qs", str);
8259 break;
8260 case AARCH64_PARSE_INVALID_ARG:
8261 error ("unknown value %qs for -march", str);
8262 break;
8263 case AARCH64_PARSE_INVALID_FEATURE:
8264 error ("invalid feature modifier in -march=%qs", str);
8265 break;
8266 default:
8267 gcc_unreachable ();
8268 }
8269
8270 return false;
8271 }
8272
8273 /* Validate a command-line -mtune option. Parse the cpu
8274 specified in STR and throw errors if appropriate. Put the
8275 result, if it is valid, in RES. Return whether the option is
8276 valid. */
8277
8278 static bool
8279 aarch64_validate_mtune (const char *str, const struct processor **res)
8280 {
8281 enum aarch64_parse_opt_result parse_res
8282 = aarch64_parse_tune (str, res);
8283
8284 if (parse_res == AARCH64_PARSE_OK)
8285 return true;
8286
8287 switch (parse_res)
8288 {
8289 case AARCH64_PARSE_MISSING_ARG:
8290 error ("missing cpu name in -mtune=%qs", str);
8291 break;
8292 case AARCH64_PARSE_INVALID_ARG:
8293 error ("unknown value %qs for -mtune", str);
8294 break;
8295 default:
8296 gcc_unreachable ();
8297 }
8298 return false;
8299 }
8300
8301 /* Return the CPU corresponding to the enum CPU.
8302 If it doesn't specify a cpu, return the default. */
8303
8304 static const struct processor *
8305 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8306 {
8307 if (cpu != aarch64_none)
8308 return &all_cores[cpu];
8309
8310 /* The & 0x3f is to extract the bottom 6 bits that encode the
8311 default cpu as selected by the --with-cpu GCC configure option
8312 in config.gcc.
8313 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8314 flags mechanism should be reworked to make it more sane. */
8315 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8316 }
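
/* As the comment above notes, the bottom 6 bits of TARGET_CPU_DEFAULT
   encode the configure-time CPU; the remaining bits carry that CPU's
   default ISA flags, which is why aarch64_override_options below seeds
   aarch64_isa_flags with TARGET_CPU_DEFAULT >> 6 when no -mcpu or -march
   is given. */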
8317
8318 /* Return the architecture corresponding to the enum ARCH.
8319 If it doesn't specify a valid architecture, return the default. */
8320
8321 static const struct processor *
8322 aarch64_get_arch (enum aarch64_arch arch)
8323 {
8324 if (arch != aarch64_no_arch)
8325 return &all_architectures[arch];
8326
8327 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8328
8329 return &all_architectures[cpu->arch];
8330 }
8331
8332 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
8333 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
8334 tuning structs. In particular it must set selected_tune and
8335 aarch64_isa_flags that define the available ISA features and tuning
8336 decisions. It must also set selected_arch as this will be used to
8337 output the .arch asm tags for each function. */
8338
8339 static void
8340 aarch64_override_options (void)
8341 {
8342 unsigned long cpu_isa = 0;
8343 unsigned long arch_isa = 0;
8344 aarch64_isa_flags = 0;
8345
8346 bool valid_cpu = true;
8347 bool valid_tune = true;
8348 bool valid_arch = true;
8349
8350 selected_cpu = NULL;
8351 selected_arch = NULL;
8352 selected_tune = NULL;
8353
8354 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8355 If either of -march or -mtune is given, they override their
8356 respective component of -mcpu. */
8357 if (aarch64_cpu_string)
8358 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8359 &cpu_isa);
8360
8361 if (aarch64_arch_string)
8362 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8363 &arch_isa);
8364
8365 if (aarch64_tune_string)
8366 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8367
8368 /* If the user did not specify a processor, choose the default
8369 one for them. This will be the CPU set during configuration using
8370 --with-cpu, otherwise it is "generic". */
8371 if (!selected_cpu)
8372 {
8373 if (selected_arch)
8374 {
8375 selected_cpu = &all_cores[selected_arch->ident];
8376 aarch64_isa_flags = arch_isa;
8377 explicit_arch = selected_arch->arch;
8378 }
8379 else
8380 {
8381 /* Get default configure-time CPU. */
8382 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8383 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8384 }
8385
8386 if (selected_tune)
8387 explicit_tune_core = selected_tune->ident;
8388 }
8389 /* If both -mcpu and -march are specified, check that they are architecturally
8390 compatible, warn if they're not and prefer the -march ISA flags. */
8391 else if (selected_arch)
8392 {
8393 if (selected_arch->arch != selected_cpu->arch)
8394 {
8395 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8396 all_architectures[selected_cpu->arch].name,
8397 selected_arch->name);
8398 }
8399 aarch64_isa_flags = arch_isa;
8400 explicit_arch = selected_arch->arch;
8401 explicit_tune_core = selected_tune ? selected_tune->ident
8402 : selected_cpu->ident;
8403 }
8404 else
8405 {
8406 /* -mcpu but no -march. */
8407 aarch64_isa_flags = cpu_isa;
8408 explicit_tune_core = selected_tune ? selected_tune->ident
8409 : selected_cpu->ident;
8410 gcc_assert (selected_cpu);
8411 selected_arch = &all_architectures[selected_cpu->arch];
8412 explicit_arch = selected_arch->arch;
8413 }
8414
8415 /* Set the arch as well, as we will need it when outputting
8416 the .arch directive in assembly. */
8417 if (!selected_arch)
8418 {
8419 gcc_assert (selected_cpu);
8420 selected_arch = &all_architectures[selected_cpu->arch];
8421 }
8422
8423 if (!selected_tune)
8424 selected_tune = selected_cpu;
8425
8426 #ifndef HAVE_AS_MABI_OPTION
8427 /* The compiler may have been configured with 2.23.* binutils, which does
8428 not have support for ILP32. */
8429 if (TARGET_ILP32)
8430 error ("Assembler does not support -mabi=ilp32");
8431 #endif
8432
8433 /* Make sure we properly set up the explicit options. */
8434 if ((aarch64_cpu_string && valid_cpu)
8435 || (aarch64_tune_string && valid_tune))
8436 gcc_assert (explicit_tune_core != aarch64_none);
8437
8438 if ((aarch64_cpu_string && valid_cpu)
8439 || (aarch64_arch_string && valid_arch))
8440 gcc_assert (explicit_arch != aarch64_no_arch);
8441
8442 aarch64_override_options_internal (&global_options);
8443
8444 /* Save these options as the default ones in case we push and pop them later
8445 while processing functions with potential target attributes. */
8446 target_option_default_node = target_option_current_node
8447 = build_target_option_node (&global_options);
8448
8449 aarch64_register_fma_steering ();
8450
8451 }
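
/* Summarising the precedence implemented above (an illustrative sketch,
   with X, Y, Z as placeholder names):

     -mcpu=X            -> arch, tuning and ISA flags all come from X.
     -mcpu=X -march=Y   -> ISA flags and arch come from Y, tuning from X;
                           a warning is issued if X's architecture differs
                           from Y.
     -mcpu=X -mtune=Z   -> ISA flags and arch come from X, tuning from Z.
     (nothing)          -> the configure-time --with-cpu default, or
                           "generic" if none was given. */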
8452
8453 /* Implement targetm.override_options_after_change. */
8454
8455 static void
8456 aarch64_override_options_after_change (void)
8457 {
8458 aarch64_override_options_after_change_1 (&global_options);
8459 }
8460
8461 static struct machine_function *
8462 aarch64_init_machine_status (void)
8463 {
8464 struct machine_function *machine;
8465 machine = ggc_cleared_alloc<machine_function> ();
8466 return machine;
8467 }
8468
8469 void
8470 aarch64_init_expanders (void)
8471 {
8472 init_machine_status = aarch64_init_machine_status;
8473 }
8474
8475 /* Set up the global code model (aarch64_cmodel), taking -fpic/-fPIC into account. */
8476 static void
8477 initialize_aarch64_code_model (struct gcc_options *opts)
8478 {
8479 if (opts->x_flag_pic)
8480 {
8481 switch (opts->x_aarch64_cmodel_var)
8482 {
8483 case AARCH64_CMODEL_TINY:
8484 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8485 break;
8486 case AARCH64_CMODEL_SMALL:
8487 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8488 aarch64_cmodel = (flag_pic == 2
8489 ? AARCH64_CMODEL_SMALL_PIC
8490 : AARCH64_CMODEL_SMALL_SPIC);
8491 #else
8492 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8493 #endif
8494 break;
8495 case AARCH64_CMODEL_LARGE:
8496 sorry ("code model %qs with -f%s", "large",
8497 opts->x_flag_pic > 1 ? "PIC" : "pic");
8498 break;
8499 default:
8500 gcc_unreachable ();
8501 }
8502 }
8503 else
8504 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8505 }
8506
8507 /* Implement TARGET_OPTION_SAVE. */
8508
8509 static void
8510 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8511 {
8512 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8513 }
8514
8515 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8516 using the information saved in PTR. */
8517
8518 static void
8519 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8520 {
8521 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8522 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8523 opts->x_explicit_arch = ptr->x_explicit_arch;
8524 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8525 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8526
8527 aarch64_override_options_internal (opts);
8528 }
8529
8530 /* Implement TARGET_OPTION_PRINT. */
8531
8532 static void
8533 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8534 {
8535 const struct processor *cpu
8536 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8537 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8538 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8539 std::string extension
8540 = aarch64_get_extension_string_for_isa_flags (isa_flags);
8541
8542 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8543 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8544 arch->name, extension.c_str ());
8545 }
8546
8547 static GTY(()) tree aarch64_previous_fndecl;
8548
8549 void
8550 aarch64_reset_previous_fndecl (void)
8551 {
8552 aarch64_previous_fndecl = NULL;
8553 }
8554
8555 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8556 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8557 of the function, if such exists. This function may be called multiple
8558 times on a single function so use aarch64_previous_fndecl to avoid
8559 setting up identical state. */
8560
8561 static void
8562 aarch64_set_current_function (tree fndecl)
8563 {
8564 tree old_tree = (aarch64_previous_fndecl
8565 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8566 : NULL_TREE);
8567
8568 tree new_tree = (fndecl
8569 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
8570 : NULL_TREE);
8571
8572
8573 if (fndecl && fndecl != aarch64_previous_fndecl)
8574 {
8575 aarch64_previous_fndecl = fndecl;
8576 if (old_tree == new_tree)
8577 ;
8578
8579 else if (new_tree && new_tree != target_option_default_node)
8580 {
8581 cl_target_option_restore (&global_options,
8582 TREE_TARGET_OPTION (new_tree));
8583 if (TREE_TARGET_GLOBALS (new_tree))
8584 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8585 else
8586 TREE_TARGET_GLOBALS (new_tree)
8587 = save_target_globals_default_opts ();
8588 }
8589
8590 else if (old_tree && old_tree != target_option_default_node)
8591 {
8592 new_tree = target_option_current_node;
8593 cl_target_option_restore (&global_options,
8594 TREE_TARGET_OPTION (new_tree));
8595 if (TREE_TARGET_GLOBALS (new_tree))
8596 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8597 else if (new_tree == target_option_default_node)
8598 restore_target_globals (&default_target_globals);
8599 else
8600 TREE_TARGET_GLOBALS (new_tree)
8601 = save_target_globals_default_opts ();
8602 }
8603 }
8604
8605 if (!fndecl)
8606 return;
8607
8608 /* If we turned on SIMD make sure that any vector parameters are re-laid out
8609 so that they use proper vector modes. */
8610 if (TARGET_SIMD)
8611 {
8612 tree parms = DECL_ARGUMENTS (fndecl);
8613 for (; parms && parms != void_list_node; parms = TREE_CHAIN (parms))
8614 {
8615 if (TREE_CODE (parms) == PARM_DECL
8616 && VECTOR_TYPE_P (TREE_TYPE (parms))
8617 && DECL_MODE (parms) != TYPE_MODE (TREE_TYPE (parms)))
8618 relayout_decl (parms);
8619 }
8620 }
8621 }
8622
8623 /* Enum describing the various ways we can handle attributes.
8624 In many cases we can reuse the generic option handling machinery. */
8625
8626 enum aarch64_attr_opt_type
8627 {
8628 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8629 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8630 aarch64_attr_enum, /* Attribute sets an enum variable. */
8631 aarch64_attr_custom /* Attribute requires a custom handling function. */
8632 };
8633
8634 /* All the information needed to handle a target attribute.
8635 NAME is the name of the attribute.
8636 ATTR_TYPE specifies the type of behavior of the attribute as described
8637 in the definition of enum aarch64_attr_opt_type.
8638 ALLOW_NEG is true if the attribute supports a "no-" form.
8639 HANDLER is the function that takes the attribute string and whether
8640 it is a pragma or attribute and handles the option. It is needed only
8641 when the ATTR_TYPE is aarch64_attr_custom.
8642 OPT_NUM is the enum specifying the option that the attribute modifies.
8643 This is needed for attributes that mirror the behavior of a command-line
8644 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8645 aarch64_attr_enum. */
8646
8647 struct aarch64_attribute_info
8648 {
8649 const char *name;
8650 enum aarch64_attr_opt_type attr_type;
8651 bool allow_neg;
8652 bool (*handler) (const char *, const char *);
8653 enum opt_code opt_num;
8654 };
8655
8656 /* Handle the ARCH_STR argument to the arch= target attribute.
8657 PRAGMA_OR_ATTR is used in potential error messages. */
8658
8659 static bool
8660 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8661 {
8662 const struct processor *tmp_arch = NULL;
8663 enum aarch64_parse_opt_result parse_res
8664 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8665
8666 if (parse_res == AARCH64_PARSE_OK)
8667 {
8668 gcc_assert (tmp_arch);
8669 selected_arch = tmp_arch;
8670 explicit_arch = selected_arch->arch;
8671 return true;
8672 }
8673
8674 switch (parse_res)
8675 {
8676 case AARCH64_PARSE_MISSING_ARG:
8677 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8678 break;
8679 case AARCH64_PARSE_INVALID_ARG:
8680 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8681 break;
8682 case AARCH64_PARSE_INVALID_FEATURE:
8683 error ("invalid feature modifier %qs for 'arch' target %s",
8684 str, pragma_or_attr);
8685 break;
8686 default:
8687 gcc_unreachable ();
8688 }
8689
8690 return false;
8691 }
8692
8693 /* Handle the argument CPU_STR to the cpu= target attribute.
8694 PRAGMA_OR_ATTR is used in potential error messages. */
8695
8696 static bool
8697 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8698 {
8699 const struct processor *tmp_cpu = NULL;
8700 enum aarch64_parse_opt_result parse_res
8701 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8702
8703 if (parse_res == AARCH64_PARSE_OK)
8704 {
8705 gcc_assert (tmp_cpu);
8706 selected_tune = tmp_cpu;
8707 explicit_tune_core = selected_tune->ident;
8708
8709 selected_arch = &all_architectures[tmp_cpu->arch];
8710 explicit_arch = selected_arch->arch;
8711 return true;
8712 }
8713
8714 switch (parse_res)
8715 {
8716 case AARCH64_PARSE_MISSING_ARG:
8717 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8718 break;
8719 case AARCH64_PARSE_INVALID_ARG:
8720 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8721 break;
8722 case AARCH64_PARSE_INVALID_FEATURE:
8723 error ("invalid feature modifier %qs for 'cpu' target %s",
8724 str, pragma_or_attr);
8725 break;
8726 default:
8727 gcc_unreachable ();
8728 }
8729
8730 return false;
8731 }
8732
8733 /* Handle the argument STR to the tune= target attribute.
8734 PRAGMA_OR_ATTR is used in potential error messages. */
8735
8736 static bool
8737 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8738 {
8739 const struct processor *tmp_tune = NULL;
8740 enum aarch64_parse_opt_result parse_res
8741 = aarch64_parse_tune (str, &tmp_tune);
8742
8743 if (parse_res == AARCH64_PARSE_OK)
8744 {
8745 gcc_assert (tmp_tune);
8746 selected_tune = tmp_tune;
8747 explicit_tune_core = selected_tune->ident;
8748 return true;
8749 }
8750
8751 switch (parse_res)
8752 {
8753 case AARCH64_PARSE_INVALID_ARG:
8754 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8755 break;
8756 default:
8757 gcc_unreachable ();
8758 }
8759
8760 return false;
8761 }
8762
8763 /* Parse an architecture extensions target attribute string specified in STR.
8764 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8765 if successful. Update aarch64_isa_flags to reflect the ISA features
8766 modified.
8767 PRAGMA_OR_ATTR is used in potential error messages. */
8768
8769 static bool
8770 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8771 {
8772 enum aarch64_parse_opt_result parse_res;
8773 unsigned long isa_flags = aarch64_isa_flags;
8774
8775 /* We allow "+nothing" in the beginning to clear out all architectural
8776 features if the user wants to handpick specific features. */
8777 if (strncmp ("+nothing", str, 8) == 0)
8778 {
8779 isa_flags = 0;
8780 str += 8;
8781 }
8782
8783 parse_res = aarch64_parse_extension (str, &isa_flags);
8784
8785 if (parse_res == AARCH64_PARSE_OK)
8786 {
8787 aarch64_isa_flags = isa_flags;
8788 return true;
8789 }
8790
8791 switch (parse_res)
8792 {
8793 case AARCH64_PARSE_MISSING_ARG:
8794 error ("missing feature modifier in target %s %qs",
8795 pragma_or_attr, str);
8796 break;
8797
8798 case AARCH64_PARSE_INVALID_FEATURE:
8799 error ("invalid feature modifier in target %s %qs",
8800 pragma_or_attr, str);
8801 break;
8802
8803 default:
8804 gcc_unreachable ();
8805 }
8806
8807 return false;
8808 }
8809
8810 /* The target attributes that we support. On top of these we also support just
8811 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8812 handled explicitly in aarch64_process_one_target_attr. */
8813
8814 static const struct aarch64_attribute_info aarch64_attributes[] =
8815 {
8816 { "general-regs-only", aarch64_attr_mask, false, NULL,
8817 OPT_mgeneral_regs_only },
8818 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8819 OPT_mfix_cortex_a53_835769 },
8820 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8821 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8822 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8823 OPT_momit_leaf_frame_pointer },
8824 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8825 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8826 OPT_march_ },
8827 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8828 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8829 OPT_mtune_ },
8830 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8831 };
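
/* Example uses of the entries above (illustrative only):

     __attribute__ ((target ("general-regs-only")))
     __attribute__ ((target ("no-fix-cortex-a53-835769")))   - allow_neg form
     __attribute__ ((target ("cmodel=small")))               - enum, needs an argument
     __attribute__ ((target ("arch=armv8-a+crc")))           - custom handler

   plus the bare ISA-extension form ("+crc", "+nothing+fp", ...) mentioned
   in the comment above the table. */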
8832
8833 /* Parse ARG_STR which contains the definition of one target attribute.
8834 Show appropriate errors if any or return true if the attribute is valid.
8835 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8836 we're processing a target attribute or pragma. */
8837
8838 static bool
8839 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8840 {
8841 bool invert = false;
8842
8843 size_t len = strlen (arg_str);
8844
8845 if (len == 0)
8846 {
8847 error ("malformed target %s", pragma_or_attr);
8848 return false;
8849 }
8850
8851 char *str_to_check = (char *) alloca (len + 1);
8852 strcpy (str_to_check, arg_str);
8853
8854 /* Skip leading whitespace. */
8855 while (*str_to_check == ' ' || *str_to_check == '\t')
8856 str_to_check++;
8857
8858 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8859 It is easier to detect and handle it explicitly here rather than going
8860 through the machinery for the rest of the target attributes in this
8861 function. */
8862 if (*str_to_check == '+')
8863 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8864
8865 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8866 {
8867 invert = true;
8868 str_to_check += 3;
8869 }
8870 char *arg = strchr (str_to_check, '=');
8871
8872 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8873 and point ARG to "foo". */
8874 if (arg)
8875 {
8876 *arg = '\0';
8877 arg++;
8878 }
8879 const struct aarch64_attribute_info *p_attr;
8880 bool found = false;
8881 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8882 {
8883 /* If the names don't match up, or the user has given an argument
8884 to an attribute that doesn't accept one, or didn't give an argument
8885 to an attribute that expects one, fail to match. */
8886 if (strcmp (str_to_check, p_attr->name) != 0)
8887 continue;
8888
8889 found = true;
8890 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8891 || p_attr->attr_type == aarch64_attr_enum;
8892
8893 if (attr_need_arg_p ^ (arg != NULL))
8894 {
8895 error ("target %s %qs does not accept an argument",
8896 pragma_or_attr, str_to_check);
8897 return false;
8898 }
8899
8900 /* If the name matches but the attribute does not allow "no-" versions
8901 then we can't match. */
8902 if (invert && !p_attr->allow_neg)
8903 {
8904 error ("target %s %qs does not allow a negated form",
8905 pragma_or_attr, str_to_check);
8906 return false;
8907 }
8908
8909 switch (p_attr->attr_type)
8910 {
8911 /* Has a custom handler registered.
8912 For example, cpu=, arch=, tune=. */
8913 case aarch64_attr_custom:
8914 gcc_assert (p_attr->handler);
8915 if (!p_attr->handler (arg, pragma_or_attr))
8916 return false;
8917 break;
8918
8919 /* Either set or unset a boolean option. */
8920 case aarch64_attr_bool:
8921 {
8922 struct cl_decoded_option decoded;
8923
8924 generate_option (p_attr->opt_num, NULL, !invert,
8925 CL_TARGET, &decoded);
8926 aarch64_handle_option (&global_options, &global_options_set,
8927 &decoded, input_location);
8928 break;
8929 }
8930 /* Set or unset a bit in the target_flags. aarch64_handle_option
8931 should know what mask to apply given the option number. */
8932 case aarch64_attr_mask:
8933 {
8934 struct cl_decoded_option decoded;
8935 /* We only need to specify the option number.
8936 aarch64_handle_option will know which mask to apply. */
8937 decoded.opt_index = p_attr->opt_num;
8938 decoded.value = !invert;
8939 aarch64_handle_option (&global_options, &global_options_set,
8940 &decoded, input_location);
8941 break;
8942 }
8943 /* Use the option setting machinery to set an option to an enum. */
8944 case aarch64_attr_enum:
8945 {
8946 gcc_assert (arg);
8947 bool valid;
8948 int value;
8949 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8950 &value, CL_TARGET);
8951 if (valid)
8952 {
8953 set_option (&global_options, NULL, p_attr->opt_num, value,
8954 NULL, DK_UNSPECIFIED, input_location,
8955 global_dc);
8956 }
8957 else
8958 {
8959 error ("target %s %s=%s is not valid",
8960 pragma_or_attr, str_to_check, arg);
8961 }
8962 break;
8963 }
8964 default:
8965 gcc_unreachable ();
8966 }
8967 }
8968
8969 /* If we reached here we either have found an attribute and validated
8970 it or didn't match any. If we matched an attribute but its arguments
8971 were malformed we will have returned false already. */
8972 return found;
8973 }
8974
8975 /* Count how many times the character C appears in
8976 NULL-terminated string STR. */
8977
8978 static unsigned int
8979 num_occurences_in_str (char c, char *str)
8980 {
8981 unsigned int res = 0;
8982 while (*str != '\0')
8983 {
8984 if (*str == c)
8985 res++;
8986
8987 str++;
8988 }
8989
8990 return res;
8991 }
8992
8993 /* Parse the tree in ARGS that contains the target attribute information
8994 and update the global target options space. PRAGMA_OR_ATTR is a string
8995 to be used in error messages, specifying whether this is processing
8996 a target attribute or a target pragma. */
8997
8998 bool
8999 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9000 {
9001 if (TREE_CODE (args) == TREE_LIST)
9002 {
9003 do
9004 {
9005 tree head = TREE_VALUE (args);
9006 if (head)
9007 {
9008 if (!aarch64_process_target_attr (head, pragma_or_attr))
9009 return false;
9010 }
9011 args = TREE_CHAIN (args);
9012 } while (args);
9013
9014 return true;
9015 }
9016 /* We expect to find a string to parse. */
9017 gcc_assert (TREE_CODE (args) == STRING_CST);
9018
9019 size_t len = strlen (TREE_STRING_POINTER (args));
9020 char *str_to_check = (char *) alloca (len + 1);
9021 strcpy (str_to_check, TREE_STRING_POINTER (args));
9022
9023 if (len == 0)
9024 {
9025 error ("malformed target %s value", pragma_or_attr);
9026 return false;
9027 }
9028
9029 /* Used to catch empty entries between commas, i.e.
9030 attribute ((target ("attr1,,attr2"))). */
9031 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9032
9033 /* Handle multiple target attributes separated by ','. */
9034 char *token = strtok (str_to_check, ",");
9035
9036 unsigned int num_attrs = 0;
9037 while (token)
9038 {
9039 num_attrs++;
9040 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9041 {
9042 error ("target %s %qs is invalid", pragma_or_attr, token);
9043 return false;
9044 }
9045
9046 token = strtok (NULL, ",");
9047 }
9048
9049 if (num_attrs != num_commas + 1)
9050 {
9051 error ("malformed target %s list %qs",
9052 pragma_or_attr, TREE_STRING_POINTER (args));
9053 return false;
9054 }
9055
9056 return true;
9057 }
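
/* Example (attribute names illustrative): for
   __attribute__ ((target ("arch=armv8-a,strict-align"))) the string is
   split at ',' into "arch=armv8-a" and "strict-align", each handled by
   aarch64_process_one_target_attr above. A string like "attr1,,attr2"
   yields two tokens from strtok but contains two commas, so the
   num_attrs != num_commas + 1 check rejects it as a malformed list. */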
9058
9059 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9060 process attribute ((target ("..."))). */
9061
9062 static bool
9063 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9064 {
9065 struct cl_target_option cur_target;
9066 bool ret;
9067 tree old_optimize;
9068 tree new_target, new_optimize;
9069 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9070
9071 /* If what we're processing is the current pragma string then the
9072 target option node is already stored in target_option_current_node
9073 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9074 having to re-parse the string. This is especially useful to keep
9075 arm_neon.h compile times down since that header contains a lot
9076 of intrinsics enclosed in pragmas. */
9077 if (!existing_target && args == current_target_pragma)
9078 {
9079 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9080 return true;
9081 }
9082 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9083
9084 old_optimize = build_optimization_node (&global_options);
9085 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9086
9087 /* If the function changed the optimization levels as well as setting
9088 target options, start with the optimizations specified. */
9089 if (func_optimize && func_optimize != old_optimize)
9090 cl_optimization_restore (&global_options,
9091 TREE_OPTIMIZATION (func_optimize));
9092
9093 /* Save the current target options to restore at the end. */
9094 cl_target_option_save (&cur_target, &global_options);
9095
9096 /* If fndecl already has some target attributes applied to it, unpack
9097 them so that we add this attribute on top of them, rather than
9098 overwriting them. */
9099 if (existing_target)
9100 {
9101 struct cl_target_option *existing_options
9102 = TREE_TARGET_OPTION (existing_target);
9103
9104 if (existing_options)
9105 cl_target_option_restore (&global_options, existing_options);
9106 }
9107 else
9108 cl_target_option_restore (&global_options,
9109 TREE_TARGET_OPTION (target_option_current_node));
9110
9111
9112 ret = aarch64_process_target_attr (args, "attribute");
9113
9114 /* Set up any additional state. */
9115 if (ret)
9116 {
9117 aarch64_override_options_internal (&global_options);
9118 /* Initialize SIMD builtins if we haven't already.
9119 Set current_target_pragma to NULL for the duration so that
9120 the builtin initialization code doesn't try to tag the functions
9121 being built with the attributes specified by any current pragma, thus
9122 going into an infinite recursion. */
9123 if (TARGET_SIMD)
9124 {
9125 tree saved_current_target_pragma = current_target_pragma;
9126 current_target_pragma = NULL;
9127 aarch64_init_simd_builtins ();
9128 current_target_pragma = saved_current_target_pragma;
9129 }
9130 new_target = build_target_option_node (&global_options);
9131 }
9132 else
9133 new_target = NULL;
9134
9135 new_optimize = build_optimization_node (&global_options);
9136
9137 if (fndecl && ret)
9138 {
9139 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9140
9141 if (old_optimize != new_optimize)
9142 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9143 }
9144
9145 cl_target_option_restore (&global_options, &cur_target);
9146
9147 if (old_optimize != new_optimize)
9148 cl_optimization_restore (&global_options,
9149 TREE_OPTIMIZATION (old_optimize));
9150 return ret;
9151 }
9152
9153 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9154 tri-bool options (yes, no, don't care) and the default value is
9155 DEF, determine whether to reject inlining. */
9156
9157 static bool
9158 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9159 int dont_care, int def)
9160 {
9161 /* If the callee doesn't care, always allow inlining. */
9162 if (callee == dont_care)
9163 return true;
9164
9165 /* If the caller doesn't care, always allow inlining. */
9166 if (caller == dont_care)
9167 return true;
9168
9169 /* Otherwise, allow inlining if either the callee and caller values
9170 agree, or if the callee is using the default value. */
9171 return (callee == caller || callee == def);
9172 }
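
/* Truth sketch of the helper above (1 = allow inlining):

     callee == DONT_CARE   -> 1
     caller == DONT_CARE   -> 1
     callee == caller      -> 1
     callee == DEF         -> 1
     otherwise             -> 0

   i.e. inlining is rejected only when both values are explicit, they
   differ, and the callee's value is not the default. */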
9173
9174 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9175 to inline CALLEE into CALLER based on target-specific info.
9176 Make sure that the caller and callee have compatible architectural
9177 features. Then go through the other possible target attributes
9178 and see if they can block inlining. Try not to reject always_inline
9179 callees unless they are incompatible architecturally. */
9180
9181 static bool
9182 aarch64_can_inline_p (tree caller, tree callee)
9183 {
9184 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9185 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9186
9187 /* If callee has no option attributes, then it is ok to inline. */
9188 if (!callee_tree)
9189 return true;
9190
9191 struct cl_target_option *caller_opts
9192 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9193 : target_option_default_node);
9194
9195 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9196
9197
9198 /* Callee's ISA flags should be a subset of the caller's. */
9199 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9200 != callee_opts->x_aarch64_isa_flags)
9201 return false;
9202
9203 /* Allow non-strict-aligned functions to be inlined into strict-aligned
9204 callers, but not the other way around. */
9205 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9206 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9207 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9208 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9209 return false;
9210
9211 bool always_inline = lookup_attribute ("always_inline",
9212 DECL_ATTRIBUTES (callee));
9213
9214 /* If the architectural features match up and the callee is always_inline
9215 then the other attributes don't matter. */
9216 if (always_inline)
9217 return true;
9218
9219 if (caller_opts->x_aarch64_cmodel_var
9220 != callee_opts->x_aarch64_cmodel_var)
9221 return false;
9222
9223 if (caller_opts->x_aarch64_tls_dialect
9224 != callee_opts->x_aarch64_tls_dialect)
9225 return false;
9226
9227 /* Honour explicit requests to work around errata. */
9228 if (!aarch64_tribools_ok_for_inlining_p (
9229 caller_opts->x_aarch64_fix_a53_err835769,
9230 callee_opts->x_aarch64_fix_a53_err835769,
9231 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9232 return false;
9233
9234 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9235 caller and callee and they don't match up, reject inlining. */
9236 if (!aarch64_tribools_ok_for_inlining_p (
9237 caller_opts->x_flag_omit_leaf_frame_pointer,
9238 callee_opts->x_flag_omit_leaf_frame_pointer,
9239 2, 1))
9240 return false;
9241
9242 /* If the callee has specific tuning overrides, respect them. */
9243 if (callee_opts->x_aarch64_override_tune_string != NULL
9244 && caller_opts->x_aarch64_override_tune_string == NULL)
9245 return false;
9246
9247 /* If the user specified tuning override strings for the
9248 caller and callee and they don't match up, reject inlining.
9249 We just do a string compare here, we don't analyze the meaning
9250 of the string, as it would be too costly for little gain. */
9251 if (callee_opts->x_aarch64_override_tune_string
9252 && caller_opts->x_aarch64_override_tune_string
9253 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9254 caller_opts->x_aarch64_override_tune_string) != 0))
9255 return false;
9256
9257 return true;
9258 }
9259
9260 /* Return true if SYMBOL_REF X binds locally. */
9261
9262 static bool
9263 aarch64_symbol_binds_local_p (const_rtx x)
9264 {
9265 return (SYMBOL_REF_DECL (x)
9266 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9267 : SYMBOL_REF_LOCAL_P (x));
9268 }
9269
9270 /* Return true if SYMBOL_REF X is thread-local. */
9271 static bool
9272 aarch64_tls_symbol_p (rtx x)
9273 {
9274 if (! TARGET_HAVE_TLS)
9275 return false;
9276
9277 if (GET_CODE (x) != SYMBOL_REF)
9278 return false;
9279
9280 return SYMBOL_REF_TLS_MODEL (x) != 0;
9281 }
9282
9283 /* Classify a TLS symbol into one of the TLS kinds. */
9284 enum aarch64_symbol_type
9285 aarch64_classify_tls_symbol (rtx x)
9286 {
9287 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9288
9289 switch (tls_kind)
9290 {
9291 case TLS_MODEL_GLOBAL_DYNAMIC:
9292 case TLS_MODEL_LOCAL_DYNAMIC:
9293 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9294
9295 case TLS_MODEL_INITIAL_EXEC:
9296 switch (aarch64_cmodel)
9297 {
9298 case AARCH64_CMODEL_TINY:
9299 case AARCH64_CMODEL_TINY_PIC:
9300 return SYMBOL_TINY_TLSIE;
9301 default:
9302 return SYMBOL_SMALL_TLSIE;
9303 }
9304
9305 case TLS_MODEL_LOCAL_EXEC:
9306 if (aarch64_tls_size == 12)
9307 return SYMBOL_TLSLE12;
9308 else if (aarch64_tls_size == 24)
9309 return SYMBOL_TLSLE24;
9310 else if (aarch64_tls_size == 32)
9311 return SYMBOL_TLSLE32;
9312 else if (aarch64_tls_size == 48)
9313 return SYMBOL_TLSLE48;
9314 else
9315 gcc_unreachable ();
9316
9317 case TLS_MODEL_EMULATED:
9318 case TLS_MODEL_NONE:
9319 return SYMBOL_FORCE_TO_MEM;
9320
9321 default:
9322 gcc_unreachable ();
9323 }
9324 }
9325
9326 /* Return the method that should be used to access SYMBOL_REF or
9327 LABEL_REF X. */
9328
9329 enum aarch64_symbol_type
9330 aarch64_classify_symbol (rtx x, rtx offset)
9331 {
9332 if (GET_CODE (x) == LABEL_REF)
9333 {
9334 switch (aarch64_cmodel)
9335 {
9336 case AARCH64_CMODEL_LARGE:
9337 return SYMBOL_FORCE_TO_MEM;
9338
9339 case AARCH64_CMODEL_TINY_PIC:
9340 case AARCH64_CMODEL_TINY:
9341 return SYMBOL_TINY_ABSOLUTE;
9342
9343 case AARCH64_CMODEL_SMALL_SPIC:
9344 case AARCH64_CMODEL_SMALL_PIC:
9345 case AARCH64_CMODEL_SMALL:
9346 return SYMBOL_SMALL_ABSOLUTE;
9347
9348 default:
9349 gcc_unreachable ();
9350 }
9351 }
9352
9353 if (GET_CODE (x) == SYMBOL_REF)
9354 {
9355 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
9356 {
9357 /* This is fine even in PIC code, as the constant
9358 pool reference is always PC relative and within
9359 the same translation unit. */
9360 if (nopcrelative_literal_loads
9361 && CONSTANT_POOL_ADDRESS_P (x))
9362 return SYMBOL_SMALL_ABSOLUTE;
9363 else
9364 return SYMBOL_FORCE_TO_MEM;
9365 }
9366
9367 if (aarch64_tls_symbol_p (x))
9368 return aarch64_classify_tls_symbol (x);
9369
9370 switch (aarch64_cmodel)
9371 {
9372 case AARCH64_CMODEL_TINY:
9373 /* When we retrieve a symbol + offset address, we have to make sure
9374 the offset does not cause overflow of the final address. But
9375 we have no way of knowing the address of the symbol at compile time,
9376 so we can't accurately say if the distance between the PC and
9377 symbol + offset is outside the addressable range of +/-1M in the
9378 TINY code model. So we rely on images not being greater than
9379 1M, cap the offset at 1M, and anything beyond that will have to
9380 be loaded using an alternative mechanism. */
9381 if (SYMBOL_REF_WEAK (x)
9382 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9383 return SYMBOL_FORCE_TO_MEM;
9384 return SYMBOL_TINY_ABSOLUTE;
9385
9386 case AARCH64_CMODEL_SMALL:
9387 /* Same reasoning as the tiny code model, but the offset cap here is
9388 4G. */
9389 if (SYMBOL_REF_WEAK (x)
9390 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9391 HOST_WIDE_INT_C (4294967264)))
9392 return SYMBOL_FORCE_TO_MEM;
9393 return SYMBOL_SMALL_ABSOLUTE;
9394
9395 case AARCH64_CMODEL_TINY_PIC:
9396 if (!aarch64_symbol_binds_local_p (x))
9397 return SYMBOL_TINY_GOT;
9398 return SYMBOL_TINY_ABSOLUTE;
9399
9400 case AARCH64_CMODEL_SMALL_SPIC:
9401 case AARCH64_CMODEL_SMALL_PIC:
9402 if (!aarch64_symbol_binds_local_p (x))
9403 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9404 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9405 return SYMBOL_SMALL_ABSOLUTE;
9406
9407 default:
9408 gcc_unreachable ();
9409 }
9410 }
9411
9412 /* By default push everything into the constant pool. */
9413 return SYMBOL_FORCE_TO_MEM;
9414 }
9415
9416 bool
9417 aarch64_constant_address_p (rtx x)
9418 {
9419 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9420 }
9421
9422 bool
9423 aarch64_legitimate_pic_operand_p (rtx x)
9424 {
9425 if (GET_CODE (x) == SYMBOL_REF
9426 || (GET_CODE (x) == CONST
9427 && GET_CODE (XEXP (x, 0)) == PLUS
9428 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9429 return false;
9430
9431 return true;
9432 }
9433
9434 /* Return true if X holds either a valid quarter-precision immediate
9435 or a floating-point +0.0 constant. */
9436 static bool
9437 aarch64_valid_floating_const (machine_mode mode, rtx x)
9438 {
9439 if (!CONST_DOUBLE_P (x))
9440 return false;
9441
9442 if (aarch64_float_const_zero_rtx_p (x))
9443 return true;
9444
9445 /* Apart from 0.0 (handled above), we only handle SFmode and DFmode constants. */
9446 if (!(mode == SFmode || mode == DFmode))
9447 return false;
9448
9449 return aarch64_float_const_representable_p (x);
9450 }
9451
9452 static bool
9453 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9454 {
9455 /* Do not allow vector struct mode constants. We could support
9456 0 and -1 easily, but they need support in aarch64-simd.md. */
9457 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9458 return false;
9459
9460 /* This could probably go away because
9461 we now decompose CONST_INTs according to expand_mov_immediate. */
9462 if ((GET_CODE (x) == CONST_VECTOR
9463 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9464 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9465 return !targetm.cannot_force_const_mem (mode, x);
9466
9467 if (GET_CODE (x) == HIGH
9468 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9469 return true;
9470
9471 return aarch64_constant_address_p (x);
9472 }
9473
9474 rtx
9475 aarch64_load_tp (rtx target)
9476 {
9477 if (!target
9478 || GET_MODE (target) != Pmode
9479 || !register_operand (target, Pmode))
9480 target = gen_reg_rtx (Pmode);
9481
9482 /* Can return in any reg. */
9483 emit_insn (gen_aarch64_load_tp_hard (target));
9484 return target;
9485 }
9486
9487 /* On AAPCS systems, this is the "struct __va_list". */
9488 static GTY(()) tree va_list_type;
9489
9490 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9491 Return the type to use as __builtin_va_list.
9492
9493 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9494
9495 struct __va_list
9496 {
9497 void *__stack;
9498 void *__gr_top;
9499 void *__vr_top;
9500 int __gr_offs;
9501 int __vr_offs;
9502 }; */
9503
9504 static tree
9505 aarch64_build_builtin_va_list (void)
9506 {
9507 tree va_list_name;
9508 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9509
9510 /* Create the type. */
9511 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9512 /* Give it the required name. */
9513 va_list_name = build_decl (BUILTINS_LOCATION,
9514 TYPE_DECL,
9515 get_identifier ("__va_list"),
9516 va_list_type);
9517 DECL_ARTIFICIAL (va_list_name) = 1;
9518 TYPE_NAME (va_list_type) = va_list_name;
9519 TYPE_STUB_DECL (va_list_type) = va_list_name;
9520
9521 /* Create the fields. */
9522 f_stack = build_decl (BUILTINS_LOCATION,
9523 FIELD_DECL, get_identifier ("__stack"),
9524 ptr_type_node);
9525 f_grtop = build_decl (BUILTINS_LOCATION,
9526 FIELD_DECL, get_identifier ("__gr_top"),
9527 ptr_type_node);
9528 f_vrtop = build_decl (BUILTINS_LOCATION,
9529 FIELD_DECL, get_identifier ("__vr_top"),
9530 ptr_type_node);
9531 f_groff = build_decl (BUILTINS_LOCATION,
9532 FIELD_DECL, get_identifier ("__gr_offs"),
9533 integer_type_node);
9534 f_vroff = build_decl (BUILTINS_LOCATION,
9535 FIELD_DECL, get_identifier ("__vr_offs"),
9536 integer_type_node);
9537
9538 DECL_ARTIFICIAL (f_stack) = 1;
9539 DECL_ARTIFICIAL (f_grtop) = 1;
9540 DECL_ARTIFICIAL (f_vrtop) = 1;
9541 DECL_ARTIFICIAL (f_groff) = 1;
9542 DECL_ARTIFICIAL (f_vroff) = 1;
9543
9544 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9545 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9546 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9547 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9548 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9549
9550 TYPE_FIELDS (va_list_type) = f_stack;
9551 DECL_CHAIN (f_stack) = f_grtop;
9552 DECL_CHAIN (f_grtop) = f_vrtop;
9553 DECL_CHAIN (f_vrtop) = f_groff;
9554 DECL_CHAIN (f_groff) = f_vroff;
9555
9556 /* Compute its layout. */
9557 layout_type (va_list_type);
9558
9559 return va_list_type;
9560 }
9561
9562 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9563 static void
9564 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9565 {
9566 const CUMULATIVE_ARGS *cum;
9567 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9568 tree stack, grtop, vrtop, groff, vroff;
9569 tree t;
9570 int gr_save_area_size;
9571 int vr_save_area_size;
9572 int vr_offset;
9573
9574 cum = &crtl->args.info;
9575 gr_save_area_size
9576 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
9577 vr_save_area_size
9578 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
9579
9580 if (!TARGET_FLOAT)
9581 {
9582 gcc_assert (cum->aapcs_nvrn == 0);
9583 vr_save_area_size = 0;
9584 }
9585
9586 f_stack = TYPE_FIELDS (va_list_type_node);
9587 f_grtop = DECL_CHAIN (f_stack);
9588 f_vrtop = DECL_CHAIN (f_grtop);
9589 f_groff = DECL_CHAIN (f_vrtop);
9590 f_vroff = DECL_CHAIN (f_groff);
9591
9592 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9593 NULL_TREE);
9594 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9595 NULL_TREE);
9596 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9597 NULL_TREE);
9598 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9599 NULL_TREE);
9600 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9601 NULL_TREE);
9602
9603 /* Emit code to initialize STACK, which points to the next varargs stack
9604 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9605 by named arguments. STACK is 8-byte aligned. */
9606 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9607 if (cum->aapcs_stack_size > 0)
9608 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9609 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9610 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9611
9612 /* Emit code to initialize GRTOP, the top of the GR save area.
9613 virtual_incoming_args_rtx should have been 16-byte aligned. */
9614 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9615 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9616 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9617
9618 /* Emit code to initialize VRTOP, the top of the VR save area.
9619 This address is gr_save_area_bytes below GRTOP, rounded
9620 down to the next 16-byte boundary. */
9621 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9622 vr_offset = ROUND_UP (gr_save_area_size,
9623 STACK_BOUNDARY / BITS_PER_UNIT);
9624
9625 if (vr_offset)
9626 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9627 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9628 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9629
9630 /* Emit code to initialize GROFF, the offset from GRTOP of the
9631 next GPR argument. */
9632 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9633 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9634 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9635
9636 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9637 of the next VR argument. */
9638 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9639 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9640 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9641 }
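
/* In summary, the code above initializes the va_list fields as:

     __stack   = virtual incoming args + aapcs_stack_size * UNITS_PER_WORD
     __gr_top  = virtual incoming args (16-byte aligned)
     __vr_top  = __gr_top - ROUND_UP (gr_save_area_size,
                                      STACK_BOUNDARY / BITS_PER_UNIT)
     __gr_offs = -gr_save_area_size
     __vr_offs = -vr_save_area_size

   The va_arg expansion below then steps __gr_offs/__vr_offs towards zero
   as arguments are consumed from the corresponding save areas. */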
9642
9643 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9644
9645 static tree
9646 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9647 gimple_seq *post_p ATTRIBUTE_UNUSED)
9648 {
9649 tree addr;
9650 bool indirect_p;
9651 bool is_ha; /* is HFA or HVA. */
9652 bool dw_align; /* double-word align. */
9653 machine_mode ag_mode = VOIDmode;
9654 int nregs;
9655 machine_mode mode;
9656
9657 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9658 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9659 HOST_WIDE_INT size, rsize, adjust, align;
9660 tree t, u, cond1, cond2;
9661
9662 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9663 if (indirect_p)
9664 type = build_pointer_type (type);
9665
9666 mode = TYPE_MODE (type);
9667
9668 f_stack = TYPE_FIELDS (va_list_type_node);
9669 f_grtop = DECL_CHAIN (f_stack);
9670 f_vrtop = DECL_CHAIN (f_grtop);
9671 f_groff = DECL_CHAIN (f_vrtop);
9672 f_vroff = DECL_CHAIN (f_groff);
9673
9674 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9675 f_stack, NULL_TREE);
9676 size = int_size_in_bytes (type);
9677 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9678
9679 dw_align = false;
9680 adjust = 0;
9681 if (aarch64_vfp_is_call_or_return_candidate (mode,
9682 type,
9683 &ag_mode,
9684 &nregs,
9685 &is_ha))
9686 {
9687 /* TYPE passed in fp/simd registers. */
9688 if (!TARGET_FLOAT)
9689 aarch64_err_no_fpadvsimd (mode, "varargs");
9690
9691 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9692 unshare_expr (valist), f_vrtop, NULL_TREE);
9693 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9694 unshare_expr (valist), f_vroff, NULL_TREE);
9695
9696 rsize = nregs * UNITS_PER_VREG;
9697
9698 if (is_ha)
9699 {
9700 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9701 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9702 }
9703 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9704 && size < UNITS_PER_VREG)
9705 {
9706 adjust = UNITS_PER_VREG - size;
9707 }
9708 }
9709 else
9710 {
9711 /* TYPE passed in general registers. */
9712 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9713 unshare_expr (valist), f_grtop, NULL_TREE);
9714 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9715 unshare_expr (valist), f_groff, NULL_TREE);
9716 rsize = ROUND_UP (size, UNITS_PER_WORD);
9717 nregs = rsize / UNITS_PER_WORD;
9718
9719 if (align > 8)
9720 dw_align = true;
9721
9722 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9723 && size < UNITS_PER_WORD)
9724 {
9725 adjust = UNITS_PER_WORD - size;
9726 }
9727 }
9728
9729 /* Get a local temporary for the field value. */
9730 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9731
9732 /* Emit code to branch if off >= 0. */
9733 t = build2 (GE_EXPR, boolean_type_node, off,
9734 build_int_cst (TREE_TYPE (off), 0));
9735 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9736
9737 if (dw_align)
9738 {
9739 /* Emit: offs = (offs + 15) & -16. */
9740 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9741 build_int_cst (TREE_TYPE (off), 15));
9742 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9743 build_int_cst (TREE_TYPE (off), -16));
9744 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9745 }
9746 else
9747 roundup = NULL;
9748
9749 /* Update ap.__[g|v]r_offs */
9750 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9751 build_int_cst (TREE_TYPE (off), rsize));
9752 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9753
9754 /* String up. */
9755 if (roundup)
9756 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9757
9758 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9759 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9760 build_int_cst (TREE_TYPE (f_off), 0));
9761 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9762
9763 /* String up: make sure the assignment happens before the use. */
9764 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9765 COND_EXPR_ELSE (cond1) = t;
9766
9767 /* Prepare the trees handling the argument that is passed on the stack;
9768 the top-level node will be stored in ON_STACK. */
9769 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9770 if (align > 8)
9771 {
9772 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9773 t = fold_convert (intDI_type_node, arg);
9774 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9775 build_int_cst (TREE_TYPE (t), 15));
9776 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9777 build_int_cst (TREE_TYPE (t), -16));
9778 t = fold_convert (TREE_TYPE (arg), t);
9779 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9780 }
9781 else
9782 roundup = NULL;
9783 /* Advance ap.__stack */
9784 t = fold_convert (intDI_type_node, arg);
9785 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9786 build_int_cst (TREE_TYPE (t), size + 7));
9787 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9788 build_int_cst (TREE_TYPE (t), -8));
9789 t = fold_convert (TREE_TYPE (arg), t);
9790 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9791 /* String up roundup and advance. */
9792 if (roundup)
9793 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9794 /* String up with arg */
9795 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9796 /* Big-endianness related address adjustment. */
9797 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9798 && size < UNITS_PER_WORD)
9799 {
9800 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9801 size_int (UNITS_PER_WORD - size));
9802 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9803 }
9804
9805 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9806 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9807
9808 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9809 t = off;
9810 if (adjust)
9811 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9812 build_int_cst (TREE_TYPE (off), adjust));
9813
9814 t = fold_convert (sizetype, t);
9815 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9816
9817 if (is_ha)
9818 {
9819 /* type ha; // treat as "struct {ftype field[n];}"
9820 ... [computing offs]
9821 for (i = 0; i <nregs; ++i, offs += 16)
9822 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9823 return ha; */
9824 int i;
9825 tree tmp_ha, field_t, field_ptr_t;
9826
9827 /* Declare a local variable. */
9828 tmp_ha = create_tmp_var_raw (type, "ha");
9829 gimple_add_tmp_var (tmp_ha);
9830
9831 /* Establish the base type. */
9832 switch (ag_mode)
9833 {
9834 case SFmode:
9835 field_t = float_type_node;
9836 field_ptr_t = float_ptr_type_node;
9837 break;
9838 case DFmode:
9839 field_t = double_type_node;
9840 field_ptr_t = double_ptr_type_node;
9841 break;
9842 case TFmode:
9843 field_t = long_double_type_node;
9844 field_ptr_t = long_double_ptr_type_node;
9845 break;
9846 /* Half-precision and quad-precision floats are not fully supported yet.
9847 Enable the following code once that support is complete; we need to find
9848 the correct type node for __fp16 *. */
9849 #if 0
9850 case HFmode:
9851 field_t = float_type_node;
9852 field_ptr_t = float_ptr_type_node;
9853 break;
9854 #endif
9855 case V2SImode:
9856 case V4SImode:
9857 {
9858 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9859 field_t = build_vector_type_for_mode (innertype, ag_mode);
9860 field_ptr_t = build_pointer_type (field_t);
9861 }
9862 break;
9863 default:
9864 gcc_assert (0);
9865 }
9866
9867 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9868 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9869 addr = t;
9870 t = fold_convert (field_ptr_t, addr);
9871 t = build2 (MODIFY_EXPR, field_t,
9872 build1 (INDIRECT_REF, field_t, tmp_ha),
9873 build1 (INDIRECT_REF, field_t, t));
9874
9875 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9876 for (i = 1; i < nregs; ++i)
9877 {
9878 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9879 u = fold_convert (field_ptr_t, addr);
9880 u = build2 (MODIFY_EXPR, field_t,
9881 build2 (MEM_REF, field_t, tmp_ha,
9882 build_int_cst (field_ptr_t,
9883 (i *
9884 int_size_in_bytes (field_t)))),
9885 build1 (INDIRECT_REF, field_t, u));
9886 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9887 }
9888
9889 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9890 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9891 }
9892
9893 COND_EXPR_ELSE (cond2) = t;
9894 addr = fold_convert (build_pointer_type (type), cond1);
9895 addr = build_va_arg_indirect_ref (addr);
9896
9897 if (indirect_p)
9898 addr = build_va_arg_indirect_ref (addr);
9899
9900 return addr;
9901 }
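
/* A rough pseudo-C sketch of the tree built above for a small integer
   argument (illustrative only; the double-word alignment and big-endian
   adjustments are omitted):

       offs = ap->__gr_offs;
       if (offs >= 0)                      the GR save area was exhausted
         goto on_stack;
       ap->__gr_offs = offs + rsize;       claim the register slot(s)
       if (ap->__gr_offs > 0)              this argument did not fit
         goto on_stack;
       return *(type *) (ap->__gr_top + offs);

     on_stack:
       arg = ap->__stack;
       ap->__stack = (void *) (((intptr_t) arg + size + 7) & -8);
       return *(type *) arg;

   Arguments in FP/SIMD registers follow the same shape using __vr_top and
   __vr_offs.  */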
9902
9903 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9904
9905 static void
9906 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9907 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9908 int no_rtl)
9909 {
9910 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9911 CUMULATIVE_ARGS local_cum;
9912 int gr_saved, vr_saved;
9913
9914 /* The caller has advanced CUM up to, but not beyond, the last named
9915 argument. Advance a local copy of CUM past the last "real" named
9916 argument, to find out how many registers are left over. */
9917 local_cum = *cum;
9918 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
9919
9920 /* Find out how many registers we need to save. */
9921 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9922 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9923
9924 if (!TARGET_FLOAT)
9925 {
9926 gcc_assert (local_cum.aapcs_nvrn == 0);
9927 vr_saved = 0;
9928 }
9929
9930 if (!no_rtl)
9931 {
9932 if (gr_saved > 0)
9933 {
9934 rtx ptr, mem;
9935
9936 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9937 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9938 - gr_saved * UNITS_PER_WORD);
9939 mem = gen_frame_mem (BLKmode, ptr);
9940 set_mem_alias_set (mem, get_varargs_alias_set ());
9941
9942 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9943 mem, gr_saved);
9944 }
9945 if (vr_saved > 0)
9946 {
9947 /* We can't use move_block_from_reg, because it will use
9948 the wrong mode, storing D regs only. */
9949 machine_mode mode = TImode;
9950 int off, i;
9951
9952 /* Set OFF to the offset from virtual_incoming_args_rtx of
9953 the first vector register. The VR save area lies below
9954 the GR one, and is aligned to 16 bytes. */
9955 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
9956 STACK_BOUNDARY / BITS_PER_UNIT);
9957 off -= vr_saved * UNITS_PER_VREG;
9958
9959 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
9960 {
9961 rtx ptr, mem;
9962
9963 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9964 mem = gen_frame_mem (mode, ptr);
9965 set_mem_alias_set (mem, get_varargs_alias_set ());
9966 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
9967 off += UNITS_PER_VREG;
9968 }
9969 }
9970 }
9971
9972 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9973 any complication of having crtl->args.pretend_args_size changed. */
9974 cfun->machine->frame.saved_varargs_size
9975 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
9976 STACK_BOUNDARY / BITS_PER_UNIT)
9977 + vr_saved * UNITS_PER_VREG);
9978 }
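
/* Illustrative layout of the save areas established above (informal sketch):
   the GR save area occupies the gr_saved * 8 bytes immediately below
   virtual_incoming_args_rtx, and the VR save area occupies the next
   vr_saved * 16 bytes below that, with the GR/VR boundary aligned to
   16 bytes.  saved_varargs_size records the combined size so that the
   prologue can allocate the space.  */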
9979
9980 static void
9981 aarch64_conditional_register_usage (void)
9982 {
9983 int i;
9984 if (!TARGET_FLOAT)
9985 {
9986 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9987 {
9988 fixed_regs[i] = 1;
9989 call_used_regs[i] = 1;
9990 }
9991 }
9992 }
9993
9994 /* Walk down the type tree of TYPE counting consecutive base elements.
9995 If *MODEP is VOIDmode, then set it to the first valid floating point
9996 type. If a non-floating point type is found, or if a floating point
9997 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9998 otherwise return the count in the sub-tree. */
9999 static int
10000 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10001 {
10002 machine_mode mode;
10003 HOST_WIDE_INT size;
10004
10005 switch (TREE_CODE (type))
10006 {
10007 case REAL_TYPE:
10008 mode = TYPE_MODE (type);
10009 if (mode != DFmode && mode != SFmode && mode != TFmode)
10010 return -1;
10011
10012 if (*modep == VOIDmode)
10013 *modep = mode;
10014
10015 if (*modep == mode)
10016 return 1;
10017
10018 break;
10019
10020 case COMPLEX_TYPE:
10021 mode = TYPE_MODE (TREE_TYPE (type));
10022 if (mode != DFmode && mode != SFmode && mode != TFmode)
10023 return -1;
10024
10025 if (*modep == VOIDmode)
10026 *modep = mode;
10027
10028 if (*modep == mode)
10029 return 2;
10030
10031 break;
10032
10033 case VECTOR_TYPE:
10034 /* Use V2SImode and V4SImode as representatives of all 64-bit
10035 and 128-bit vector types. */
10036 size = int_size_in_bytes (type);
10037 switch (size)
10038 {
10039 case 8:
10040 mode = V2SImode;
10041 break;
10042 case 16:
10043 mode = V4SImode;
10044 break;
10045 default:
10046 return -1;
10047 }
10048
10049 if (*modep == VOIDmode)
10050 *modep = mode;
10051
10052 /* Vector modes are considered to be opaque: two vectors are
10053 equivalent for the purposes of being homogeneous aggregates
10054 if they are the same size. */
10055 if (*modep == mode)
10056 return 1;
10057
10058 break;
10059
10060 case ARRAY_TYPE:
10061 {
10062 int count;
10063 tree index = TYPE_DOMAIN (type);
10064
10065 /* Can't handle incomplete types nor sizes that are not
10066 fixed. */
10067 if (!COMPLETE_TYPE_P (type)
10068 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10069 return -1;
10070
10071 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10072 if (count == -1
10073 || !index
10074 || !TYPE_MAX_VALUE (index)
10075 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10076 || !TYPE_MIN_VALUE (index)
10077 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10078 || count < 0)
10079 return -1;
10080
10081 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10082 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10083
10084 /* There must be no padding. */
10085 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10086 return -1;
10087
10088 return count;
10089 }
10090
10091 case RECORD_TYPE:
10092 {
10093 int count = 0;
10094 int sub_count;
10095 tree field;
10096
10097 /* Can't handle incomplete types nor sizes that are not
10098 fixed. */
10099 if (!COMPLETE_TYPE_P (type)
10100 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10101 return -1;
10102
10103 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10104 {
10105 if (TREE_CODE (field) != FIELD_DECL)
10106 continue;
10107
10108 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10109 if (sub_count < 0)
10110 return -1;
10111 count += sub_count;
10112 }
10113
10114 /* There must be no padding. */
10115 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10116 return -1;
10117
10118 return count;
10119 }
10120
10121 case UNION_TYPE:
10122 case QUAL_UNION_TYPE:
10123 {
10124 /* These aren't very interesting except in a degenerate case. */
10125 int count = 0;
10126 int sub_count;
10127 tree field;
10128
10129 /* Can't handle incomplete types nor sizes that are not
10130 fixed. */
10131 if (!COMPLETE_TYPE_P (type)
10132 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10133 return -1;
10134
10135 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10136 {
10137 if (TREE_CODE (field) != FIELD_DECL)
10138 continue;
10139
10140 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10141 if (sub_count < 0)
10142 return -1;
10143 count = count > sub_count ? count : sub_count;
10144 }
10145
10146 /* There must be no padding. */
10147 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10148 return -1;
10149
10150 return count;
10151 }
10152
10153 default:
10154 break;
10155 }
10156
10157 return -1;
10158 }
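
/* A few informal examples of the classification above (illustrative only):

       struct { float x, y, z; }          ->  3, *modep == SFmode
       struct { double r; double i[3]; }  ->  4, *modep == DFmode
       _Complex double                    ->  2, *modep == DFmode
       struct { float32x4_t a, b; }       ->  2, *modep == V4SImode
       struct { float f; double d; }      -> -1 (mixed base types)
       struct { float f; int i; }         -> -1 (non-FP member)  */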
10159
10160 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10161 type as described in AAPCS64 \S 4.1.2.
10162
10163 See the comment above aarch64_composite_type_p for the notes on MODE. */
10164
10165 static bool
10166 aarch64_short_vector_p (const_tree type,
10167 machine_mode mode)
10168 {
10169 HOST_WIDE_INT size = -1;
10170
10171 if (type && TREE_CODE (type) == VECTOR_TYPE)
10172 size = int_size_in_bytes (type);
10173 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10174 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10175 size = GET_MODE_SIZE (mode);
10176
10177 return (size == 8 || size == 16);
10178 }
10179
10180 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10181 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10182 array types. The C99 floating-point complex types are also considered
10183 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10184 types, which are GCC extensions and out of the scope of AAPCS64, are
10185 treated as composite types here as well.
10186
10187 Note that MODE itself is not sufficient in determining whether a type
10188 is such a composite type or not. This is because
10189 stor-layout.c:compute_record_mode may have already changed the MODE
10190 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10191 structure with only one field may have its MODE set to the mode of the
10192 field. Also an integer mode whose size matches the size of the
10193 RECORD_TYPE type may be used to substitute the original mode
10194 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10195 solely relied on. */
10196
10197 static bool
10198 aarch64_composite_type_p (const_tree type,
10199 machine_mode mode)
10200 {
10201 if (aarch64_short_vector_p (type, mode))
10202 return false;
10203
10204 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10205 return true;
10206
10207 if (mode == BLKmode
10208 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10209 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10210 return true;
10211
10212 return false;
10213 }
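
/* For example (illustrative): int32x4_t is a short vector and therefore not
   composite; struct { double x, y; } and _Complex float are composite; a
   plain double is neither.  */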
10214
10215 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10216 shall be passed or returned in simd/fp register(s) (providing these
10217 parameter passing registers are available).
10218
10219 Upon successful return, *COUNT returns the number of needed registers,
10220 *BASE_MODE returns the mode of the individual register and when IS_HA
10221 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10222 floating-point aggregate or a homogeneous short-vector aggregate. */
10223
10224 static bool
10225 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10226 const_tree type,
10227 machine_mode *base_mode,
10228 int *count,
10229 bool *is_ha)
10230 {
10231 machine_mode new_mode = VOIDmode;
10232 bool composite_p = aarch64_composite_type_p (type, mode);
10233
10234 if (is_ha != NULL) *is_ha = false;
10235
10236 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10237 || aarch64_short_vector_p (type, mode))
10238 {
10239 *count = 1;
10240 new_mode = mode;
10241 }
10242 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10243 {
10244 if (is_ha != NULL) *is_ha = true;
10245 *count = 2;
10246 new_mode = GET_MODE_INNER (mode);
10247 }
10248 else if (type && composite_p)
10249 {
10250 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10251
10252 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10253 {
10254 if (is_ha != NULL) *is_ha = true;
10255 *count = ag_count;
10256 }
10257 else
10258 return false;
10259 }
10260 else
10261 return false;
10262
10263 *base_mode = new_mode;
10264 return true;
10265 }
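
/* Some informal examples of the outcome (illustrative only):

       double                       -> true,  *count == 1, *base_mode == DFmode
       _Complex float               -> true,  *count == 2, *base_mode == SFmode
       struct { double d[4]; }      -> true,  *count == 4, *is_ha == true
       struct { float f[5]; }       -> false (more than HA_MAX_NUM_FLDS fields)
       struct { double d; int i; }  -> false (not homogeneous)  */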
10266
10267 /* Implement TARGET_STRUCT_VALUE_RTX. */
10268
10269 static rtx
10270 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10271 int incoming ATTRIBUTE_UNUSED)
10272 {
10273 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10274 }
10275
10276 /* Implements target hook vector_mode_supported_p. */
10277 static bool
10278 aarch64_vector_mode_supported_p (machine_mode mode)
10279 {
10280 if (TARGET_SIMD
10281 && (mode == V4SImode || mode == V8HImode
10282 || mode == V16QImode || mode == V2DImode
10283 || mode == V2SImode || mode == V4HImode
10284 || mode == V8QImode || mode == V2SFmode
10285 || mode == V4SFmode || mode == V2DFmode
10286 || mode == V4HFmode || mode == V8HFmode
10287 || mode == V1DFmode))
10288 return true;
10289
10290 return false;
10291 }
10292
10293 /* Return appropriate SIMD container
10294 for MODE within a vector of WIDTH bits. */
10295 static machine_mode
10296 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10297 {
10298 gcc_assert (width == 64 || width == 128);
10299 if (TARGET_SIMD)
10300 {
10301 if (width == 128)
10302 switch (mode)
10303 {
10304 case DFmode:
10305 return V2DFmode;
10306 case SFmode:
10307 return V4SFmode;
10308 case SImode:
10309 return V4SImode;
10310 case HImode:
10311 return V8HImode;
10312 case QImode:
10313 return V16QImode;
10314 case DImode:
10315 return V2DImode;
10316 default:
10317 break;
10318 }
10319 else
10320 switch (mode)
10321 {
10322 case SFmode:
10323 return V2SFmode;
10324 case SImode:
10325 return V2SImode;
10326 case HImode:
10327 return V4HImode;
10328 case QImode:
10329 return V8QImode;
10330 default:
10331 break;
10332 }
10333 }
10334 return word_mode;
10335 }
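
/* E.g. aarch64_simd_container_mode (SFmode, 128) is V4SFmode and
   aarch64_simd_container_mode (HImode, 64) is V4HImode; modes without a
   matching container (or !TARGET_SIMD) fall back to word_mode.  */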
10336
10337 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10338 static machine_mode
10339 aarch64_preferred_simd_mode (machine_mode mode)
10340 {
10341 return aarch64_simd_container_mode (mode, 128);
10342 }
10343
10344 /* Return the bitmask of possible vector sizes for the vectorizer
10345 to iterate over. */
10346 static unsigned int
10347 aarch64_autovectorize_vector_sizes (void)
10348 {
10349 return (16 | 8);
10350 }
10351
10352 /* Implement TARGET_MANGLE_TYPE. */
10353
10354 static const char *
10355 aarch64_mangle_type (const_tree type)
10356 {
10357 /* The AArch64 ABI documents say that "__va_list" has to be
10358 mangled as if it is in the "std" namespace. */
10359 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10360 return "St9__va_list";
10361
10362 /* Half-precision float. */
10363 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10364 return "Dh";
10365
10366 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10367 builtin types. */
10368 if (TYPE_NAME (type) != NULL)
10369 return aarch64_mangle_builtin_type (type);
10370
10371 /* Use the default mangling. */
10372 return NULL;
10373 }
10374
10375
10376 /* Return true if the rtx_insn contains a MEM RTX somewhere
10377 in it. */
10378
10379 static bool
10380 has_memory_op (rtx_insn *mem_insn)
10381 {
10382 subrtx_iterator::array_type array;
10383 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10384 if (MEM_P (*iter))
10385 return true;
10386
10387 return false;
10388 }
10389
10390 /* Find the first rtx_insn before insn that will generate an assembly
10391 instruction. */
10392
10393 static rtx_insn *
10394 aarch64_prev_real_insn (rtx_insn *insn)
10395 {
10396 if (!insn)
10397 return NULL;
10398
10399 do
10400 {
10401 insn = prev_real_insn (insn);
10402 }
10403 while (insn && recog_memoized (insn) < 0);
10404
10405 return insn;
10406 }
10407
10408 static bool
10409 is_madd_op (enum attr_type t1)
10410 {
10411 unsigned int i;
10412 /* A number of these may be AArch32 only. */
10413 enum attr_type mlatypes[] = {
10414 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10415 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10416 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10417 };
10418
10419 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10420 {
10421 if (t1 == mlatypes[i])
10422 return true;
10423 }
10424
10425 return false;
10426 }
10427
10428 /* Check if there is a register dependency between a load and the insn
10429 for which we hold recog_data. */
10430
10431 static bool
10432 dep_between_memop_and_curr (rtx memop)
10433 {
10434 rtx load_reg;
10435 int opno;
10436
10437 gcc_assert (GET_CODE (memop) == SET);
10438
10439 if (!REG_P (SET_DEST (memop)))
10440 return false;
10441
10442 load_reg = SET_DEST (memop);
10443 for (opno = 1; opno < recog_data.n_operands; opno++)
10444 {
10445 rtx operand = recog_data.operand[opno];
10446 if (REG_P (operand)
10447 && reg_overlap_mentioned_p (load_reg, operand))
10448 return true;
10449
10450 }
10451 return false;
10452 }
10453
10454
10455 /* When working around the Cortex-A53 erratum 835769,
10456 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10457 instruction and has a preceding memory instruction such that a NOP
10458 should be inserted between them. */
10459
10460 bool
10461 aarch64_madd_needs_nop (rtx_insn* insn)
10462 {
10463 enum attr_type attr_type;
10464 rtx_insn *prev;
10465 rtx body;
10466
10467 if (!TARGET_FIX_ERR_A53_835769)
10468 return false;
10469
10470 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10471 return false;
10472
10473 attr_type = get_attr_type (insn);
10474 if (!is_madd_op (attr_type))
10475 return false;
10476
10477 prev = aarch64_prev_real_insn (insn);
10478 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10479 Restore recog state to INSN to avoid state corruption. */
10480 extract_constrain_insn_cached (insn);
10481
10482 if (!prev || !has_memory_op (prev))
10483 return false;
10484
10485 body = single_set (prev);
10486
10487 /* If the previous insn is a memory op and there is no dependency between
10488 it and the DImode madd, emit a NOP between them. If body is NULL then we
10489 have a complex memory operation, probably a load/store pair.
10490 Be conservative for now and emit a NOP. */
10491 if (GET_MODE (recog_data.operand[0]) == DImode
10492 && (!body || !dep_between_memop_and_curr (body)))
10493 return true;
10494
10495 return false;
10496
10497 }
10498
10499
10500 /* Implement FINAL_PRESCAN_INSN. */
10501
10502 void
10503 aarch64_final_prescan_insn (rtx_insn *insn)
10504 {
10505 if (aarch64_madd_needs_nop (insn))
10506 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10507 }
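
/* Illustrative output when the workaround triggers (a sketch; the register
   choices are arbitrary):

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x0

   The inserted nop separates the memory access from the 64-bit
   multiply-accumulate sequence described by Cortex-A53 erratum 835769.  */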
10508
10509
10510 /* Return the equivalent letter for size. */
10511 static char
10512 sizetochar (int size)
10513 {
10514 switch (size)
10515 {
10516 case 64: return 'd';
10517 case 32: return 's';
10518 case 16: return 'h';
10519 case 8 : return 'b';
10520 default: gcc_unreachable ();
10521 }
10522 }
10523
10524 /* Return true iff x is a uniform vector of floating-point
10525 constants, and the constant can be represented in
10526 quarter-precision form. Note that, as aarch64_float_const_representable_p
10527 rejects both +0.0 and -0.0, we will also reject them here. */
10528 static bool
10529 aarch64_vect_float_const_representable_p (rtx x)
10530 {
10531 rtx elt;
10532 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10533 && const_vec_duplicate_p (x, &elt)
10534 && aarch64_float_const_representable_p (elt));
10535 }
10536
10537 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise; if INFO is nonnull and OP is valid, describe the immediate in *INFO. */
10538 bool
10539 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10540 struct simd_immediate_info *info)
10541 {
10542 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10543 matches = 1; \
10544 for (i = 0; i < idx; i += (STRIDE)) \
10545 if (!(TEST)) \
10546 matches = 0; \
10547 if (matches) \
10548 { \
10549 immtype = (CLASS); \
10550 elsize = (ELSIZE); \
10551 eshift = (SHIFT); \
10552 emvn = (NEG); \
10553 break; \
10554 }
10555
10556 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10557 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10558 unsigned char bytes[16];
10559 int immtype = -1, matches;
10560 unsigned int invmask = inverse ? 0xff : 0;
10561 int eshift, emvn;
10562
10563 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10564 {
10565 if (! (aarch64_simd_imm_zero_p (op, mode)
10566 || aarch64_vect_float_const_representable_p (op)))
10567 return false;
10568
10569 if (info)
10570 {
10571 info->value = CONST_VECTOR_ELT (op, 0);
10572 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10573 info->mvn = false;
10574 info->shift = 0;
10575 }
10576
10577 return true;
10578 }
10579
10580 /* Splat vector constant out into a byte vector. */
10581 for (i = 0; i < n_elts; i++)
10582 {
10583 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10584 it must be laid out in the vector register in reverse order. */
10585 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10586 unsigned HOST_WIDE_INT elpart;
10587
10588 gcc_assert (CONST_INT_P (el));
10589 elpart = INTVAL (el);
10590
10591 for (unsigned int byte = 0; byte < innersize; byte++)
10592 {
10593 bytes[idx++] = (elpart & 0xff) ^ invmask;
10594 elpart >>= BITS_PER_UNIT;
10595 }
10596
10597 }
10598
10599 /* Sanity check. */
10600 gcc_assert (idx == GET_MODE_SIZE (mode));
10601
10602 do
10603 {
10604 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10605 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10606
10607 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10608 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10609
10610 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10611 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10612
10613 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10614 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10615
10616 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10617
10618 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10619
10620 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10621 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10622
10623 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10624 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10625
10626 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10627 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10628
10629 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10630 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10631
10632 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10633
10634 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10635
10636 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10637 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10638
10639 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10640 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10641
10642 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10643 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10644
10645 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10646 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10647
10648 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10649
10650 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10651 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10652 }
10653 while (0);
10654
10655 if (immtype == -1)
10656 return false;
10657
10658 if (info)
10659 {
10660 info->element_width = elsize;
10661 info->mvn = emvn != 0;
10662 info->shift = eshift;
10663
10664 unsigned HOST_WIDE_INT imm = 0;
10665
10666 if (immtype >= 12 && immtype <= 15)
10667 info->msl = true;
10668
10669 /* Un-invert bytes of recognized vector, if necessary. */
10670 if (invmask != 0)
10671 for (i = 0; i < idx; i++)
10672 bytes[i] ^= invmask;
10673
10674 if (immtype == 17)
10675 {
10676 /* FIXME: Broken on 32-bit H_W_I hosts. */
10677 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10678
10679 for (i = 0; i < 8; i++)
10680 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10681 << (i * BITS_PER_UNIT);
10682
10683
10684 info->value = GEN_INT (imm);
10685 }
10686 else
10687 {
10688 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10689 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10690
10691 /* Construct 'abcdefgh' because the assembler cannot handle
10692 generic constants. */
10693 if (info->mvn)
10694 imm = ~imm;
10695 imm = (imm >> info->shift) & 0xff;
10696 info->value = GEN_INT (imm);
10697 }
10698 }
10699
10700 return true;
10701 #undef CHECK
10702 }
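
/* Two informal examples (illustrative): a V4SImode vector of four copies of
   0x00ab0000 matches the shifted-byte pattern with element width 32, shift 16
   and value 0xab, so it is a valid MOVI-style immediate; four copies of
   0x12345678 match none of the patterns and are rejected.  */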
10703
10704 /* Check if immediate shift constants are within range. */
10705 bool
10706 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10707 {
10708 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10709 if (left)
10710 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10711 else
10712 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10713 }
10714
10715 /* Return true if X is a uniform vector where all elements
10716 are either the floating-point constant 0.0 or the
10717 integer constant 0. */
10718 bool
10719 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10720 {
10721 return x == CONST0_RTX (mode);
10722 }
10723
10724
10725 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10726 operation of width WIDTH at bit position POS. */
10727
10728 rtx
10729 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10730 {
10731 gcc_assert (CONST_INT_P (width));
10732 gcc_assert (CONST_INT_P (pos));
10733
10734 unsigned HOST_WIDE_INT mask
10735 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10736 return GEN_INT (mask << UINTVAL (pos));
10737 }
10738
10739 bool
10740 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10741 {
10742 HOST_WIDE_INT imm = INTVAL (x);
10743 int i;
10744
10745 for (i = 0; i < 8; i++)
10746 {
10747 unsigned int byte = imm & 0xff;
10748 if (byte != 0xff && byte != 0)
10749 return false;
10750 imm >>= 8;
10751 }
10752
10753 return true;
10754 }
10755
10756 bool
10757 aarch64_mov_operand_p (rtx x, machine_mode mode)
10758 {
10759 if (GET_CODE (x) == HIGH
10760 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10761 return true;
10762
10763 if (CONST_INT_P (x))
10764 return true;
10765
10766 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10767 return true;
10768
10769 return aarch64_classify_symbolic_expression (x)
10770 == SYMBOL_TINY_ABSOLUTE;
10771 }
10772
10773 /* Return a const_int vector of VAL. */
10774 rtx
10775 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10776 {
10777 int nunits = GET_MODE_NUNITS (mode);
10778 rtvec v = rtvec_alloc (nunits);
10779 int i;
10780
10781 for (i = 0; i < nunits; i++)
10782 RTVEC_ELT (v, i) = GEN_INT (val);
10783
10784 return gen_rtx_CONST_VECTOR (mode, v);
10785 }
10786
10787 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10788
10789 bool
10790 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10791 {
10792 machine_mode vmode;
10793
10794 gcc_assert (!VECTOR_MODE_P (mode));
10795 vmode = aarch64_preferred_simd_mode (mode);
10796 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10797 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10798 }
10799
10800 /* Construct and return a PARALLEL RTX vector with elements numbering the
10801 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10802 the vector - from the perspective of the architecture. This does not
10803 line up with GCC's perspective on lane numbers, so we end up with
10804 different masks depending on our target endian-ness. The diagram
10805 below may help. We must draw the distinction when building masks
10806 which select one half of the vector. An instruction selecting
10807 architectural low-lanes for a big-endian target must be described using
10808 a mask selecting GCC high-lanes.
10809
10810 Big-Endian Little-Endian
10811
10812 GCC 0 1 2 3 3 2 1 0
10813 | x | x | x | x | | x | x | x | x |
10814 Architecture 3 2 1 0 3 2 1 0
10815
10816 Low Mask: { 2, 3 } { 0, 1 }
10817 High Mask: { 0, 1 } { 2, 3 }
10818 */
10819
10820 rtx
10821 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10822 {
10823 int nunits = GET_MODE_NUNITS (mode);
10824 rtvec v = rtvec_alloc (nunits / 2);
10825 int high_base = nunits / 2;
10826 int low_base = 0;
10827 int base;
10828 rtx t1;
10829 int i;
10830
10831 if (BYTES_BIG_ENDIAN)
10832 base = high ? low_base : high_base;
10833 else
10834 base = high ? high_base : low_base;
10835
10836 for (i = 0; i < nunits / 2; i++)
10837 RTVEC_ELT (v, i) = GEN_INT (base + i);
10838
10839 t1 = gen_rtx_PARALLEL (mode, v);
10840 return t1;
10841 }
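
/* For instance (illustrative), for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   "High Mask" row of the diagram above.  */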
10842
10843 /* Check OP for validity as a PARALLEL RTX vector with elements
10844 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10845 from the perspective of the architecture. See the diagram above
10846 aarch64_simd_vect_par_cnst_half for more details. */
10847
10848 bool
10849 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10850 bool high)
10851 {
10852 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10853 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10854 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10855 int i = 0;
10856
10857 if (!VECTOR_MODE_P (mode))
10858 return false;
10859
10860 if (count_op != count_ideal)
10861 return false;
10862
10863 for (i = 0; i < count_ideal; i++)
10864 {
10865 rtx elt_op = XVECEXP (op, 0, i);
10866 rtx elt_ideal = XVECEXP (ideal, 0, i);
10867
10868 if (!CONST_INT_P (elt_op)
10869 || INTVAL (elt_ideal) != INTVAL (elt_op))
10870 return false;
10871 }
10872 return true;
10873 }
10874
10875 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10876 HIGH (exclusive). */
10877 void
10878 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10879 const_tree exp)
10880 {
10881 HOST_WIDE_INT lane;
10882 gcc_assert (CONST_INT_P (operand));
10883 lane = INTVAL (operand);
10884
10885 if (lane < low || lane >= high)
10886 {
10887 if (exp)
10888 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10889 else
10890 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10891 }
10892 }
10893
10894 /* Return TRUE if OP is a valid vector addressing mode. */
10895 bool
10896 aarch64_simd_mem_operand_p (rtx op)
10897 {
10898 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10899 || REG_P (XEXP (op, 0)));
10900 }
10901
10902 /* Emit a register copy from operand to operand, taking care not to
10903 early-clobber source registers in the process.
10904
10905 COUNT is the number of components into which the copy needs to be
10906 decomposed. */
10907 void
10908 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10909 unsigned int count)
10910 {
10911 unsigned int i;
10912 int rdest = REGNO (operands[0]);
10913 int rsrc = REGNO (operands[1]);
10914
10915 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10916 || rdest < rsrc)
10917 for (i = 0; i < count; i++)
10918 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10919 gen_rtx_REG (mode, rsrc + i));
10920 else
10921 for (i = 0; i < count; i++)
10922 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10923 gen_rtx_REG (mode, rsrc + count - i - 1));
10924 }
10925
10926 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10927 one of VSTRUCT modes: OI, CI or XI. */
10928 int
10929 aarch64_simd_attr_length_move (rtx_insn *insn)
10930 {
10931 machine_mode mode;
10932
10933 extract_insn_cached (insn);
10934
10935 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10936 {
10937 mode = GET_MODE (recog_data.operand[0]);
10938 switch (mode)
10939 {
10940 case OImode:
10941 return 8;
10942 case CImode:
10943 return 12;
10944 case XImode:
10945 return 16;
10946 default:
10947 gcc_unreachable ();
10948 }
10949 }
10950 return 4;
10951 }
10952
10953 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10954 one of VSTRUCT modes: OI, CI, or XI. */
10955 int
10956 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10957 {
10958 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10959 }
10960
10961 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10962 alignment of a vector to 128 bits. */
10963 static HOST_WIDE_INT
10964 aarch64_simd_vector_alignment (const_tree type)
10965 {
10966 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10967 return MIN (align, 128);
10968 }
10969
10970 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10971 static bool
10972 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10973 {
10974 if (is_packed)
10975 return false;
10976
10977 /* We guarantee alignment for vectors up to 128-bits. */
10978 if (tree_int_cst_compare (TYPE_SIZE (type),
10979 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10980 return false;
10981
10982 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10983 return true;
10984 }
10985
10986 /* If VALS is a vector constant that can be loaded into a register
10987 using DUP, generate instructions to do so and return an RTX to
10988 assign to the register. Otherwise return NULL_RTX. */
10989 static rtx
10990 aarch64_simd_dup_constant (rtx vals)
10991 {
10992 machine_mode mode = GET_MODE (vals);
10993 machine_mode inner_mode = GET_MODE_INNER (mode);
10994 rtx x;
10995
10996 if (!const_vec_duplicate_p (vals, &x))
10997 return NULL_RTX;
10998
10999 /* We can load this constant by using DUP and a constant in a
11000 single ARM register. This will be cheaper than a vector
11001 load. */
11002 x = copy_to_mode_reg (inner_mode, x);
11003 return gen_rtx_VEC_DUPLICATE (mode, x);
11004 }
11005
11006
11007 /* Generate code to load VALS, which is a PARALLEL containing only
11008 constants (for vec_init) or CONST_VECTOR, efficiently into a
11009 register. Returns an RTX to copy into the register, or NULL_RTX
11010 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11011 static rtx
11012 aarch64_simd_make_constant (rtx vals)
11013 {
11014 machine_mode mode = GET_MODE (vals);
11015 rtx const_dup;
11016 rtx const_vec = NULL_RTX;
11017 int n_elts = GET_MODE_NUNITS (mode);
11018 int n_const = 0;
11019 int i;
11020
11021 if (GET_CODE (vals) == CONST_VECTOR)
11022 const_vec = vals;
11023 else if (GET_CODE (vals) == PARALLEL)
11024 {
11025 /* A CONST_VECTOR must contain only CONST_INTs and
11026 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11027 Only store valid constants in a CONST_VECTOR. */
11028 for (i = 0; i < n_elts; ++i)
11029 {
11030 rtx x = XVECEXP (vals, 0, i);
11031 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11032 n_const++;
11033 }
11034 if (n_const == n_elts)
11035 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11036 }
11037 else
11038 gcc_unreachable ();
11039
11040 if (const_vec != NULL_RTX
11041 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11042 /* Load using MOVI/MVNI. */
11043 return const_vec;
11044 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11045 /* Loaded using DUP. */
11046 return const_dup;
11047 else if (const_vec != NULL_RTX)
11048 /* Load from constant pool. We can not take advantage of single-cycle
11049 LD1 because we need a PC-relative addressing mode. */
11050 return const_vec;
11051 else
11052 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11053 We can not construct an initializer. */
11054 return NULL_RTX;
11055 }
11056
11057 /* Expand a vector initialisation sequence, such that TARGET is
11058 initialised to contain VALS. */
11059
11060 void
11061 aarch64_expand_vector_init (rtx target, rtx vals)
11062 {
11063 machine_mode mode = GET_MODE (target);
11064 machine_mode inner_mode = GET_MODE_INNER (mode);
11065 /* The number of vector elements. */
11066 int n_elts = GET_MODE_NUNITS (mode);
11067 /* The number of vector elements which are not constant. */
11068 int n_var = 0;
11069 rtx any_const = NULL_RTX;
11070 /* The first element of vals. */
11071 rtx v0 = XVECEXP (vals, 0, 0);
11072 bool all_same = true;
11073
11074 /* Count the number of variable elements to initialise. */
11075 for (int i = 0; i < n_elts; ++i)
11076 {
11077 rtx x = XVECEXP (vals, 0, i);
11078 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11079 ++n_var;
11080 else
11081 any_const = x;
11082
11083 all_same &= rtx_equal_p (x, v0);
11084 }
11085
11086 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11087 how best to handle this. */
11088 if (n_var == 0)
11089 {
11090 rtx constant = aarch64_simd_make_constant (vals);
11091 if (constant != NULL_RTX)
11092 {
11093 emit_move_insn (target, constant);
11094 return;
11095 }
11096 }
11097
11098 /* Splat a single non-constant element if we can. */
11099 if (all_same)
11100 {
11101 rtx x = copy_to_mode_reg (inner_mode, v0);
11102 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11103 return;
11104 }
11105
11106 /* Initialise a vector which is part-variable. We want to first try
11107 to build those lanes which are constant in the most efficient way we
11108 can. */
11109 if (n_var != n_elts)
11110 {
11111 rtx copy = copy_rtx (vals);
11112
11113 /* Load constant part of vector. We really don't care what goes into the
11114 parts we will overwrite, but we're more likely to be able to load the
11115 constant efficiently if it has fewer, larger, repeating parts
11116 (see aarch64_simd_valid_immediate). */
11117 for (int i = 0; i < n_elts; i++)
11118 {
11119 rtx x = XVECEXP (vals, 0, i);
11120 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11121 continue;
11122 rtx subst = any_const;
11123 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11124 {
11125 /* Look in the copied vector, as more elements are const. */
11126 rtx test = XVECEXP (copy, 0, i ^ bit);
11127 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11128 {
11129 subst = test;
11130 break;
11131 }
11132 }
11133 XVECEXP (copy, 0, i) = subst;
11134 }
11135 aarch64_expand_vector_init (target, copy);
11136 }
11137
11138 /* Insert the variable lanes directly. */
11139
11140 enum insn_code icode = optab_handler (vec_set_optab, mode);
11141 gcc_assert (icode != CODE_FOR_nothing);
11142
11143 for (int i = 0; i < n_elts; i++)
11144 {
11145 rtx x = XVECEXP (vals, 0, i);
11146 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11147 continue;
11148 x = copy_to_mode_reg (inner_mode, x);
11149 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11150 }
11151 }
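
/* An informal example of the partly-variable case (illustrative): for a
   V4SImode initializer { x, 1, 2, 3 } with X in a register, the code above
   first loads the constant vector { 2, 1, 2, 3 } (lane 0 borrows a constant
   from another lane to keep the constant part regular) and then overwrites
   lane 0 with X through the vec_set pattern.  */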
11152
11153 static unsigned HOST_WIDE_INT
11154 aarch64_shift_truncation_mask (machine_mode mode)
11155 {
11156 return
11157 (!SHIFT_COUNT_TRUNCATED
11158 || aarch64_vector_mode_supported_p (mode)
11159 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11160 }
11161
11162 /* Select a format to encode pointers in exception handling data. */
11163 int
11164 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11165 {
11166 int type;
11167 switch (aarch64_cmodel)
11168 {
11169 case AARCH64_CMODEL_TINY:
11170 case AARCH64_CMODEL_TINY_PIC:
11171 case AARCH64_CMODEL_SMALL:
11172 case AARCH64_CMODEL_SMALL_PIC:
11173 case AARCH64_CMODEL_SMALL_SPIC:
11174 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11175 for everything. */
11176 type = DW_EH_PE_sdata4;
11177 break;
11178 default:
11179 /* No assumptions here. 8-byte relocs required. */
11180 type = DW_EH_PE_sdata8;
11181 break;
11182 }
11183 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11184 }
11185
11186 /* The last .arch and .tune assembly strings that we printed. */
11187 static std::string aarch64_last_printed_arch_string;
11188 static std::string aarch64_last_printed_tune_string;
11189
11190 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11191 by the function fndecl. */
11192
11193 void
11194 aarch64_declare_function_name (FILE *stream, const char* name,
11195 tree fndecl)
11196 {
11197 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11198
11199 struct cl_target_option *targ_options;
11200 if (target_parts)
11201 targ_options = TREE_TARGET_OPTION (target_parts);
11202 else
11203 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11204 gcc_assert (targ_options);
11205
11206 const struct processor *this_arch
11207 = aarch64_get_arch (targ_options->x_explicit_arch);
11208
11209 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11210 std::string extension
11211 = aarch64_get_extension_string_for_isa_flags (isa_flags);
11212 /* Only update the assembler .arch string if it is distinct from the last
11213 such string we printed. */
11214 std::string to_print = this_arch->name + extension;
11215 if (to_print != aarch64_last_printed_arch_string)
11216 {
11217 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11218 aarch64_last_printed_arch_string = to_print;
11219 }
11220
11221 /* Print the cpu name we're tuning for in the comments; it might be
11222 useful to readers of the generated asm. Do it only when it changes
11223 from function to function and verbose assembly is requested. */
11224 const struct processor *this_tune
11225 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11226
11227 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11228 {
11229 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11230 this_tune->name);
11231 aarch64_last_printed_tune_string = this_tune->name;
11232 }
11233
11234 /* Don't forget the type directive for ELF. */
11235 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11236 ASM_OUTPUT_LABEL (stream, name);
11237 }
11238
11239 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11240
11241 static void
11242 aarch64_start_file (void)
11243 {
11244 struct cl_target_option *default_options
11245 = TREE_TARGET_OPTION (target_option_default_node);
11246
11247 const struct processor *default_arch
11248 = aarch64_get_arch (default_options->x_explicit_arch);
11249 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11250 std::string extension
11251 = aarch64_get_extension_string_for_isa_flags (default_isa_flags);
11252
11253 aarch64_last_printed_arch_string = default_arch->name + extension;
11254 aarch64_last_printed_tune_string = "";
11255 asm_fprintf (asm_out_file, "\t.arch %s\n",
11256 aarch64_last_printed_arch_string.c_str ());
11257
11258 default_file_start ();
11259 }
11260
11261 /* Emit load exclusive. */
11262
11263 static void
11264 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11265 rtx mem, rtx model_rtx)
11266 {
11267 rtx (*gen) (rtx, rtx, rtx);
11268
11269 switch (mode)
11270 {
11271 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11272 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11273 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11274 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11275 default:
11276 gcc_unreachable ();
11277 }
11278
11279 emit_insn (gen (rval, mem, model_rtx));
11280 }
11281
11282 /* Emit store exclusive. */
11283
11284 static void
11285 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11286 rtx rval, rtx mem, rtx model_rtx)
11287 {
11288 rtx (*gen) (rtx, rtx, rtx, rtx);
11289
11290 switch (mode)
11291 {
11292 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11293 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11294 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11295 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11296 default:
11297 gcc_unreachable ();
11298 }
11299
11300 emit_insn (gen (bval, rval, mem, model_rtx));
11301 }
11302
11303 /* Mark the previous jump instruction as unlikely. */
11304
11305 static void
11306 aarch64_emit_unlikely_jump (rtx insn)
11307 {
11308 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11309
11310 insn = emit_jump_insn (insn);
11311 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11312 }
11313
11314 /* Expand a compare and swap pattern. */
11315
11316 void
11317 aarch64_expand_compare_and_swap (rtx operands[])
11318 {
11319 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11320 machine_mode mode, cmp_mode;
11321 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11322 int idx;
11323 gen_cas_fn gen;
11324 const gen_cas_fn split_cas[] =
11325 {
11326 gen_aarch64_compare_and_swapqi,
11327 gen_aarch64_compare_and_swaphi,
11328 gen_aarch64_compare_and_swapsi,
11329 gen_aarch64_compare_and_swapdi
11330 };
11331 const gen_cas_fn atomic_cas[] =
11332 {
11333 gen_aarch64_compare_and_swapqi_lse,
11334 gen_aarch64_compare_and_swaphi_lse,
11335 gen_aarch64_compare_and_swapsi_lse,
11336 gen_aarch64_compare_and_swapdi_lse
11337 };
11338
11339 bval = operands[0];
11340 rval = operands[1];
11341 mem = operands[2];
11342 oldval = operands[3];
11343 newval = operands[4];
11344 is_weak = operands[5];
11345 mod_s = operands[6];
11346 mod_f = operands[7];
11347 mode = GET_MODE (mem);
11348 cmp_mode = mode;
11349
11350 /* Normally the succ memory model must be stronger than fail, but in the
11351 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11352 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11353
11354 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11355 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11356 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11357
11358 switch (mode)
11359 {
11360 case QImode:
11361 case HImode:
11362 /* For short modes, we're going to perform the comparison in SImode,
11363 so do the zero-extension now. */
11364 cmp_mode = SImode;
11365 rval = gen_reg_rtx (SImode);
11366 oldval = convert_modes (SImode, mode, oldval, true);
11367 /* Fall through. */
11368
11369 case SImode:
11370 case DImode:
11371 /* Force the value into a register if needed. */
11372 if (!aarch64_plus_operand (oldval, mode))
11373 oldval = force_reg (cmp_mode, oldval);
11374 break;
11375
11376 default:
11377 gcc_unreachable ();
11378 }
11379
11380 switch (mode)
11381 {
11382 case QImode: idx = 0; break;
11383 case HImode: idx = 1; break;
11384 case SImode: idx = 2; break;
11385 case DImode: idx = 3; break;
11386 default:
11387 gcc_unreachable ();
11388 }
11389 if (TARGET_LSE)
11390 gen = atomic_cas[idx];
11391 else
11392 gen = split_cas[idx];
11393
11394 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11395
11396 if (mode == QImode || mode == HImode)
11397 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11398
11399 x = gen_rtx_REG (CCmode, CC_REGNUM);
11400 x = gen_rtx_EQ (SImode, x, const0_rtx);
11401 emit_insn (gen_rtx_SET (bval, x));
11402 }
11403
11404 /* Test whether the target supports using an atomic load-operate instruction.
11405 CODE is the operation and AFTER is TRUE if the data in memory after the
11406 operation should be returned and FALSE if the data before the operation
11407 should be returned. Returns FALSE if the operation isn't supported by the
11408 architecture. */
11409
11410 bool
11411 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11412 {
11413 if (!TARGET_LSE)
11414 return false;
11415
11416 switch (code)
11417 {
11418 case SET:
11419 case AND:
11420 case IOR:
11421 case XOR:
11422 case MINUS:
11423 case PLUS:
11424 return true;
11425 default:
11426 return false;
11427 }
11428 }
11429
11430 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11431 sequence implementing an atomic operation. */
11432
11433 static void
11434 aarch64_emit_post_barrier (enum memmodel model)
11435 {
11436 const enum memmodel base_model = memmodel_base (model);
11437
11438 if (is_mm_sync (model)
11439 && (base_model == MEMMODEL_ACQUIRE
11440 || base_model == MEMMODEL_ACQ_REL
11441 || base_model == MEMMODEL_SEQ_CST))
11442 {
11443 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11444 }
11445 }
11446
11447 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11448 for the data in memory. EXPECTED is the value expected to be in memory.
11449 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11450 is the memory ordering to use. */
11451
11452 void
11453 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11454 rtx expected, rtx desired,
11455 rtx model)
11456 {
11457 rtx (*gen) (rtx, rtx, rtx, rtx);
11458 machine_mode mode;
11459
11460 mode = GET_MODE (mem);
11461
11462 switch (mode)
11463 {
11464 case QImode: gen = gen_aarch64_atomic_casqi; break;
11465 case HImode: gen = gen_aarch64_atomic_cashi; break;
11466 case SImode: gen = gen_aarch64_atomic_cassi; break;
11467 case DImode: gen = gen_aarch64_atomic_casdi; break;
11468 default:
11469 gcc_unreachable ();
11470 }
11471
11472 /* Move the expected value into the CAS destination register. */
11473 emit_insn (gen_rtx_SET (rval, expected));
11474
11475 /* Emit the CAS. */
11476 emit_insn (gen (rval, mem, desired, model));
11477
11478 /* Compare the expected value with the value loaded by the CAS, to establish
11479 whether the swap was made. */
11480 aarch64_gen_compare_reg (EQ, rval, expected);
11481 }
11482
11483 /* Split a compare and swap pattern. */
11484
11485 void
11486 aarch64_split_compare_and_swap (rtx operands[])
11487 {
11488 rtx rval, mem, oldval, newval, scratch;
11489 machine_mode mode;
11490 bool is_weak;
11491 rtx_code_label *label1, *label2;
11492 rtx x, cond;
11493 enum memmodel model;
11494 rtx model_rtx;
11495
11496 rval = operands[0];
11497 mem = operands[1];
11498 oldval = operands[2];
11499 newval = operands[3];
11500 is_weak = (operands[4] != const0_rtx);
11501 model_rtx = operands[5];
11502 scratch = operands[7];
11503 mode = GET_MODE (mem);
11504 model = memmodel_from_int (INTVAL (model_rtx));
11505
11506 label1 = NULL;
11507 if (!is_weak)
11508 {
11509 label1 = gen_label_rtx ();
11510 emit_label (label1);
11511 }
11512 label2 = gen_label_rtx ();
11513
11514 /* The initial load can be relaxed for a __sync operation since a final
11515 barrier will be emitted to stop code hoisting. */
11516 if (is_mm_sync (model))
11517 aarch64_emit_load_exclusive (mode, rval, mem,
11518 GEN_INT (MEMMODEL_RELAXED));
11519 else
11520 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11521
11522 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11523 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11524 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11525 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11526 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11527
11528 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11529
11530 if (!is_weak)
11531 {
11532 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11533 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11534 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11535 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11536 }
11537 else
11538 {
11539 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11540 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11541 emit_insn (gen_rtx_SET (cond, x));
11542 }
11543
11544 emit_label (label2);
11545
11546 /* Emit any final barrier needed for a __sync operation. */
11547 if (is_mm_sync (model))
11548 aarch64_emit_post_barrier (model);
11549 }
11550
11551 /* Emit a BIC instruction. */
11552
11553 static void
11554 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11555 {
11556 rtx shift_rtx = GEN_INT (shift);
11557 rtx (*gen) (rtx, rtx, rtx, rtx);
11558
11559 switch (mode)
11560 {
11561 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11562 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11563 default:
11564 gcc_unreachable ();
11565 }
11566
11567 emit_insn (gen (dst, s2, shift_rtx, s1));
11568 }
11569
11570 /* Emit an atomic swap. */
11571
11572 static void
11573 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11574 rtx mem, rtx model)
11575 {
11576 rtx (*gen) (rtx, rtx, rtx, rtx);
11577
11578 switch (mode)
11579 {
11580 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11581 case HImode: gen = gen_aarch64_atomic_swphi; break;
11582 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11583 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11584 default:
11585 gcc_unreachable ();
11586 }
11587
11588 emit_insn (gen (dst, mem, value, model));
11589 }
11590
11591 /* Operations supported by aarch64_emit_atomic_load_op. */
11592
11593 enum aarch64_atomic_load_op_code
11594 {
11595 AARCH64_LDOP_PLUS, /* A + B */
11596 AARCH64_LDOP_XOR, /* A ^ B */
11597 AARCH64_LDOP_OR, /* A | B */
11598 AARCH64_LDOP_BIC /* A & ~B */
11599 };
11600
11601 /* Emit an atomic load-operate. */
11602
11603 static void
11604 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11605 machine_mode mode, rtx dst, rtx src,
11606 rtx mem, rtx model)
11607 {
11608 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11609 const aarch64_atomic_load_op_fn plus[] =
11610 {
11611 gen_aarch64_atomic_loadaddqi,
11612 gen_aarch64_atomic_loadaddhi,
11613 gen_aarch64_atomic_loadaddsi,
11614 gen_aarch64_atomic_loadadddi
11615 };
11616 const aarch64_atomic_load_op_fn eor[] =
11617 {
11618 gen_aarch64_atomic_loadeorqi,
11619 gen_aarch64_atomic_loadeorhi,
11620 gen_aarch64_atomic_loadeorsi,
11621 gen_aarch64_atomic_loadeordi
11622 };
11623 const aarch64_atomic_load_op_fn ior[] =
11624 {
11625 gen_aarch64_atomic_loadsetqi,
11626 gen_aarch64_atomic_loadsethi,
11627 gen_aarch64_atomic_loadsetsi,
11628 gen_aarch64_atomic_loadsetdi
11629 };
11630 const aarch64_atomic_load_op_fn bic[] =
11631 {
11632 gen_aarch64_atomic_loadclrqi,
11633 gen_aarch64_atomic_loadclrhi,
11634 gen_aarch64_atomic_loadclrsi,
11635 gen_aarch64_atomic_loadclrdi
11636 };
11637 aarch64_atomic_load_op_fn gen;
11638 int idx = 0;
11639
11640 switch (mode)
11641 {
11642 case QImode: idx = 0; break;
11643 case HImode: idx = 1; break;
11644 case SImode: idx = 2; break;
11645 case DImode: idx = 3; break;
11646 default:
11647 gcc_unreachable ();
11648 }
11649
11650 switch (code)
11651 {
11652 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11653 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11654 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11655 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11656 default:
11657 gcc_unreachable ();
11658 }
11659
11660 emit_insn (gen (dst, mem, src, model));
11661 }
11662
11663 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11664 location to store the data read from memory. OUT_RESULT is the location to
11665 store the result of the operation. MEM is the memory location to read and
11666 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11667 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11668 be NULL. */
11669
11670 void
11671 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11672 rtx mem, rtx value, rtx model_rtx)
11673 {
11674 machine_mode mode = GET_MODE (mem);
11675 machine_mode wmode = (mode == DImode ? DImode : SImode);
11676 const bool short_mode = (mode < SImode);
11677 aarch64_atomic_load_op_code ldop_code;
11678 rtx src;
11679 rtx x;
11680
11681 if (out_data)
11682 out_data = gen_lowpart (mode, out_data);
11683
11684 if (out_result)
11685 out_result = gen_lowpart (mode, out_result);
11686
11687 /* Make sure the value is in a register, putting it into a destination
11688 register if it needs to be manipulated. */
11689 if (!register_operand (value, mode)
11690 || code == AND || code == MINUS)
11691 {
11692 src = out_result ? out_result : out_data;
11693 emit_move_insn (src, gen_lowpart (mode, value));
11694 }
11695 else
11696 src = value;
11697 gcc_assert (register_operand (src, mode));
11698
11699 /* Preprocess the data for the operation as necessary. If the operation is
11700 a SET then emit a swap instruction and finish. */
11701 switch (code)
11702 {
11703 case SET:
11704 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11705 return;
11706
11707 case MINUS:
11708 /* Negate the value and treat it as a PLUS. */
11709 {
11710 rtx neg_src;
11711
11712 /* Resize the value if necessary. */
11713 if (short_mode)
11714 src = gen_lowpart (wmode, src);
11715
11716 neg_src = gen_rtx_NEG (wmode, src);
11717 emit_insn (gen_rtx_SET (src, neg_src));
11718
11719 if (short_mode)
11720 src = gen_lowpart (mode, src);
11721 }
11722 /* Fall-through. */
11723 case PLUS:
11724 ldop_code = AARCH64_LDOP_PLUS;
11725 break;
11726
11727 case IOR:
11728 ldop_code = AARCH64_LDOP_OR;
11729 break;
11730
11731 case XOR:
11732 ldop_code = AARCH64_LDOP_XOR;
11733 break;
11734
11735 case AND:
11736 {
11737 rtx not_src;
11738
11739 /* Resize the value if necessary. */
11740 if (short_mode)
11741 src = gen_lowpart (wmode, src);
11742
11743 not_src = gen_rtx_NOT (wmode, src);
11744 emit_insn (gen_rtx_SET (src, not_src));
11745
11746 if (short_mode)
11747 src = gen_lowpart (mode, src);
11748 }
11749 ldop_code = AARCH64_LDOP_BIC;
11750 break;
11751
11752 default:
11753 /* The operation can't be done with atomic instructions. */
11754 gcc_unreachable ();
11755 }
11756
11757 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11758
11759 /* If necessary, calculate the data in memory after the update by redoing the
11760 operation from values in registers. */
11761 if (!out_result)
11762 return;
11763
11764 if (short_mode)
11765 {
11766 src = gen_lowpart (wmode, src);
11767 out_data = gen_lowpart (wmode, out_data);
11768 out_result = gen_lowpart (wmode, out_result);
11769 }
11770
11771 x = NULL_RTX;
11772
11773 switch (code)
11774 {
11775 case MINUS:
11776 case PLUS:
11777 x = gen_rtx_PLUS (wmode, out_data, src);
11778 break;
11779 case IOR:
11780 x = gen_rtx_IOR (wmode, out_data, src);
11781 break;
11782 case XOR:
11783 x = gen_rtx_XOR (wmode, out_data, src);
11784 break;
11785 case AND:
11786 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11787 return;
11788 default:
11789 gcc_unreachable ();
11790 }
11791
11792 emit_set_insn (out_result, x);
11793
11794 return;
11795 }
11796
11797 /* Split an atomic operation. */
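/* Illustrative sketch (not part of the original source): splitting an SImode
   atomic fetch-and-add without LSE produces, roughly,

   .Lretry:
       ldxr   w_old, [x_mem]           // load variant depends on the model
       add    w_new, w_old, w_val
       stxr   w_tmp, w_new, [x_mem]
       cbnz   w_tmp, .Lretry

   with OLD_OUT and NEW_OUT receiving the values before and after the
   update respectively.  */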
11798
11799 void
11800 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11801 rtx value, rtx model_rtx, rtx cond)
11802 {
11803 machine_mode mode = GET_MODE (mem);
11804 machine_mode wmode = (mode == DImode ? DImode : SImode);
11805 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11806 const bool is_sync = is_mm_sync (model);
11807 rtx_code_label *label;
11808 rtx x;
11809
11810 /* Split the atomic operation into a sequence. */
11811 label = gen_label_rtx ();
11812 emit_label (label);
11813
11814 if (new_out)
11815 new_out = gen_lowpart (wmode, new_out);
11816 if (old_out)
11817 old_out = gen_lowpart (wmode, old_out);
11818 else
11819 old_out = new_out;
11820 value = simplify_gen_subreg (wmode, value, mode, 0);
11821
11822 /* The initial load can be relaxed for a __sync operation since a final
11823 barrier will be emitted to stop code hoisting. */
11824 if (is_sync)
11825 aarch64_emit_load_exclusive (mode, old_out, mem,
11826 GEN_INT (MEMMODEL_RELAXED));
11827 else
11828 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11829
11830 switch (code)
11831 {
11832 case SET:
11833 new_out = value;
11834 break;
11835
11836 case NOT:
11837 x = gen_rtx_AND (wmode, old_out, value);
11838 emit_insn (gen_rtx_SET (new_out, x));
11839 x = gen_rtx_NOT (wmode, new_out);
11840 emit_insn (gen_rtx_SET (new_out, x));
11841 break;
11842
11843 case MINUS:
11844 if (CONST_INT_P (value))
11845 {
11846 value = GEN_INT (-INTVAL (value));
11847 code = PLUS;
11848 }
11849 /* Fall through. */
11850
11851 default:
11852 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11853 emit_insn (gen_rtx_SET (new_out, x));
11854 break;
11855 }
11856
11857 aarch64_emit_store_exclusive (mode, cond, mem,
11858 gen_lowpart (mode, new_out), model_rtx);
11859
11860 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11861 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11862 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11863 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11864
11865 /* Emit any final barrier needed for a __sync operation. */
11866 if (is_sync)
11867 aarch64_emit_post_barrier (model);
11868 }
11869
11870 static void
11871 aarch64_init_libfuncs (void)
11872 {
11873 /* Half-precision float operations. The compiler handles all operations
11874 with NULL libfuncs by converting to SFmode. */
11875
11876 /* Conversions. */
11877 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11878 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11879
11880 /* Arithmetic. */
11881 set_optab_libfunc (add_optab, HFmode, NULL);
11882 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11883 set_optab_libfunc (smul_optab, HFmode, NULL);
11884 set_optab_libfunc (neg_optab, HFmode, NULL);
11885 set_optab_libfunc (sub_optab, HFmode, NULL);
11886
11887 /* Comparisons. */
11888 set_optab_libfunc (eq_optab, HFmode, NULL);
11889 set_optab_libfunc (ne_optab, HFmode, NULL);
11890 set_optab_libfunc (lt_optab, HFmode, NULL);
11891 set_optab_libfunc (le_optab, HFmode, NULL);
11892 set_optab_libfunc (ge_optab, HFmode, NULL);
11893 set_optab_libfunc (gt_optab, HFmode, NULL);
11894 set_optab_libfunc (unord_optab, HFmode, NULL);
11895 }
11896
11897 /* Target hook for c_mode_for_suffix. */
11898 static machine_mode
11899 aarch64_c_mode_for_suffix (char suffix)
11900 {
11901 if (suffix == 'q')
11902 return TFmode;
11903
11904 return VOIDmode;
11905 }
11906
11907 /* We can only represent floating point constants which will fit in
11908 "quarter-precision" values. These values are characterised by
11909 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
11910 by:
11911
11912 (-1)^s * (n/16) * 2^r
11913
11914 Where:
11915 's' is the sign bit.
11916 'n' is an integer in the range 16 <= n <= 31.
11917 'r' is an integer in the range -3 <= r <= 4. */
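/* For example (illustrative, not part of the original source):
   1.0 = (16/16) * 2^0 and 0.125 = (16/16) * 2^-3 are representable, and the
   largest representable value is (31/16) * 2^4 = 31.0.  0.0 has no encoding
   of this form, which is why it is rejected below.  */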
11918
11919 /* Return true iff X can be represented as a quarter-precision
11920 floating point immediate operand. Note, we cannot represent 0.0. */
11921 bool
11922 aarch64_float_const_representable_p (rtx x)
11923 {
11924 /* This represents our current view of how many bits
11925 make up the mantissa. */
11926 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11927 int exponent;
11928 unsigned HOST_WIDE_INT mantissa, mask;
11929 REAL_VALUE_TYPE r, m;
11930 bool fail;
11931
11932 if (!CONST_DOUBLE_P (x))
11933 return false;
11934
11935 /* We don't support HFmode constants yet. */
11936 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11937 return false;
11938
11939 r = *CONST_DOUBLE_REAL_VALUE (x);
11940
11941 /* We cannot represent infinities, NaNs or +/-zero. We won't
11942 know if we have +zero until we analyse the mantissa, but we
11943 can reject the other invalid values. */
11944 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11945 || REAL_VALUE_MINUS_ZERO (r))
11946 return false;
11947
11948 /* Extract exponent. */
11949 r = real_value_abs (&r);
11950 exponent = REAL_EXP (&r);
11951
11952 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11953 highest (sign) bit, with a fixed binary point at bit point_pos.
11954 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
11955 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11956 bits for the mantissa, this can fail (low bits will be lost). */
11957 real_ldexp (&m, &r, point_pos - exponent);
11958 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11959
11960 /* If the low part of the mantissa has bits set we cannot represent
11961 the value. */
11962 if (w.elt (0) != 0)
11963 return false;
11964 /* We have rejected the lower HOST_WIDE_INT, so update our
11965 understanding of how many bits lie in the mantissa and
11966 look only at the high HOST_WIDE_INT. */
11967 mantissa = w.elt (1);
11968 point_pos -= HOST_BITS_PER_WIDE_INT;
11969
11970 /* We can only represent values with a mantissa of the form 1.xxxx. */
11971 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11972 if ((mantissa & mask) != 0)
11973 return false;
11974
11975 /* Having filtered unrepresentable values, we may now remove all
11976 but the highest 5 bits. */
11977 mantissa >>= point_pos - 5;
11978
11979 /* We cannot represent the value 0.0, so reject it. This is handled
11980 elsewhere. */
11981 if (mantissa == 0)
11982 return false;
11983
11984 /* Then, as bit 4 is always set, we can mask it off, leaving
11985 the mantissa in the range [0, 15]. */
11986 mantissa &= ~(1 << 4);
11987 gcc_assert (mantissa <= 15);
11988
11989 /* GCC internally does not use IEEE754-like encoding (where normalized
11990 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
11991 Our mantissa values are shifted 4 places to the left relative to
11992 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11993 by 5 places to correct for GCC's representation. */
11994 exponent = 5 - exponent;
11995
11996 return (exponent >= 0 && exponent <= 7);
11997 }
11998
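/* Illustrative example (not part of the original source): for a V4SImode
   constant with the value 0x0000ab00 in every lane,
   aarch64_output_simd_mov_immediate returns the template
   "movi\t%0.4s, 0xab, lsl 8".  */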
11999 char*
12000 aarch64_output_simd_mov_immediate (rtx const_vector,
12001 machine_mode mode,
12002 unsigned width)
12003 {
12004 bool is_valid;
12005 static char templ[40];
12006 const char *mnemonic;
12007 const char *shift_op;
12008 unsigned int lane_count = 0;
12009 char element_char;
12010
12011 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12012
12013 /* This will return true to show that const_vector is legal for use as
12014 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
12015 also update INFO to show how the immediate should be generated. */
12016 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12017 gcc_assert (is_valid);
12018
12019 element_char = sizetochar (info.element_width);
12020 lane_count = width / info.element_width;
12021
12022 mode = GET_MODE_INNER (mode);
12023 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12024 {
12025 gcc_assert (info.shift == 0 && ! info.mvn);
12026 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12027 move immediate path. */
12028 if (aarch64_float_const_zero_rtx_p (info.value))
12029 info.value = GEN_INT (0);
12030 else
12031 {
12032 #define buf_size 20
12033 char float_buf[buf_size] = {'\0'};
12034 real_to_decimal_for_mode (float_buf,
12035 CONST_DOUBLE_REAL_VALUE (info.value),
12036 buf_size, buf_size, 1, mode);
12037 #undef buf_size
12038
12039 if (lane_count == 1)
12040 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12041 else
12042 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12043 lane_count, element_char, float_buf);
12044 return templ;
12045 }
12046 }
12047
12048 mnemonic = info.mvn ? "mvni" : "movi";
12049 shift_op = info.msl ? "msl" : "lsl";
12050
12051 gcc_assert (CONST_INT_P (info.value));
12052 if (lane_count == 1)
12053 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12054 mnemonic, UINTVAL (info.value));
12055 else if (info.shift)
12056 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12057 ", %s %d", mnemonic, lane_count, element_char,
12058 UINTVAL (info.value), shift_op, info.shift);
12059 else
12060 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12061 mnemonic, lane_count, element_char, UINTVAL (info.value));
12062 return templ;
12063 }
12064
12065 char*
12066 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12067 machine_mode mode)
12068 {
12069 machine_mode vmode;
12070
12071 gcc_assert (!VECTOR_MODE_P (mode));
12072 vmode = aarch64_simd_container_mode (mode, 64);
12073 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12074 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12075 }
12076
12077 /* Split operands into moves from op[1] + op[2] into op[0]. */
12078
12079 void
12080 aarch64_split_combinev16qi (rtx operands[3])
12081 {
12082 unsigned int dest = REGNO (operands[0]);
12083 unsigned int src1 = REGNO (operands[1]);
12084 unsigned int src2 = REGNO (operands[2]);
12085 machine_mode halfmode = GET_MODE (operands[1]);
12086 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12087 rtx destlo, desthi;
12088
12089 gcc_assert (halfmode == V16QImode);
12090
12091 if (src1 == dest && src2 == dest + halfregs)
12092 {
12093 /* No-op move. Can't split to nothing; emit something. */
12094 emit_note (NOTE_INSN_DELETED);
12095 return;
12096 }
12097
12098 /* Preserve register attributes for variable tracking. */
12099 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12100 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12101 GET_MODE_SIZE (halfmode));
12102
12103 /* Special case of reversed high/low parts. */
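/* The three XORs below exchange the two halves in place (the classic
   XOR-swap idiom), avoiding the need for a scratch register.  */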
12104 if (reg_overlap_mentioned_p (operands[2], destlo)
12105 && reg_overlap_mentioned_p (operands[1], desthi))
12106 {
12107 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12108 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12109 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12110 }
12111 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12112 {
12113 /* Try to avoid unnecessary moves if part of the result
12114 is in the right place already. */
12115 if (src1 != dest)
12116 emit_move_insn (destlo, operands[1]);
12117 if (src2 != dest + halfregs)
12118 emit_move_insn (desthi, operands[2]);
12119 }
12120 else
12121 {
12122 if (src2 != dest + halfregs)
12123 emit_move_insn (desthi, operands[2]);
12124 if (src1 != dest)
12125 emit_move_insn (destlo, operands[1]);
12126 }
12127 }
12128
12129 /* vec_perm support. */
12130
12131 #define MAX_VECT_LEN 16
12132
12133 struct expand_vec_perm_d
12134 {
12135 rtx target, op0, op1;
12136 unsigned char perm[MAX_VECT_LEN];
12137 machine_mode vmode;
12138 unsigned char nelt;
12139 bool one_vector_p;
12140 bool testing_p;
12141 };
12142
12143 /* Generate a variable permutation. */
12144
12145 static void
12146 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12147 {
12148 machine_mode vmode = GET_MODE (target);
12149 bool one_vector_p = rtx_equal_p (op0, op1);
12150
12151 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12152 gcc_checking_assert (GET_MODE (op0) == vmode);
12153 gcc_checking_assert (GET_MODE (op1) == vmode);
12154 gcc_checking_assert (GET_MODE (sel) == vmode);
12155 gcc_checking_assert (TARGET_SIMD);
12156
12157 if (one_vector_p)
12158 {
12159 if (vmode == V8QImode)
12160 {
12161 /* Expand the argument to a V16QI mode by duplicating it. */
12162 rtx pair = gen_reg_rtx (V16QImode);
12163 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12164 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12165 }
12166 else
12167 {
12168 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12169 }
12170 }
12171 else
12172 {
12173 rtx pair;
12174
12175 if (vmode == V8QImode)
12176 {
12177 pair = gen_reg_rtx (V16QImode);
12178 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12179 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12180 }
12181 else
12182 {
12183 pair = gen_reg_rtx (OImode);
12184 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12185 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12186 }
12187 }
12188 }
12189
12190 void
12191 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12192 {
12193 machine_mode vmode = GET_MODE (target);
12194 unsigned int nelt = GET_MODE_NUNITS (vmode);
12195 bool one_vector_p = rtx_equal_p (op0, op1);
12196 rtx mask;
12197
12198 /* The TBL instruction does not use a modulo index, so we must take care
12199 of that ourselves. */
12200 mask = aarch64_simd_gen_const_vector_dup (vmode,
12201 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12202 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12203
12204 /* For big-endian, we also need to reverse the index within the vector
12205 (but not which vector). */
12206 if (BYTES_BIG_ENDIAN)
12207 {
12208 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12209 if (!one_vector_p)
12210 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12211 sel = expand_simple_binop (vmode, XOR, sel, mask,
12212 NULL, 0, OPTAB_LIB_WIDEN);
12213 }
12214 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12215 }
12216
12217 /* Recognize patterns suitable for the TRN instructions. */
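/* Illustrative example (not part of the original source): for V4SImode the
   permutation {0, 4, 2, 6} is matched as TRN1 and {1, 5, 3, 7} as TRN2;
   the operands and the parity are swapped again for big-endian.  */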
12218 static bool
12219 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12220 {
12221 unsigned int i, odd, mask, nelt = d->nelt;
12222 rtx out, in0, in1, x;
12223 rtx (*gen) (rtx, rtx, rtx);
12224 machine_mode vmode = d->vmode;
12225
12226 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12227 return false;
12228
12229 /* Note that these are little-endian tests.
12230 We correct for big-endian later. */
12231 if (d->perm[0] == 0)
12232 odd = 0;
12233 else if (d->perm[0] == 1)
12234 odd = 1;
12235 else
12236 return false;
12237 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12238
12239 for (i = 0; i < nelt; i += 2)
12240 {
12241 if (d->perm[i] != i + odd)
12242 return false;
12243 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12244 return false;
12245 }
12246
12247 /* Success! */
12248 if (d->testing_p)
12249 return true;
12250
12251 in0 = d->op0;
12252 in1 = d->op1;
12253 if (BYTES_BIG_ENDIAN)
12254 {
12255 x = in0, in0 = in1, in1 = x;
12256 odd = !odd;
12257 }
12258 out = d->target;
12259
12260 if (odd)
12261 {
12262 switch (vmode)
12263 {
12264 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12265 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12266 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12267 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12268 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12269 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12270 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12271 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12272 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12273 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12274 default:
12275 return false;
12276 }
12277 }
12278 else
12279 {
12280 switch (vmode)
12281 {
12282 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12283 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12284 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12285 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12286 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12287 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12288 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12289 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12290 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12291 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12292 default:
12293 return false;
12294 }
12295 }
12296
12297 emit_insn (gen (out, in0, in1));
12298 return true;
12299 }
12300
12301 /* Recognize patterns suitable for the UZP instructions. */
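/* Illustrative example (not part of the original source): for V4SImode the
   permutation {0, 2, 4, 6} selects the even-indexed elements of the
   concatenated inputs and is matched as UZP1, while {1, 3, 5, 7} selects
   the odd-indexed elements and is matched as UZP2.  */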
12302 static bool
12303 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12304 {
12305 unsigned int i, odd, mask, nelt = d->nelt;
12306 rtx out, in0, in1, x;
12307 rtx (*gen) (rtx, rtx, rtx);
12308 machine_mode vmode = d->vmode;
12309
12310 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12311 return false;
12312
12313 /* Note that these are little-endian tests.
12314 We correct for big-endian later. */
12315 if (d->perm[0] == 0)
12316 odd = 0;
12317 else if (d->perm[0] == 1)
12318 odd = 1;
12319 else
12320 return false;
12321 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12322
12323 for (i = 0; i < nelt; i++)
12324 {
12325 unsigned elt = (i * 2 + odd) & mask;
12326 if (d->perm[i] != elt)
12327 return false;
12328 }
12329
12330 /* Success! */
12331 if (d->testing_p)
12332 return true;
12333
12334 in0 = d->op0;
12335 in1 = d->op1;
12336 if (BYTES_BIG_ENDIAN)
12337 {
12338 x = in0, in0 = in1, in1 = x;
12339 odd = !odd;
12340 }
12341 out = d->target;
12342
12343 if (odd)
12344 {
12345 switch (vmode)
12346 {
12347 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12348 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12349 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12350 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12351 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12352 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12353 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12354 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12355 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12356 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12357 default:
12358 return false;
12359 }
12360 }
12361 else
12362 {
12363 switch (vmode)
12364 {
12365 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12366 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12367 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12368 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12369 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12370 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12371 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12372 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12373 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12374 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12375 default:
12376 return false;
12377 }
12378 }
12379
12380 emit_insn (gen (out, in0, in1));
12381 return true;
12382 }
12383
12384 /* Recognize patterns suitable for the ZIP instructions. */
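/* Illustrative example (not part of the original source): for V4SImode the
   permutation {0, 4, 1, 5} interleaves the low halves of the two inputs and
   is matched as ZIP1, while {2, 6, 3, 7} interleaves the high halves and is
   matched as ZIP2.  */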
12385 static bool
12386 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12387 {
12388 unsigned int i, high, mask, nelt = d->nelt;
12389 rtx out, in0, in1, x;
12390 rtx (*gen) (rtx, rtx, rtx);
12391 machine_mode vmode = d->vmode;
12392
12393 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12394 return false;
12395
12396 /* Note that these are little-endian tests.
12397 We correct for big-endian later. */
12398 high = nelt / 2;
12399 if (d->perm[0] == high)
12400 /* Do Nothing. */
12401 ;
12402 else if (d->perm[0] == 0)
12403 high = 0;
12404 else
12405 return false;
12406 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12407
12408 for (i = 0; i < nelt / 2; i++)
12409 {
12410 unsigned elt = (i + high) & mask;
12411 if (d->perm[i * 2] != elt)
12412 return false;
12413 elt = (elt + nelt) & mask;
12414 if (d->perm[i * 2 + 1] != elt)
12415 return false;
12416 }
12417
12418 /* Success! */
12419 if (d->testing_p)
12420 return true;
12421
12422 in0 = d->op0;
12423 in1 = d->op1;
12424 if (BYTES_BIG_ENDIAN)
12425 {
12426 x = in0, in0 = in1, in1 = x;
12427 high = !high;
12428 }
12429 out = d->target;
12430
12431 if (high)
12432 {
12433 switch (vmode)
12434 {
12435 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12436 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12437 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12438 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12439 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12440 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12441 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12442 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12443 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12444 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12445 default:
12446 return false;
12447 }
12448 }
12449 else
12450 {
12451 switch (vmode)
12452 {
12453 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12454 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12455 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12456 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12457 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12458 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12459 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12460 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12461 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12462 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12463 default:
12464 return false;
12465 }
12466 }
12467
12468 emit_insn (gen (out, in0, in1));
12469 return true;
12470 }
12471
12472 /* Recognize patterns for the EXT insn. */
12473
12474 static bool
12475 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12476 {
12477 unsigned int i, nelt = d->nelt;
12478 rtx (*gen) (rtx, rtx, rtx, rtx);
12479 rtx offset;
12480
12481 unsigned int location = d->perm[0]; /* Always < nelt. */
12482
12483 /* Check if the extracted indices are increasing by one. */
12484 for (i = 1; i < nelt; i++)
12485 {
12486 unsigned int required = location + i;
12487 if (d->one_vector_p)
12488 {
12489 /* We'll pass the same vector in twice, so allow indices to wrap. */
12490 required &= (nelt - 1);
12491 }
12492 if (d->perm[i] != required)
12493 return false;
12494 }
12495
12496 switch (d->vmode)
12497 {
12498 case V16QImode: gen = gen_aarch64_extv16qi; break;
12499 case V8QImode: gen = gen_aarch64_extv8qi; break;
12500 case V4HImode: gen = gen_aarch64_extv4hi; break;
12501 case V8HImode: gen = gen_aarch64_extv8hi; break;
12502 case V2SImode: gen = gen_aarch64_extv2si; break;
12503 case V4SImode: gen = gen_aarch64_extv4si; break;
12504 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12505 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12506 case V2DImode: gen = gen_aarch64_extv2di; break;
12507 case V2DFmode: gen = gen_aarch64_extv2df; break;
12508 default:
12509 return false;
12510 }
12511
12512 /* Success! */
12513 if (d->testing_p)
12514 return true;
12515
12516 /* The case where (location == 0) is a no-op for both big- and little-endian,
12517 and is removed by the mid-end at optimization levels -O1 and higher. */
12518
12519 if (BYTES_BIG_ENDIAN && (location != 0))
12520 {
12521 /* After setup, we want the high elements of the first vector (stored
12522 at the LSB end of the register), and the low elements of the second
12523 vector (stored at the MSB end of the register). So swap. */
12524 std::swap (d->op0, d->op1);
12525 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12526 location = nelt - location;
12527 }
12528
12529 offset = GEN_INT (location);
12530 emit_insn (gen (d->target, d->op0, d->op1, offset));
12531 return true;
12532 }
12533
12534 /* Recognize patterns for the REV insns. */
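/* Illustrative example (not part of the original source): a single-input
   V4SImode permutation {1, 0, 3, 2} (diff == 1) swaps adjacent 32-bit
   elements within each 64-bit chunk and is matched as REV64, while a
   V16QImode permutation with diff == 3 reverses the bytes within each
   32-bit chunk and is matched as REV32.  */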
12535
12536 static bool
12537 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12538 {
12539 unsigned int i, j, diff, nelt = d->nelt;
12540 rtx (*gen) (rtx, rtx);
12541
12542 if (!d->one_vector_p)
12543 return false;
12544
12545 diff = d->perm[0];
12546 switch (diff)
12547 {
12548 case 7:
12549 switch (d->vmode)
12550 {
12551 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12552 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12553 default:
12554 return false;
12555 }
12556 break;
12557 case 3:
12558 switch (d->vmode)
12559 {
12560 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12561 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12562 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12563 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12564 default:
12565 return false;
12566 }
12567 break;
12568 case 1:
12569 switch (d->vmode)
12570 {
12571 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12572 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12573 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12574 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12575 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12576 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12577 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12578 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12579 default:
12580 return false;
12581 }
12582 break;
12583 default:
12584 return false;
12585 }
12586
12587 for (i = 0; i < nelt ; i += diff + 1)
12588 for (j = 0; j <= diff; j += 1)
12589 {
12590 /* This is guaranteed to be true as the value of diff
12591 is 7, 3 or 1 and we should have enough elements in the
12592 queue to generate this. Getting a vector mask with a
12593 value of diff other than these values implies that
12594 something is wrong by the time we get here. */
12595 gcc_assert (i + j < nelt);
12596 if (d->perm[i + j] != i + diff - j)
12597 return false;
12598 }
12599
12600 /* Success! */
12601 if (d->testing_p)
12602 return true;
12603
12604 emit_insn (gen (d->target, d->op0));
12605 return true;
12606 }
12607
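/* Recognize a broadcast of a single element.  Illustrative example (not
   part of the original source): for V4SImode the permutation {2, 2, 2, 2}
   is matched as a DUP of lane 2 of the first operand.  */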
12608 static bool
12609 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12610 {
12611 rtx (*gen) (rtx, rtx, rtx);
12612 rtx out = d->target;
12613 rtx in0;
12614 machine_mode vmode = d->vmode;
12615 unsigned int i, elt, nelt = d->nelt;
12616 rtx lane;
12617
12618 elt = d->perm[0];
12619 for (i = 1; i < nelt; i++)
12620 {
12621 if (elt != d->perm[i])
12622 return false;
12623 }
12624
12625 /* The generic preparation in aarch64_expand_vec_perm_const_1
12626 swaps the operand order and the permute indices if it finds
12627 d->perm[0] to be in the second operand. Thus, we can always
12628 use d->op0 and need not do any extra arithmetic to get the
12629 correct lane number. */
12630 in0 = d->op0;
12631 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12632
12633 switch (vmode)
12634 {
12635 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12636 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12637 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12638 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12639 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12640 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12641 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12642 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12643 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12644 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12645 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12646 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12647 default:
12648 return false;
12649 }
12650
12651 emit_insn (gen (out, in0, lane));
12652 return true;
12653 }
12654
12655 static bool
12656 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12657 {
12658 rtx rperm[MAX_VECT_LEN], sel;
12659 machine_mode vmode = d->vmode;
12660 unsigned int i, nelt = d->nelt;
12661
12662 if (d->testing_p)
12663 return true;
12664
12665 /* Generic code will try constant permutation twice: once with the
12666 original mode and again with the elements lowered to QImode. So wait
12667 and don't do the selector expansion ourselves. */
12668 if (vmode != V8QImode && vmode != V16QImode)
12669 return false;
12670
12671 for (i = 0; i < nelt; ++i)
12672 {
12673 int nunits = GET_MODE_NUNITS (vmode);
12674
12675 /* If big-endian and two vectors we end up with a weird mixed-endian
12676 mode on NEON. Reverse the index within each word but not the word
12677 itself. */
12678 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12679 : d->perm[i]);
12680 }
12681 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12682 sel = force_reg (vmode, sel);
12683
12684 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12685 return true;
12686 }
12687
12688 static bool
12689 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12690 {
12691 /* The pattern matching functions above are written to look for a small
12692 number to begin the sequence (0, 1, N/2). If we begin with an index
12693 from the second operand, we can swap the operands. */
12694 if (d->perm[0] >= d->nelt)
12695 {
12696 unsigned i, nelt = d->nelt;
12697
12698 gcc_assert (nelt == (nelt & -nelt));
12699 for (i = 0; i < nelt; ++i)
12700 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12701
12702 std::swap (d->op0, d->op1);
12703 }
12704
12705 if (TARGET_SIMD)
12706 {
12707 if (aarch64_evpc_rev (d))
12708 return true;
12709 else if (aarch64_evpc_ext (d))
12710 return true;
12711 else if (aarch64_evpc_dup (d))
12712 return true;
12713 else if (aarch64_evpc_zip (d))
12714 return true;
12715 else if (aarch64_evpc_uzp (d))
12716 return true;
12717 else if (aarch64_evpc_trn (d))
12718 return true;
12719 return aarch64_evpc_tbl (d);
12720 }
12721 return false;
12722 }
12723
12724 /* Expand a vec_perm_const pattern. */
12725
12726 bool
12727 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12728 {
12729 struct expand_vec_perm_d d;
12730 int i, nelt, which;
12731
12732 d.target = target;
12733 d.op0 = op0;
12734 d.op1 = op1;
12735
12736 d.vmode = GET_MODE (target);
12737 gcc_assert (VECTOR_MODE_P (d.vmode));
12738 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12739 d.testing_p = false;
12740
12741 for (i = which = 0; i < nelt; ++i)
12742 {
12743 rtx e = XVECEXP (sel, 0, i);
12744 int ei = INTVAL (e) & (2 * nelt - 1);
12745 which |= (ei < nelt ? 1 : 2);
12746 d.perm[i] = ei;
12747 }
12748
12749 switch (which)
12750 {
12751 default:
12752 gcc_unreachable ();
12753
12754 case 3:
12755 d.one_vector_p = false;
12756 if (!rtx_equal_p (op0, op1))
12757 break;
12758
12759 /* The elements of PERM do not suggest that only the first operand
12760 is used, but both operands are identical. Allow easier matching
12761 of the permutation by folding the permutation into the single
12762 input vector. */
12763 /* Fall Through. */
12764 case 2:
12765 for (i = 0; i < nelt; ++i)
12766 d.perm[i] &= nelt - 1;
12767 d.op0 = op1;
12768 d.one_vector_p = true;
12769 break;
12770
12771 case 1:
12772 d.op1 = op0;
12773 d.one_vector_p = true;
12774 break;
12775 }
12776
12777 return aarch64_expand_vec_perm_const_1 (&d);
12778 }
12779
12780 static bool
12781 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12782 const unsigned char *sel)
12783 {
12784 struct expand_vec_perm_d d;
12785 unsigned int i, nelt, which;
12786 bool ret;
12787
12788 d.vmode = vmode;
12789 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12790 d.testing_p = true;
12791 memcpy (d.perm, sel, nelt);
12792
12793 /* Calculate whether all elements are in one vector. */
12794 for (i = which = 0; i < nelt; ++i)
12795 {
12796 unsigned char e = d.perm[i];
12797 gcc_assert (e < 2 * nelt);
12798 which |= (e < nelt ? 1 : 2);
12799 }
12800
12801 /* If all elements are from the second vector, reindex as if from the
12802 first vector. */
12803 if (which == 2)
12804 for (i = 0; i < nelt; ++i)
12805 d.perm[i] -= nelt;
12806
12807 /* Check whether the mask can be applied to a single vector. */
12808 d.one_vector_p = (which != 3);
12809
12810 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12811 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12812 if (!d.one_vector_p)
12813 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12814
12815 start_sequence ();
12816 ret = aarch64_expand_vec_perm_const_1 (&d);
12817 end_sequence ();
12818
12819 return ret;
12820 }
12821
12822 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
12823 bool
12824 aarch64_cannot_change_mode_class (machine_mode from,
12825 machine_mode to,
12826 enum reg_class rclass)
12827 {
12828 /* We cannot allow word_mode subregs of full vector modes.
12829 Otherwise the middle-end will assume it's ok to store to
12830 (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
12831 of the 128-bit register. However, after reload the subreg will
12832 be dropped leaving a plain DImode store. See PR67609 for a more
12833 detailed dicussion. In all other cases, we want to be permissive
12834 and return false. */
12835 return (reg_classes_intersect_p (FP_REGS, rclass)
12836 && GET_MODE_SIZE (to) == UNITS_PER_WORD
12837 && GET_MODE_SIZE (from) > UNITS_PER_WORD);
12838 }
12839
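/* Illustrative example (not part of the original source): for V4SImode the
   byte-selection mask built below is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. it reverses the byte order within each 32-bit element.  */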
12840 rtx
12841 aarch64_reverse_mask (enum machine_mode mode)
12842 {
12843 /* We have to reverse each vector because we don't have
12844 a permuted load that can reverse-load according to ABI rules. */
12845 rtx mask;
12846 rtvec v = rtvec_alloc (16);
12847 int i, j;
12848 int nunits = GET_MODE_NUNITS (mode);
12849 int usize = GET_MODE_UNIT_SIZE (mode);
12850
12851 gcc_assert (BYTES_BIG_ENDIAN);
12852 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12853
12854 for (i = 0; i < nunits; i++)
12855 for (j = 0; j < usize; j++)
12856 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12857 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12858 return force_reg (V16QImode, mask);
12859 }
12860
12861 /* Implement MODES_TIEABLE_P. */
12862
12863 bool
12864 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12865 {
12866 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12867 return true;
12868
12869 /* We specifically want to allow elements of "structure" modes to
12870 be tieable to the structure. This more general condition allows
12871 other rarer situations too. */
12872 if (TARGET_SIMD
12873 && aarch64_vector_mode_p (mode1)
12874 && aarch64_vector_mode_p (mode2))
12875 return true;
12876
12877 return false;
12878 }
12879
12880 /* Return a new RTX holding the result of moving POINTER forward by
12881 AMOUNT bytes. */
12882
12883 static rtx
12884 aarch64_move_pointer (rtx pointer, int amount)
12885 {
12886 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12887
12888 return adjust_automodify_address (pointer, GET_MODE (pointer),
12889 next, amount);
12890 }
12891
12892 /* Return a new RTX holding the result of moving POINTER forward by the
12893 size of the mode it points to. */
12894
12895 static rtx
12896 aarch64_progress_pointer (rtx pointer)
12897 {
12898 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12899
12900 return aarch64_move_pointer (pointer, amount);
12901 }
12902
12903 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12904 the size of MODE. */
12905
12906 static void
12907 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12908 machine_mode mode)
12909 {
12910 rtx reg = gen_reg_rtx (mode);
12911
12912 /* "Cast" the pointers to the correct mode. */
12913 *src = adjust_address (*src, mode, 0);
12914 *dst = adjust_address (*dst, mode, 0);
12915 /* Emit the memcpy. */
12916 emit_move_insn (reg, *src);
12917 emit_move_insn (*dst, reg);
12918 /* Move the pointers forward. */
12919 *src = aarch64_progress_pointer (*src);
12920 *dst = aarch64_progress_pointer (*dst);
12921 }
12922
12923 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12924 we succeed, otherwise return false. */
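/* Illustrative example (not part of the original source): a constant copy of
   22 bytes is expanded as one 16-byte (TImode) block copy followed by a
   single 8-byte (DImode) copy of bytes 14..21, i.e. the final chunk
   deliberately overlaps the previous one by two bytes rather than being
   split into smaller copies.  */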
12925
12926 bool
12927 aarch64_expand_movmem (rtx *operands)
12928 {
12929 unsigned int n;
12930 rtx dst = operands[0];
12931 rtx src = operands[1];
12932 rtx base;
12933 bool speed_p = !optimize_function_for_size_p (cfun);
12934
12935 /* When optimizing for size, give a better estimate of the length of a
12936 memcpy call, but use the default otherwise. */
12937 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12938
12939 /* We can't do anything smart if the amount to copy is not constant. */
12940 if (!CONST_INT_P (operands[2]))
12941 return false;
12942
12943 n = UINTVAL (operands[2]);
12944
12945 /* Try to keep the number of instructions low. For cases below 16 bytes we
12946 need to make at most two moves. For cases above 16 bytes it will be one
12947 move for each 16 byte chunk, then at most two additional moves. */
12948 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12949 return false;
12950
12951 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12952 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12953
12954 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12955 src = adjust_automodify_address (src, VOIDmode, base, 0);
12956
12957 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12958 1-byte chunk. */
12959 if (n < 4)
12960 {
12961 if (n >= 2)
12962 {
12963 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12964 n -= 2;
12965 }
12966
12967 if (n == 1)
12968 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12969
12970 return true;
12971 }
12972
12973 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
12974 4-byte chunk, partially overlapping with the previously copied chunk. */
12975 if (n < 8)
12976 {
12977 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12978 n -= 4;
12979 if (n > 0)
12980 {
12981 int move = n - 4;
12982
12983 src = aarch64_move_pointer (src, move);
12984 dst = aarch64_move_pointer (dst, move);
12985 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12986 }
12987 return true;
12988 }
12989
12990 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
12991 them, then (if applicable) an 8-byte chunk. */
12992 while (n >= 8)
12993 {
12994 if (n / 16)
12995 {
12996 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12997 n -= 16;
12998 }
12999 else
13000 {
13001 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13002 n -= 8;
13003 }
13004 }
13005
13006 /* Finish the final bytes of the copy. We can always do this in one
13007 instruction. We either copy the exact amount we need, or partially
13008 overlap with the previous chunk we copied and copy 8-bytes. */
13009 if (n == 0)
13010 return true;
13011 else if (n == 1)
13012 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13013 else if (n == 2)
13014 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13015 else if (n == 4)
13016 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13017 else
13018 {
13019 if (n == 3)
13020 {
13021 src = aarch64_move_pointer (src, -1);
13022 dst = aarch64_move_pointer (dst, -1);
13023 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13024 }
13025 else
13026 {
13027 int move = n - 8;
13028
13029 src = aarch64_move_pointer (src, move);
13030 dst = aarch64_move_pointer (dst, move);
13031 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13032 }
13033 }
13034
13035 return true;
13036 }
13037
13038 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
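/* Illustrative note (not part of the original source): with the usual
   AddressSanitizer 8-to-1 shadow mapping this corresponds to
   shadow_address = (address >> 3) + (1 << 36).  */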
13039
13040 static unsigned HOST_WIDE_INT
13041 aarch64_asan_shadow_offset (void)
13042 {
13043 return (HOST_WIDE_INT_1 << 36);
13044 }
13045
13046 static bool
13047 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13048 unsigned int align,
13049 enum by_pieces_operation op,
13050 bool speed_p)
13051 {
13052 /* STORE_BY_PIECES can be used when copying a constant string, but
13053 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13054 For now we always fail this and let the move_by_pieces code copy
13055 the string from read-only memory. */
13056 if (op == STORE_BY_PIECES)
13057 return false;
13058
13059 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13060 }
13061
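/* Illustrative sketch (not part of the original source): the two hooks below
   let the middle-end combine a condition such as (a == b && c < d) into,
   roughly,

       cmp   w_a, w_b
       ccmp  w_c, w_d, #<nzcv>, eq     // second compare performed only when
                                       // the first condition held; otherwise
                                       // the flags are set to <nzcv> so the
                                       // final test fails
       b.lt  ...                       // or a cset/csel of the final result

   where <nzcv> is the flag immediate chosen for the ccmp pattern.  */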
13062 static rtx
13063 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13064 int code, tree treeop0, tree treeop1)
13065 {
13066 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13067 rtx op0, op1;
13068 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13069 insn_code icode;
13070 struct expand_operand ops[4];
13071
13072 start_sequence ();
13073 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13074
13075 op_mode = GET_MODE (op0);
13076 if (op_mode == VOIDmode)
13077 op_mode = GET_MODE (op1);
13078
13079 switch (op_mode)
13080 {
13081 case QImode:
13082 case HImode:
13083 case SImode:
13084 cmp_mode = SImode;
13085 icode = CODE_FOR_cmpsi;
13086 break;
13087
13088 case DImode:
13089 cmp_mode = DImode;
13090 icode = CODE_FOR_cmpdi;
13091 break;
13092
13093 case SFmode:
13094 cmp_mode = SFmode;
13095 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13096 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13097 break;
13098
13099 case DFmode:
13100 cmp_mode = DFmode;
13101 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13102 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13103 break;
13104
13105 default:
13106 end_sequence ();
13107 return NULL_RTX;
13108 }
13109
13110 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13111 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13112 if (!op0 || !op1)
13113 {
13114 end_sequence ();
13115 return NULL_RTX;
13116 }
13117 *prep_seq = get_insns ();
13118 end_sequence ();
13119
13120 create_fixed_operand (&ops[0], op0);
13121 create_fixed_operand (&ops[1], op1);
13122
13123 start_sequence ();
13124 if (!maybe_expand_insn (icode, 2, ops))
13125 {
13126 end_sequence ();
13127 return NULL_RTX;
13128 }
13129 *gen_seq = get_insns ();
13130 end_sequence ();
13131
13132 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13133 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13134 }
13135
13136 static rtx
13137 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13138 tree treeop0, tree treeop1, int bit_code)
13139 {
13140 rtx op0, op1, target;
13141 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13142 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13143 insn_code icode;
13144 struct expand_operand ops[6];
13145 int aarch64_cond;
13146
13147 push_to_sequence ((rtx_insn*) *prep_seq);
13148 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13149
13150 op_mode = GET_MODE (op0);
13151 if (op_mode == VOIDmode)
13152 op_mode = GET_MODE (op1);
13153
13154 switch (op_mode)
13155 {
13156 case QImode:
13157 case HImode:
13158 case SImode:
13159 cmp_mode = SImode;
13160 icode = CODE_FOR_ccmpsi;
13161 break;
13162
13163 case DImode:
13164 cmp_mode = DImode;
13165 icode = CODE_FOR_ccmpdi;
13166 break;
13167
13168 case SFmode:
13169 cmp_mode = SFmode;
13170 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13171 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13172 break;
13173
13174 case DFmode:
13175 cmp_mode = DFmode;
13176 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13177 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13178 break;
13179
13180 default:
13181 end_sequence ();
13182 return NULL_RTX;
13183 }
13184
13185 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13186 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13187 if (!op0 || !op1)
13188 {
13189 end_sequence ();
13190 return NULL_RTX;
13191 }
13192 *prep_seq = get_insns ();
13193 end_sequence ();
13194
13195 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13196 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13197
13198 if (bit_code != AND)
13199 {
13200 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13201 GET_MODE (XEXP (prev, 0))),
13202 VOIDmode, XEXP (prev, 0), const0_rtx);
13203 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13204 }
13205
13206 create_fixed_operand (&ops[0], XEXP (prev, 0));
13207 create_fixed_operand (&ops[1], target);
13208 create_fixed_operand (&ops[2], op0);
13209 create_fixed_operand (&ops[3], op1);
13210 create_fixed_operand (&ops[4], prev);
13211 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13212
13213 push_to_sequence ((rtx_insn*) *gen_seq);
13214 if (!maybe_expand_insn (icode, 6, ops))
13215 {
13216 end_sequence ();
13217 return NULL_RTX;
13218 }
13219
13220 *gen_seq = get_insns ();
13221 end_sequence ();
13222
13223 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13224 }
13225
13226 #undef TARGET_GEN_CCMP_FIRST
13227 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13228
13229 #undef TARGET_GEN_CCMP_NEXT
13230 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13231
13232 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13233 instruction fusion of some sort. */
13234
13235 static bool
13236 aarch64_macro_fusion_p (void)
13237 {
13238 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13239 }
13240
13241
13242 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13243 should be kept together during scheduling. */
13244
13245 static bool
13246 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13247 {
13248 rtx set_dest;
13249 rtx prev_set = single_set (prev);
13250 rtx curr_set = single_set (curr);
13251 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13252 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13253
13254 if (!aarch64_macro_fusion_p ())
13255 return false;
13256
13257 if (simple_sets_p
13258 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13259 {
13260 /* We are trying to match:
13261 prev (mov) == (set (reg r0) (const_int imm16))
13262 curr (movk) == (set (zero_extract (reg r0)
13263 (const_int 16)
13264 (const_int 16))
13265 (const_int imm16_1)) */
13266
13267 set_dest = SET_DEST (curr_set);
13268
13269 if (GET_CODE (set_dest) == ZERO_EXTRACT
13270 && CONST_INT_P (SET_SRC (curr_set))
13271 && CONST_INT_P (SET_SRC (prev_set))
13272 && CONST_INT_P (XEXP (set_dest, 2))
13273 && INTVAL (XEXP (set_dest, 2)) == 16
13274 && REG_P (XEXP (set_dest, 0))
13275 && REG_P (SET_DEST (prev_set))
13276 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13277 {
13278 return true;
13279 }
13280 }
13281
13282 if (simple_sets_p
13283 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13284 {
13285
13286 /* We're trying to match:
13287 prev (adrp) == (set (reg r1)
13288 (high (symbol_ref ("SYM"))))
13289 curr (add) == (set (reg r0)
13290 (lo_sum (reg r1)
13291 (symbol_ref ("SYM"))))
13292 Note that r0 need not necessarily be the same as r1, especially
13293 during pre-regalloc scheduling. */
13294
13295 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13296 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13297 {
13298 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13299 && REG_P (XEXP (SET_SRC (curr_set), 0))
13300 && REGNO (XEXP (SET_SRC (curr_set), 0))
13301 == REGNO (SET_DEST (prev_set))
13302 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13303 XEXP (SET_SRC (curr_set), 1)))
13304 return true;
13305 }
13306 }
13307
13308 if (simple_sets_p
13309 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13310 {
13311
13312 /* We're trying to match:
13313 prev (movk) == (set (zero_extract (reg r0)
13314 (const_int 16)
13315 (const_int 32))
13316 (const_int imm16_1))
13317 curr (movk) == (set (zero_extract (reg r0)
13318 (const_int 16)
13319 (const_int 48))
13320 (const_int imm16_2)) */
13321
13322 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13323 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13324 && REG_P (XEXP (SET_DEST (prev_set), 0))
13325 && REG_P (XEXP (SET_DEST (curr_set), 0))
13326 && REGNO (XEXP (SET_DEST (prev_set), 0))
13327 == REGNO (XEXP (SET_DEST (curr_set), 0))
13328 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13329 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13330 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13331 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13332 && CONST_INT_P (SET_SRC (prev_set))
13333 && CONST_INT_P (SET_SRC (curr_set)))
13334 	return true;
13335     }
13336
13337   if (simple_sets_p
13338 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13339 {
13340 /* We're trying to match:
13341 prev (adrp) == (set (reg r0)
13342 (high (symbol_ref ("SYM"))))
13343 curr (ldr) == (set (reg r1)
13344 (mem (lo_sum (reg r0)
13345 (symbol_ref ("SYM")))))
13346 or
13347 curr (ldr) == (set (reg r1)
13348 (zero_extend (mem
13349 (lo_sum (reg r0)
13350 (symbol_ref ("SYM")))))) */
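      /* As assembly, e.g. (register numbers illustrative):
	   adrp	x0, SYM
	   ldr	x1, [x0, :lo12:SYM]  */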
13351 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13352 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13353 {
13354 rtx curr_src = SET_SRC (curr_set);
13355
13356 if (GET_CODE (curr_src) == ZERO_EXTEND)
13357 curr_src = XEXP (curr_src, 0);
13358
13359 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13360 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13361 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13362 == REGNO (SET_DEST (prev_set))
13363 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13364 XEXP (SET_SRC (prev_set), 0)))
13365 return true;
13366 }
13367 }
13368
13369 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13370 && aarch_crypto_can_dual_issue (prev, curr))
13371 return true;
13372
13373 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13374 && any_condjump_p (curr))
13375 {
13376 enum attr_type prev_type = get_attr_type (prev);
13377
13378       /* FIXME: this misses some cases that are considered simple arithmetic
13379 	 instructions for ThunderX.  Simple shifts are missed here.  */
13380 if (prev_type == TYPE_ALUS_SREG
13381 || prev_type == TYPE_ALUS_IMM
13382 || prev_type == TYPE_LOGICS_REG
13383 || prev_type == TYPE_LOGICS_IMM)
13384 return true;
13385 }
13386
13387 return false;
13388 }
13389
13390 /* If MEM is in the form of [base+offset], extract the two parts
13391    of the address into BASE and OFFSET, otherwise return false
13392    after clearing BASE and OFFSET.  */
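/* For example, (mem (plus (reg X1) (const_int 16))) yields BASE == (reg X1)
   and OFFSET == (const_int 16), while a plain (mem (reg X1)) yields
   OFFSET == const0_rtx.  */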
13393
13394 bool
13395 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13396 {
13397 rtx addr;
13398
13399 gcc_assert (MEM_P (mem));
13400
13401 addr = XEXP (mem, 0);
13402
13403 if (REG_P (addr))
13404 {
13405 *base = addr;
13406 *offset = const0_rtx;
13407 return true;
13408 }
13409
13410 if (GET_CODE (addr) == PLUS
13411 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13412 {
13413 *base = XEXP (addr, 0);
13414 *offset = XEXP (addr, 1);
13415 return true;
13416 }
13417
13418 *base = NULL_RTX;
13419 *offset = NULL_RTX;
13420
13421 return false;
13422 }
13423
13424 /* Types for scheduling fusion. */
13425 enum sched_fusion_type
13426 {
13427 SCHED_FUSION_NONE = 0,
13428 SCHED_FUSION_LD_SIGN_EXTEND,
13429 SCHED_FUSION_LD_ZERO_EXTEND,
13430 SCHED_FUSION_LD,
13431 SCHED_FUSION_ST,
13432 SCHED_FUSION_NUM
13433 };
13434
13435 /* If INSN is a load or store to an address of the form [base+offset],
13436    extract the two parts into BASE and OFFSET.  Return the scheduling
13437    fusion type of this INSN.  */
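/* For instance, a pattern like
     (set (reg:SI W0) (mem:SI (plus (reg X1) (const_int 8))))
   would typically be classified as SCHED_FUSION_LD with BASE == (reg X1)
   and OFFSET == (const_int 8); this is only an illustration of the cases
   handled below.  */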
13438
13439 static enum sched_fusion_type
13440 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13441 {
13442 rtx x, dest, src;
13443 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13444
13445 gcc_assert (INSN_P (insn));
13446 x = PATTERN (insn);
13447 if (GET_CODE (x) != SET)
13448 return SCHED_FUSION_NONE;
13449
13450 src = SET_SRC (x);
13451 dest = SET_DEST (x);
13452
13453 machine_mode dest_mode = GET_MODE (dest);
13454
13455 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13456 return SCHED_FUSION_NONE;
13457
13458 if (GET_CODE (src) == SIGN_EXTEND)
13459 {
13460 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13461 src = XEXP (src, 0);
13462 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13463 return SCHED_FUSION_NONE;
13464 }
13465 else if (GET_CODE (src) == ZERO_EXTEND)
13466 {
13467 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13468 src = XEXP (src, 0);
13469 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13470 return SCHED_FUSION_NONE;
13471 }
13472
13473 if (GET_CODE (src) == MEM && REG_P (dest))
13474 extract_base_offset_in_addr (src, base, offset);
13475 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13476 {
13477 fusion = SCHED_FUSION_ST;
13478 extract_base_offset_in_addr (dest, base, offset);
13479 }
13480 else
13481 return SCHED_FUSION_NONE;
13482
13483 if (*base == NULL_RTX || *offset == NULL_RTX)
13484 fusion = SCHED_FUSION_NONE;
13485
13486 return fusion;
13487 }
13488
13489 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13490
13491    Currently we only support fusing ldr and str instructions, so FUSION_PRI
13492    and PRI are only calculated for these instructions.  For other instructions,
13493    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13494    types of instruction fusion can be added by returning different priorities.
13495
13496 It's important that irrelevant instructions get the largest FUSION_PRI. */
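/* For example, two loads of the same fusion type through the same base
   register are given equal FUSION_PRI values so that they can be fused,
   while their PRI values differ with the memory offsets so that the access
   at the lower offset tends to be scheduled first (a sketch of the intent,
   not a guarantee).  */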
13497
13498 static void
13499 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13500 int *fusion_pri, int *pri)
13501 {
13502 int tmp, off_val;
13503 rtx base, offset;
13504 enum sched_fusion_type fusion;
13505
13506 gcc_assert (INSN_P (insn));
13507
13508 tmp = max_pri - 1;
13509 fusion = fusion_load_store (insn, &base, &offset);
13510 if (fusion == SCHED_FUSION_NONE)
13511 {
13512 *pri = tmp;
13513 *fusion_pri = tmp;
13514 return;
13515 }
13516
13517 /* Set FUSION_PRI according to fusion type and base register. */
13518 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13519
13520 /* Calculate PRI. */
13521 tmp /= 2;
13522
13523   /* The INSN with the smaller offset goes first.  */
13524 off_val = (int)(INTVAL (offset));
13525 if (off_val >= 0)
13526 tmp -= (off_val & 0xfffff);
13527 else
13528 tmp += ((- off_val) & 0xfffff);
13529
13530 *pri = tmp;
13531 return;
13532 }
13533
13534 /* Given OPERANDS of consecutive load/store, check if we can merge
13535 them into ldp/stp. LOAD is true if they are load instructions.
13536 MODE is the mode of memory operands. */
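/* For example (illustrative only), with MODE == SImode the consecutive loads

     ldr	w0, [x2]
     ldr	w1, [x2, #4]

   pass these checks and can later be emitted by the ldp/stp peepholes as

     ldp	w0, w1, [x2]  */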
13537
13538 bool
13539 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13540 enum machine_mode mode)
13541 {
13542 HOST_WIDE_INT offval_1, offval_2, msize;
13543 enum reg_class rclass_1, rclass_2;
13544 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13545
13546 if (load)
13547 {
13548 mem_1 = operands[1];
13549 mem_2 = operands[3];
13550 reg_1 = operands[0];
13551 reg_2 = operands[2];
13552 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13553 if (REGNO (reg_1) == REGNO (reg_2))
13554 return false;
13555 }
13556 else
13557 {
13558 mem_1 = operands[0];
13559 mem_2 = operands[2];
13560 reg_1 = operands[1];
13561 reg_2 = operands[3];
13562 }
13563
13564 /* The mems cannot be volatile. */
13565 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13566 return false;
13567
13568 /* Check if the addresses are in the form of [base+offset]. */
13569 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13570 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13571 return false;
13572 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13573 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13574 return false;
13575
13576   /* Check if the bases are the same.  */
13577 if (!rtx_equal_p (base_1, base_2))
13578 return false;
13579
13580 offval_1 = INTVAL (offset_1);
13581 offval_2 = INTVAL (offset_2);
13582 msize = GET_MODE_SIZE (mode);
13583 /* Check if the offsets are consecutive. */
13584 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13585 return false;
13586
13587 /* Check if the addresses are clobbered by load. */
13588 if (load)
13589 {
13590 if (reg_mentioned_p (reg_1, mem_1))
13591 return false;
13592
13593 /* In increasing order, the last load can clobber the address. */
13594 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13595 return false;
13596 }
13597
13598 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13599 rclass_1 = FP_REGS;
13600 else
13601 rclass_1 = GENERAL_REGS;
13602
13603 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13604 rclass_2 = FP_REGS;
13605 else
13606 rclass_2 = GENERAL_REGS;
13607
13608   /* Check if the registers are of the same class.  */
13609 if (rclass_1 != rclass_2)
13610 return false;
13611
13612 return true;
13613 }
13614
13615 /* Given OPERANDS of consecutive load/store, check if we can merge
13616 them into ldp/stp by adjusting the offset. LOAD is true if they
13617 are load instructions. MODE is the mode of memory operands.
13618
13619    Given the consecutive stores below:
13620
13621 str w1, [xb, 0x100]
13622 str w1, [xb, 0x104]
13623 str w1, [xb, 0x108]
13624 str w1, [xb, 0x10c]
13625
13626 Though the offsets are out of the range supported by stp, we can
13627 still pair them after adjusting the offset, like:
13628
13629 add scratch, xb, 0x100
13630 stp w1, w1, [scratch]
13631 stp w1, w1, [scratch, 0x8]
13632
13633 The peephole patterns detecting this opportunity should guarantee
13634    the scratch register is available.  */
13635
13636 bool
13637 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13638 enum machine_mode mode)
13639 {
13640 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13641 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13642 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13643 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13644
13645 if (load)
13646 {
13647 reg_1 = operands[0];
13648 mem_1 = operands[1];
13649 reg_2 = operands[2];
13650 mem_2 = operands[3];
13651 reg_3 = operands[4];
13652 mem_3 = operands[5];
13653 reg_4 = operands[6];
13654 mem_4 = operands[7];
13655 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13656 && REG_P (reg_3) && REG_P (reg_4));
13657 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13658 return false;
13659 }
13660 else
13661 {
13662 mem_1 = operands[0];
13663 reg_1 = operands[1];
13664 mem_2 = operands[2];
13665 reg_2 = operands[3];
13666 mem_3 = operands[4];
13667 reg_3 = operands[5];
13668 mem_4 = operands[6];
13669 reg_4 = operands[7];
13670 }
13671   /* Skip if the memory operand is by itself valid for ldp/stp.  */
13672 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13673 return false;
13674
13675 /* The mems cannot be volatile. */
13676 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13677       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13678 return false;
13679
13680 /* Check if the addresses are in the form of [base+offset]. */
13681 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13682 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13683 return false;
13684 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13685 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13686 return false;
13687 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13688 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13689 return false;
13690 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13691 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13692 return false;
13693
13694   /* Check if the bases are the same.  */
13695 if (!rtx_equal_p (base_1, base_2)
13696 || !rtx_equal_p (base_2, base_3)
13697 || !rtx_equal_p (base_3, base_4))
13698 return false;
13699
13700 offval_1 = INTVAL (offset_1);
13701 offval_2 = INTVAL (offset_2);
13702 offval_3 = INTVAL (offset_3);
13703 offval_4 = INTVAL (offset_4);
13704 msize = GET_MODE_SIZE (mode);
13705 /* Check if the offsets are consecutive. */
13706 if ((offval_1 != (offval_2 + msize)
13707 || offval_1 != (offval_3 + msize * 2)
13708 || offval_1 != (offval_4 + msize * 3))
13709 && (offval_4 != (offval_3 + msize)
13710 || offval_4 != (offval_2 + msize * 2)
13711 || offval_4 != (offval_1 + msize * 3)))
13712 return false;
13713
13714 /* Check if the addresses are clobbered by load. */
13715 if (load)
13716 {
13717 if (reg_mentioned_p (reg_1, mem_1)
13718 || reg_mentioned_p (reg_2, mem_2)
13719 || reg_mentioned_p (reg_3, mem_3))
13720 return false;
13721
13722 /* In increasing order, the last load can clobber the address. */
13723 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13724 return false;
13725 }
13726
13727 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13728 rclass_1 = FP_REGS;
13729 else
13730 rclass_1 = GENERAL_REGS;
13731
13732 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13733 rclass_2 = FP_REGS;
13734 else
13735 rclass_2 = GENERAL_REGS;
13736
13737 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13738 rclass_3 = FP_REGS;
13739 else
13740 rclass_3 = GENERAL_REGS;
13741
13742 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13743 rclass_4 = FP_REGS;
13744 else
13745 rclass_4 = GENERAL_REGS;
13746
13747   /* Check if the registers are of the same class.  */
13748 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13749 return false;
13750
13751 return true;
13752 }
13753
13754 /* Given OPERANDS of consecutive load/store, this function pairs them
13755    into ldp/stp after adjusting the offset.  It depends on the fact
13756    that the addresses of the load/store instructions are in increasing order.
13757    MODE is the mode of the memory operands.  CODE is the rtl operator
13758    which should be applied to all memory operands; it is SIGN_EXTEND,
13759    ZERO_EXTEND or UNKNOWN.  */
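/* For instance (illustrative numbers), with MODE == SImode four stores at
   offsets 0x104, 0x108, 0x10c and 0x110 from the base give msize == 4 and
   stp_off_limit == 0x100; adj_off becomes 0x100 and the new offsets become
   4, 8, 12 and 16, so after the scratch register is set to base + 0x100 the
   two stp instructions access [scratch, #4] and [scratch, #12].  */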
13760
13761 bool
13762 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13763 enum machine_mode mode, RTX_CODE code)
13764 {
13765 rtx base, offset, t1, t2;
13766 rtx mem_1, mem_2, mem_3, mem_4;
13767 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13768
13769 if (load)
13770 {
13771 mem_1 = operands[1];
13772 mem_2 = operands[3];
13773 mem_3 = operands[5];
13774 mem_4 = operands[7];
13775 }
13776 else
13777 {
13778 mem_1 = operands[0];
13779 mem_2 = operands[2];
13780 mem_3 = operands[4];
13781 mem_4 = operands[6];
13782 gcc_assert (code == UNKNOWN);
13783 }
13784
13785 extract_base_offset_in_addr (mem_1, &base, &offset);
13786 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13787
13788   /* Adjust the offset so that it fits in an ldp/stp instruction.  */
13789 msize = GET_MODE_SIZE (mode);
13790 stp_off_limit = msize * 0x40;
13791 off_val = INTVAL (offset);
13792 abs_off = (off_val < 0) ? -off_val : off_val;
13793 new_off = abs_off % stp_off_limit;
13794 adj_off = abs_off - new_off;
13795
13796 /* Further adjust to make sure all offsets are OK. */
13797 if ((new_off + msize * 2) >= stp_off_limit)
13798 {
13799 adj_off += stp_off_limit;
13800 new_off -= stp_off_limit;
13801 }
13802
13803 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13804 if (adj_off >= 0x1000)
13805 return false;
13806
13807 if (off_val < 0)
13808 {
13809 adj_off = -adj_off;
13810 new_off = -new_off;
13811 }
13812
13813 /* Create new memory references. */
13814 mem_1 = change_address (mem_1, VOIDmode,
13815 plus_constant (DImode, operands[8], new_off));
13816
13817 /* Check if the adjusted address is OK for ldp/stp. */
13818 if (!aarch64_mem_pair_operand (mem_1, mode))
13819 return false;
13820
13821 msize = GET_MODE_SIZE (mode);
13822 mem_2 = change_address (mem_2, VOIDmode,
13823 plus_constant (DImode,
13824 operands[8],
13825 new_off + msize));
13826 mem_3 = change_address (mem_3, VOIDmode,
13827 plus_constant (DImode,
13828 operands[8],
13829 new_off + msize * 2));
13830 mem_4 = change_address (mem_4, VOIDmode,
13831 plus_constant (DImode,
13832 operands[8],
13833 new_off + msize * 3));
13834
13835 if (code == ZERO_EXTEND)
13836 {
13837 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13838 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13839 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13840 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13841 }
13842 else if (code == SIGN_EXTEND)
13843 {
13844 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13845 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13846 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13847 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13848 }
13849
13850 if (load)
13851 {
13852 operands[1] = mem_1;
13853 operands[3] = mem_2;
13854 operands[5] = mem_3;
13855 operands[7] = mem_4;
13856 }
13857 else
13858 {
13859 operands[0] = mem_1;
13860 operands[2] = mem_2;
13861 operands[4] = mem_3;
13862 operands[6] = mem_4;
13863 }
13864
13865 /* Emit adjusting instruction. */
13866 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13867 /* Emit ldp/stp instructions. */
13868 t1 = gen_rtx_SET (operands[0], operands[1]);
13869 t2 = gen_rtx_SET (operands[2], operands[3]);
13870 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13871 t1 = gen_rtx_SET (operands[4], operands[5]);
13872 t2 = gen_rtx_SET (operands[6], operands[7]);
13873 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13874 return true;
13875 }
13876
13877 /* Return true if a pseudo register should be created and used to hold
13878    the GOT address for PIC code.  */
13879
13880 bool
13881 aarch64_use_pseudo_pic_reg (void)
13882 {
13883 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13884 }
13885
13886 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13887
13888 static int
13889 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13890 {
13891 switch (XINT (x, 1))
13892 {
13893 case UNSPEC_GOTSMALLPIC:
13894 case UNSPEC_GOTSMALLPIC28K:
13895 case UNSPEC_GOTTINYPIC:
13896 return 0;
13897 default:
13898 break;
13899 }
13900
13901 return default_unspec_may_trap_p (x, flags);
13902 }
13903
13904
13905 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13906 return the log2 of that value. Otherwise return -1. */
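/* For example, 8.0 gives 3 and 1.0 gives 0, while 3.0, 0.5 and -4.0 all
   give -1 (not a positive integral power of 2).  */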
13907
13908 int
13909 aarch64_fpconst_pow_of_2 (rtx x)
13910 {
13911 const REAL_VALUE_TYPE *r;
13912
13913 if (!CONST_DOUBLE_P (x))
13914 return -1;
13915
13916 r = CONST_DOUBLE_REAL_VALUE (x);
13917
13918 if (REAL_VALUE_NEGATIVE (*r)
13919 || REAL_VALUE_ISNAN (*r)
13920 || REAL_VALUE_ISINF (*r)
13921 || !real_isinteger (r, DFmode))
13922 return -1;
13923
13924 return exact_log2 (real_to_integer (r));
13925 }
13926
13927 /* If X is a vector of equal CONST_DOUBLE values and that value is
13928 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13929
13930 int
13931 aarch64_vec_fpconst_pow_of_2 (rtx x)
13932 {
13933 if (GET_CODE (x) != CONST_VECTOR)
13934 return -1;
13935
13936 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13937 return -1;
13938
13939 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13940 if (firstval <= 0)
13941 return -1;
13942
13943 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13944 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13945 return -1;
13946
13947 return firstval;
13948 }
13949
13950 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13951 static tree
13952 aarch64_promoted_type (const_tree t)
13953 {
13954 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13955 return float_type_node;
13956 return NULL_TREE;
13957 }
13958
13959 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13960
13961 static bool
13962 aarch64_optab_supported_p (int op, machine_mode, machine_mode,
13963 optimization_type opt_type)
13964 {
13965 switch (op)
13966 {
13967 case rsqrt_optab:
13968 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
13969
13970 default:
13971 return true;
13972 }
13973 }
13974
13975 #undef TARGET_ADDRESS_COST
13976 #define TARGET_ADDRESS_COST aarch64_address_cost
13977
13978 /* This hook determines whether unnamed bitfields affect the alignment
13979 of the containing structure. The hook returns true if the structure
13980 should inherit the alignment requirements of an unnamed bitfield's
13981 type. */
13982 #undef TARGET_ALIGN_ANON_BITFIELD
13983 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
13984
13985 #undef TARGET_ASM_ALIGNED_DI_OP
13986 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
13987
13988 #undef TARGET_ASM_ALIGNED_HI_OP
13989 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
13990
13991 #undef TARGET_ASM_ALIGNED_SI_OP
13992 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
13993
13994 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
13995 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
13996 hook_bool_const_tree_hwi_hwi_const_tree_true
13997
13998 #undef TARGET_ASM_FILE_START
13999 #define TARGET_ASM_FILE_START aarch64_start_file
14000
14001 #undef TARGET_ASM_OUTPUT_MI_THUNK
14002 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14003
14004 #undef TARGET_ASM_SELECT_RTX_SECTION
14005 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14006
14007 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14008 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14009
14010 #undef TARGET_BUILD_BUILTIN_VA_LIST
14011 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14012
14013 #undef TARGET_CALLEE_COPIES
14014 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14015
14016 #undef TARGET_CAN_ELIMINATE
14017 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14018
14019 #undef TARGET_CAN_INLINE_P
14020 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14021
14022 #undef TARGET_CANNOT_FORCE_CONST_MEM
14023 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14024
14025 #undef TARGET_CASE_VALUES_THRESHOLD
14026 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14027
14028 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14029 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14030
14031 /* Only the least significant bit is used for initialization guard
14032 variables. */
14033 #undef TARGET_CXX_GUARD_MASK_BIT
14034 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14035
14036 #undef TARGET_C_MODE_FOR_SUFFIX
14037 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14038
14039 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14040 #undef TARGET_DEFAULT_TARGET_FLAGS
14041 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14042 #endif
14043
14044 #undef TARGET_CLASS_MAX_NREGS
14045 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14046
14047 #undef TARGET_BUILTIN_DECL
14048 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14049
14050 #undef TARGET_BUILTIN_RECIPROCAL
14051 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14052
14053 #undef TARGET_EXPAND_BUILTIN
14054 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14055
14056 #undef TARGET_EXPAND_BUILTIN_VA_START
14057 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14058
14059 #undef TARGET_FOLD_BUILTIN
14060 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14061
14062 #undef TARGET_FUNCTION_ARG
14063 #define TARGET_FUNCTION_ARG aarch64_function_arg
14064
14065 #undef TARGET_FUNCTION_ARG_ADVANCE
14066 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14067
14068 #undef TARGET_FUNCTION_ARG_BOUNDARY
14069 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14070
14071 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14072 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14073
14074 #undef TARGET_FUNCTION_VALUE
14075 #define TARGET_FUNCTION_VALUE aarch64_function_value
14076
14077 #undef TARGET_FUNCTION_VALUE_REGNO_P
14078 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14079
14080 #undef TARGET_FRAME_POINTER_REQUIRED
14081 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14082
14083 #undef TARGET_GIMPLE_FOLD_BUILTIN
14084 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14085
14086 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14087 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14088
14089 #undef TARGET_INIT_BUILTINS
14090 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14091
14092 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14093 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14094 aarch64_ira_change_pseudo_allocno_class
14095
14096 #undef TARGET_LEGITIMATE_ADDRESS_P
14097 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14098
14099 #undef TARGET_LEGITIMATE_CONSTANT_P
14100 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14101
14102 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14103 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14104
14105 #undef TARGET_LRA_P
14106 #define TARGET_LRA_P hook_bool_void_true
14107
14108 #undef TARGET_MANGLE_TYPE
14109 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14110
14111 #undef TARGET_MEMORY_MOVE_COST
14112 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14113
14114 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14115 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14116
14117 #undef TARGET_MUST_PASS_IN_STACK
14118 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14119
14120 /* This target hook should return true if accesses to volatile bitfields
14121 should use the narrowest mode possible. It should return false if these
14122 accesses should use the bitfield container type. */
14123 #undef TARGET_NARROW_VOLATILE_BITFIELD
14124 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14125
14126 #undef TARGET_OPTION_OVERRIDE
14127 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14128
14129 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14130 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14131 aarch64_override_options_after_change
14132
14133 #undef TARGET_OPTION_SAVE
14134 #define TARGET_OPTION_SAVE aarch64_option_save
14135
14136 #undef TARGET_OPTION_RESTORE
14137 #define TARGET_OPTION_RESTORE aarch64_option_restore
14138
14139 #undef TARGET_OPTION_PRINT
14140 #define TARGET_OPTION_PRINT aarch64_option_print
14141
14142 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14143 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14144
14145 #undef TARGET_SET_CURRENT_FUNCTION
14146 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14147
14148 #undef TARGET_PASS_BY_REFERENCE
14149 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14150
14151 #undef TARGET_PREFERRED_RELOAD_CLASS
14152 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14153
14154 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14155 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14156
14157 #undef TARGET_PROMOTED_TYPE
14158 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14159
14160 #undef TARGET_SECONDARY_RELOAD
14161 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14162
14163 #undef TARGET_SHIFT_TRUNCATION_MASK
14164 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14165
14166 #undef TARGET_SETUP_INCOMING_VARARGS
14167 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14168
14169 #undef TARGET_STRUCT_VALUE_RTX
14170 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14171
14172 #undef TARGET_REGISTER_MOVE_COST
14173 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14174
14175 #undef TARGET_RETURN_IN_MEMORY
14176 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14177
14178 #undef TARGET_RETURN_IN_MSB
14179 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14180
14181 #undef TARGET_RTX_COSTS
14182 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14183
14184 #undef TARGET_SCHED_ISSUE_RATE
14185 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14186
14187 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14188 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14189 aarch64_sched_first_cycle_multipass_dfa_lookahead
14190
14191 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14192 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14193 aarch64_first_cycle_multipass_dfa_lookahead_guard
14194
14195 #undef TARGET_TRAMPOLINE_INIT
14196 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14197
14198 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14199 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14200
14201 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14202 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14203
14204 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14205 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14206
14207 #undef TARGET_VECTORIZE_ADD_STMT_COST
14208 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14209
14210 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14211 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14212 aarch64_builtin_vectorization_cost
14213
14214 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14215 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14216
14217 #undef TARGET_VECTORIZE_BUILTINS
14218 #define TARGET_VECTORIZE_BUILTINS
14219
14220 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14221 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14222 aarch64_builtin_vectorized_function
14223
14224 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14225 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14226 aarch64_autovectorize_vector_sizes
14227
14228 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14229 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14230 aarch64_atomic_assign_expand_fenv
14231
14232 /* Section anchor support. */
14233
14234 #undef TARGET_MIN_ANCHOR_OFFSET
14235 #define TARGET_MIN_ANCHOR_OFFSET -256
14236
14237 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14238 byte offset; we can do much more for larger data types, but have no way
14239 to determine the size of the access. We assume accesses are aligned. */
14240 #undef TARGET_MAX_ANCHOR_OFFSET
14241 #define TARGET_MAX_ANCHOR_OFFSET 4095
14242
14243 #undef TARGET_VECTOR_ALIGNMENT
14244 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14245
14246 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14247 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14248 aarch64_simd_vector_alignment_reachable
14249
14250 /* vec_perm support. */
14251
14252 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14253 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14254 aarch64_vectorize_vec_perm_const_ok
14255
14256 #undef TARGET_INIT_LIBFUNCS
14257 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14258
14259 #undef TARGET_FIXED_CONDITION_CODE_REGS
14260 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14261
14262 #undef TARGET_FLAGS_REGNUM
14263 #define TARGET_FLAGS_REGNUM CC_REGNUM
14264
14265 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14266 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14267
14268 #undef TARGET_ASAN_SHADOW_OFFSET
14269 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14270
14271 #undef TARGET_LEGITIMIZE_ADDRESS
14272 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14273
14274 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14275 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14276 aarch64_use_by_pieces_infrastructure_p
14277
14278 #undef TARGET_CAN_USE_DOLOOP_P
14279 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14280
14281 #undef TARGET_SCHED_MACRO_FUSION_P
14282 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14283
14284 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14285 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14286
14287 #undef TARGET_SCHED_FUSION_PRIORITY
14288 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14289
14290 #undef TARGET_UNSPEC_MAY_TRAP_P
14291 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14292
14293 #undef TARGET_USE_PSEUDO_PIC_REG
14294 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14295
14296 #undef TARGET_PRINT_OPERAND
14297 #define TARGET_PRINT_OPERAND aarch64_print_operand
14298
14299 #undef TARGET_PRINT_OPERAND_ADDRESS
14300 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14301
14302 #undef TARGET_OPTAB_SUPPORTED_P
14303 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14304
14305 struct gcc_target targetm = TARGET_INITIALIZER;
14306
14307 #include "gt-aarch64.h"