[gcc.git] / gcc / config / nvptx / nvptx.c
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
64 #include "dumpfile.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
68 #include "attribs.h"
69 #include "tree-vrp.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
72 #include "gimplify.h"
73 #include "tree-phinodes.h"
74 #include "cfgloop.h"
75 #include "fold-const.h"
76 #include "intl.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 #define WORKAROUND_PTXJIT_BUG 1
82 #define WORKAROUND_PTXJIT_BUG_2 1
83 #define WORKAROUND_PTXJIT_BUG_3 1
84
85 #define PTX_WARP_SIZE 32
86 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
87 #define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
88 #define PTX_WORKER_LENGTH 32
89 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
90
91 /* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA concept
92 thread block, which has had a maximum number of threads of 1024 since CUDA
93 version 2.x. */
94 #define PTX_CTA_SIZE 1024
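
/* An illustrative note on the launch geometry these limits imply (not
   used by the code): with vector_length == PTX_WARP_SIZE (32) and
   num_workers == PTX_WORKER_LENGTH (32), an OpenACC gang occupies
   32 * 32 == 1024 threads, which is exactly PTX_CTA_SIZE, i.e. one
   full thread block.  */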
95
96 /* The various PTX memory areas an object might reside in. */
97 enum nvptx_data_area
98 {
99 DATA_AREA_GENERIC,
100 DATA_AREA_GLOBAL,
101 DATA_AREA_SHARED,
102 DATA_AREA_LOCAL,
103 DATA_AREA_CONST,
104 DATA_AREA_PARAM,
105 DATA_AREA_MAX
106 };
107
108 /* We record the data area in the target symbol flags. */
109 #define SYMBOL_DATA_AREA(SYM) \
110 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
111 & 7)
112 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
113 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
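
/* Illustrative sketch of using these macros (assuming SYM is a
   SYMBOL_REF created elsewhere):

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     ...
     if (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED)
       ...emit a .shared address-space access...

   The area occupies three bits starting at SYMBOL_FLAG_MACH_DEP_SHIFT;
   SET_SYMBOL_DATA_AREA only ORs bits in, so it assumes the field was
   previously zero.  */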
114
115 /* Record the function decls we've written, and the libfuncs and function
116 decls corresponding to them. */
117 static std::stringstream func_decls;
118
119 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
120 {
121 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
122 static bool equal (rtx a, rtx b) { return a == b; }
123 };
124
125 static GTY((cache))
126 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
127
128 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
129 {
130 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
131 static bool equal (tree a, tree b) { return a == b; }
132 };
133
134 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
135 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
136
137 /* Buffer needed to broadcast across workers and vectors. This is
138 used for both worker-neutering and worker broadcasting, and
139 vector-neutering and broadcasting when vector_length > 32. It is
140 shared by all functions emitted. The buffer is placed in shared
141 memory. It'd be nice if PTX supported common blocks, because then
142 this could be shared across TUs (taking the largest size). */
143 static unsigned oacc_bcast_size;
144 static unsigned oacc_bcast_partition;
145 static unsigned oacc_bcast_align;
146 static GTY(()) rtx oacc_bcast_sym;
147
148 /* Buffer needed for worker reductions. This has to be distinct from
149 the worker broadcast array, as both may be live concurrently. */
150 static unsigned worker_red_size;
151 static unsigned worker_red_align;
152 static GTY(()) rtx worker_red_sym;
153
154 /* Buffer needed for vector reductions, when vector_length >
155 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
156 array, as both may be live concurrently. */
157 static unsigned vector_red_size;
158 static unsigned vector_red_align;
159 static unsigned vector_red_partition;
160 static GTY(()) rtx vector_red_sym;
161
162 /* Global lock variable, needed for 128bit worker & gang reductions. */
163 static GTY(()) tree global_lock_var;
164
165 /* True if any function references __nvptx_stacks. */
166 static bool need_softstack_decl;
167
168 /* True if any function references __nvptx_uni. */
169 static bool need_unisimt_decl;
170
171 static int nvptx_mach_max_workers ();
172
173 /* Allocate a new, cleared machine_function structure. */
174
175 static struct machine_function *
176 nvptx_init_machine_status (void)
177 {
178 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
179 p->return_mode = VOIDmode;
180 return p;
181 }
182
183 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
184 and -fopenacc is also enabled. */
185
186 static void
187 diagnose_openacc_conflict (bool optval, const char *optname)
188 {
189 if (flag_openacc && optval)
190 error ("option %s is not supported together with -fopenacc", optname);
191 }
192
193 /* Implement TARGET_OPTION_OVERRIDE. */
194
195 static void
196 nvptx_option_override (void)
197 {
198 init_machine_status = nvptx_init_machine_status;
199
200 /* Set toplevel_reorder, unless explicitly disabled. We need
201 reordering so that we emit necessary assembler decls of
202 undeclared variables. */
203 if (!global_options_set.x_flag_toplevel_reorder)
204 flag_toplevel_reorder = 1;
205
206 debug_nonbind_markers_p = 0;
207
208 /* Set flag_no_common, unless explicitly disabled. We fake common
209 using .weak, and that's not entirely accurate, so avoid it
210 unless forced. */
211 if (!global_options_set.x_flag_no_common)
212 flag_no_common = 1;
213
214 /* The patch area requires nops, which we don't have. */
215 if (function_entry_patch_area_size > 0)
216 sorry ("not generating patch area, nops not supported");
217
218 /* Assumes that it will see only hard registers. */
219 flag_var_tracking = 0;
220
221 if (nvptx_optimize < 0)
222 nvptx_optimize = optimize > 0;
223
224 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
225 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
226 declared_libfuncs_htab
227 = hash_table<declared_libfunc_hasher>::create_ggc (17);
228
229 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
230 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
231 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
232 oacc_bcast_partition = 0;
233
234 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
235 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
236 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
237
238 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
239 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
240 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
241 vector_red_partition = 0;
242
243 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
244 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
245 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
246
247 if (TARGET_GOMP)
248 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
249 }
250
251 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
252 deal with ptx idiosyncrasies. */
253
254 const char *
255 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
256 {
257 switch (mode)
258 {
259 case E_BLKmode:
260 return ".b8";
261 case E_BImode:
262 return ".pred";
263 case E_QImode:
264 if (promote)
265 return ".u32";
266 else
267 return ".u8";
268 case E_HImode:
269 return ".u16";
270 case E_SImode:
271 return ".u32";
272 case E_DImode:
273 return ".u64";
274
275 case E_SFmode:
276 return ".f32";
277 case E_DFmode:
278 return ".f64";
279
280 case E_V2SImode:
281 return ".v2.u32";
282 case E_V2DImode:
283 return ".v2.u64";
284
285 default:
286 gcc_unreachable ();
287 }
288 }
289
290 /* Encode the PTX data area that DECL (which might not actually be a
291 _DECL) should reside in. */
292
293 static void
294 nvptx_encode_section_info (tree decl, rtx rtl, int first)
295 {
296 default_encode_section_info (decl, rtl, first);
297 if (first && MEM_P (rtl))
298 {
299 nvptx_data_area area = DATA_AREA_GENERIC;
300
301 if (TREE_CONSTANT (decl))
302 area = DATA_AREA_CONST;
303 else if (TREE_CODE (decl) == VAR_DECL)
304 {
305 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
306 {
307 area = DATA_AREA_SHARED;
308 if (DECL_INITIAL (decl))
309 error ("static initialization of variable %q+D in %<.shared%>"
310 " memory is not supported", decl);
311 }
312 else
313 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
314 }
315
316 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
317 }
318 }
319
320 /* Return the PTX name of the data area in which SYM should be
321 placed. The symbol must have already been processed by
322 nvptx_encode_section_info, or equivalent. */
323
324 static const char *
325 section_for_sym (rtx sym)
326 {
327 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
328 /* Same order as nvptx_data_area enum. */
329 static char const *const areas[] =
330 {"", ".global", ".shared", ".local", ".const", ".param"};
331
332 return areas[area];
333 }
334
335 /* Similarly for a decl. */
336
337 static const char *
338 section_for_decl (const_tree decl)
339 {
340 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
341 }
342
343 /* Check NAME for special function names and redirect them by returning a
344 replacement. This applies to malloc, free and realloc, for which we
345 want to use libgcc wrappers, and call, which triggers a bug in
346 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
347 not active in an offload compiler -- the names are all set by the
348 host-side compiler. */
349
350 static const char *
351 nvptx_name_replacement (const char *name)
352 {
353 if (strcmp (name, "call") == 0)
354 return "__nvptx_call";
355 if (strcmp (name, "malloc") == 0)
356 return "__nvptx_malloc";
357 if (strcmp (name, "free") == 0)
358 return "__nvptx_free";
359 if (strcmp (name, "realloc") == 0)
360 return "__nvptx_realloc";
361 return name;
362 }
363
364 /* If MODE should be treated as two registers of an inner mode, return
365 that inner mode. Otherwise return VOIDmode. */
366
367 static machine_mode
368 maybe_split_mode (machine_mode mode)
369 {
370 if (COMPLEX_MODE_P (mode))
371 return GET_MODE_INNER (mode);
372
373 if (mode == TImode)
374 return DImode;
375
376 return VOIDmode;
377 }
378
379 /* Return true if mode should be treated as two registers. */
380
381 static bool
382 split_mode_p (machine_mode mode)
383 {
384 return maybe_split_mode (mode) != VOIDmode;
385 }
386
387 /* Output a register, subreg, or register pair (with optional
388 enclosing braces). */
389
390 static void
391 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
392 int subreg_offset = -1)
393 {
394 if (inner_mode == VOIDmode)
395 {
396 if (HARD_REGISTER_NUM_P (regno))
397 fprintf (file, "%s", reg_names[regno]);
398 else
399 fprintf (file, "%%r%d", regno);
400 }
401 else if (subreg_offset >= 0)
402 {
403 output_reg (file, regno, VOIDmode);
404 fprintf (file, "$%d", subreg_offset);
405 }
406 else
407 {
408 if (subreg_offset == -1)
409 fprintf (file, "{");
410 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
411 fprintf (file, ",");
412 output_reg (file, regno, inner_mode, 0);
413 if (subreg_offset == -1)
414 fprintf (file, "}");
415 }
416 }
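
/* For illustration: a TImode pseudo 23 is printed by output_reg with
   INNER_MODE == DImode as the register pair "{%r23$8,%r23$0}", whereas
   a plain SImode pseudo 23 (INNER_MODE == VOIDmode) is printed simply
   as "%r23".  */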
417
418 /* Emit forking instructions for MASK. */
419
420 static void
421 nvptx_emit_forking (unsigned mask, bool is_call)
422 {
423 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
424 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
425 if (mask)
426 {
427 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
428
429 /* Emit fork at all levels. This helps form SESE regions, as
430 it creates a block with a single successor before entering a
431 partitioned region. That is a good candidate for the end of
432 an SESE region. */
433 emit_insn (gen_nvptx_fork (op));
434 emit_insn (gen_nvptx_forked (op));
435 }
436 }
437
438 /* Emit joining instructions for MASK. */
439
440 static void
441 nvptx_emit_joining (unsigned mask, bool is_call)
442 {
443 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
444 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
445 if (mask)
446 {
447 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
448
449 /* Emit joining for all non-call pars to ensure there's a single
450 predecessor for the block the join insn ends up in. This is
451 needed for skipping entire loops. */
452 emit_insn (gen_nvptx_joining (op));
453 emit_insn (gen_nvptx_join (op));
454 }
455 }
456
457 \f
458 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
459 returned in memory. Integer and floating types supported by the
460 machine are passed in registers, everything else is passed in
461 memory. Complex types are split. */
462
463 static bool
464 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
465 {
466 if (type)
467 {
468 if (AGGREGATE_TYPE_P (type))
469 return true;
470 if (TREE_CODE (type) == VECTOR_TYPE)
471 return true;
472 }
473
474 if (!for_return && COMPLEX_MODE_P (mode))
475 /* Complex types are passed as two underlying args. */
476 mode = GET_MODE_INNER (mode);
477
478 if (GET_MODE_CLASS (mode) != MODE_INT
479 && GET_MODE_CLASS (mode) != MODE_FLOAT)
480 return true;
481
482 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
483 return true;
484
485 return false;
486 }
487
488 /* A non-memory argument of mode MODE is being passed, determine the mode it
489 should be promoted to. This is also used for determining return
490 type promotion. */
491
492 static machine_mode
493 promote_arg (machine_mode mode, bool prototyped)
494 {
495 if (!prototyped && mode == SFmode)
496 /* K&R float promotion for unprototyped functions. */
497 mode = DFmode;
498 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
499 mode = SImode;
500
501 return mode;
502 }
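
/* For example, QImode and HImode arguments are widened to SImode here,
   and an SFmode argument passed to an unprototyped callee is widened
   to DFmode, mirroring the C default argument promotions.  */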
503
504 /* A non-memory return type of MODE is being returned. Determine the
505 mode it should be promoted to. */
506
507 static machine_mode
508 promote_return (machine_mode mode)
509 {
510 return promote_arg (mode, true);
511 }
512
513 /* Implement TARGET_FUNCTION_ARG. */
514
515 static rtx
516 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
517 const_tree, bool named)
518 {
519 if (mode == VOIDmode || !named)
520 return NULL_RTX;
521
522 return gen_reg_rtx (mode);
523 }
524
525 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
526
527 static rtx
528 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
529 const_tree, bool named)
530 {
531 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
532
533 if (mode == VOIDmode || !named)
534 return NULL_RTX;
535
536 /* No need to deal with split modes here, the only case that can
537 happen is complex modes and those are dealt with by
538 TARGET_SPLIT_COMPLEX_ARG. */
539 return gen_rtx_UNSPEC (mode,
540 gen_rtvec (1, GEN_INT (cum->count)),
541 UNSPEC_ARG_REG);
542 }
543
544 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
545
546 static void
547 nvptx_function_arg_advance (cumulative_args_t cum_v,
548 machine_mode ARG_UNUSED (mode),
549 const_tree ARG_UNUSED (type),
550 bool ARG_UNUSED (named))
551 {
552 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
553
554 cum->count++;
555 }
556
557 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
558
559 For nvptx, this is only used for variadic args. The type has already
560 been promoted and/or converted to invisible reference. */
561
562 static unsigned
563 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
564 {
565 return GET_MODE_ALIGNMENT (mode);
566 }
567
568 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
569
570 For nvptx, we know how to handle functions declared as stdarg: by
571 passing an extra pointer to the unnamed arguments. However, the
572 Fortran frontend can produce a different situation, where a
573 function pointer is declared with no arguments, but the actual
574 function and calls to it take more arguments. In that case, we
575 want to ensure the call matches the definition of the function. */
576
577 static bool
578 nvptx_strict_argument_naming (cumulative_args_t cum_v)
579 {
580 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
581
582 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
583 }
584
585 /* Implement TARGET_LIBCALL_VALUE. */
586
587 static rtx
588 nvptx_libcall_value (machine_mode mode, const_rtx)
589 {
590 if (!cfun || !cfun->machine->doing_call)
591 /* Pretend to return in a hard reg for early uses before pseudos can be
592 generated. */
593 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
594
595 return gen_reg_rtx (mode);
596 }
597
598 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
599 where function FUNC returns or receives a value of data type TYPE. */
600
601 static rtx
602 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
603 bool outgoing)
604 {
605 machine_mode mode = promote_return (TYPE_MODE (type));
606
607 if (outgoing)
608 {
609 gcc_assert (cfun);
610 cfun->machine->return_mode = mode;
611 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
612 }
613
614 return nvptx_libcall_value (mode, NULL_RTX);
615 }
616
617 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
618
619 static bool
620 nvptx_function_value_regno_p (const unsigned int regno)
621 {
622 return regno == NVPTX_RETURN_REGNUM;
623 }
624
625 /* Types with a mode other than those supported by the machine are passed by
626 reference in memory. */
627
628 static bool
629 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
630 machine_mode mode, const_tree type,
631 bool ARG_UNUSED (named))
632 {
633 return pass_in_memory (mode, type, false);
634 }
635
636 /* Implement TARGET_RETURN_IN_MEMORY. */
637
638 static bool
639 nvptx_return_in_memory (const_tree type, const_tree)
640 {
641 return pass_in_memory (TYPE_MODE (type), type, true);
642 }
643
644 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
645
646 static machine_mode
647 nvptx_promote_function_mode (const_tree type, machine_mode mode,
648 int *ARG_UNUSED (punsignedp),
649 const_tree funtype, int for_return)
650 {
651 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
652 }
653
654 /* Helper for write_arg. Emit a single PTX argument of MODE, either
655 in a prototype, or as copy in a function prologue. ARGNO is the
656 index of this argument in the PTX function. FOR_REG is negative,
657 if we're emitting the PTX prototype. It is zero if we're copying
658 to an argument register and it is greater than zero if we're
659 copying to a specific hard register. */
660
661 static int
662 write_arg_mode (std::stringstream &s, int for_reg, int argno,
663 machine_mode mode)
664 {
665 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
666
667 if (for_reg < 0)
668 {
669 /* Writing PTX prototype. */
670 s << (argno ? ", " : " (");
671 s << ".param" << ptx_type << " %in_ar" << argno;
672 }
673 else
674 {
675 s << "\t.reg" << ptx_type << " ";
676 if (for_reg)
677 s << reg_names[for_reg];
678 else
679 s << "%ar" << argno;
680 s << ";\n";
681 if (argno >= 0)
682 {
683 s << "\tld.param" << ptx_type << " ";
684 if (for_reg)
685 s << reg_names[for_reg];
686 else
687 s << "%ar" << argno;
688 s << ", [%in_ar" << argno << "];\n";
689 }
690 }
691 return argno + 1;
692 }
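
/* An illustrative sketch of write_arg_mode's output for an SImode
   argument 0: the prototype case (FOR_REG < 0) appends
   " (.param.u32 %in_ar0", while the prologue case (FOR_REG == 0)
   emits

     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];

   copying the incoming parameter into a declared register.  */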
693
694 /* Process function parameter TYPE to emit one or more PTX
695 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
696 is true, if this is a prototyped function, rather than an old-style
697 C declaration. Returns the next argument number to use.
698
699 The promotion behavior here must match the regular GCC function
700 parameter marshalling machinery. */
701
702 static int
703 write_arg_type (std::stringstream &s, int for_reg, int argno,
704 tree type, bool prototyped)
705 {
706 machine_mode mode = TYPE_MODE (type);
707
708 if (mode == VOIDmode)
709 return argno;
710
711 if (pass_in_memory (mode, type, false))
712 mode = Pmode;
713 else
714 {
715 bool split = TREE_CODE (type) == COMPLEX_TYPE;
716
717 if (split)
718 {
719 /* Complex types are sent as two separate args. */
720 type = TREE_TYPE (type);
721 mode = TYPE_MODE (type);
722 prototyped = true;
723 }
724
725 mode = promote_arg (mode, prototyped);
726 if (split)
727 argno = write_arg_mode (s, for_reg, argno, mode);
728 }
729
730 return write_arg_mode (s, for_reg, argno, mode);
731 }
732
733 /* Emit a PTX return as a prototype or function prologue declaration
734 for MODE. */
735
736 static void
737 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
738 {
739 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
740 const char *pfx = "\t.reg";
741 const char *sfx = ";\n";
742
743 if (for_proto)
744 pfx = "(.param", sfx = "_out) ";
745
746 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
747 }
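
/* Illustrative output of write_return_mode for an SImode return value:
   a prototype gets "(.param.u32 %value_out) ", while a function
   prologue gets ".reg.u32 %value;" (assuming
   reg_names[NVPTX_RETURN_REGNUM] is "%value", as defined in nvptx.h).  */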
748
749 /* Process a function return TYPE to emit a PTX return as a prototype
750 or function prologue declaration. Returns true if return is via an
751 additional pointer parameter. The promotion behavior here must
752 match the regular GCC function return marshalling. */
753
754 static bool
755 write_return_type (std::stringstream &s, bool for_proto, tree type)
756 {
757 machine_mode mode = TYPE_MODE (type);
758
759 if (mode == VOIDmode)
760 return false;
761
762 bool return_in_mem = pass_in_memory (mode, type, true);
763
764 if (return_in_mem)
765 {
766 if (for_proto)
767 return return_in_mem;
768
769 /* Named return values can cause us to return a pointer as well
770 as expect an argument for the return location. This is
771 optimization-level specific, so no caller can make use of
772 this data, but more importantly for us, we must ensure it
773 doesn't change the PTX prototype. */
774 mode = (machine_mode) cfun->machine->return_mode;
775
776 if (mode == VOIDmode)
777 return return_in_mem;
778
779 /* Clear return_mode to inhibit copy of retval to non-existent
780 retval parameter. */
781 cfun->machine->return_mode = VOIDmode;
782 }
783 else
784 mode = promote_return (mode);
785
786 write_return_mode (s, for_proto, mode);
787
788 return return_in_mem;
789 }
790
791 /* Look for attributes in ATTRS that would indicate we must write a function
792 as a .entry kernel rather than a .func. Return true if one is found. */
793
794 static bool
795 write_as_kernel (tree attrs)
796 {
797 return (lookup_attribute ("kernel", attrs) != NULL_TREE
798 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
799 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
800 /* For OpenMP target regions, the corresponding kernel entry is emitted from
801 write_omp_entry as a separate function. */
802 }
803
804 /* Emit a linker marker for a function decl or defn. */
805
806 static void
807 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
808 const char *name)
809 {
810 s << "\n// BEGIN";
811 if (globalize)
812 s << " GLOBAL";
813 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
814 s << name << "\n";
815 }
816
817 /* Emit a linker marker for a variable decl or defn. */
818
819 static void
820 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
821 {
822 fprintf (file, "\n// BEGIN%s VAR %s: ",
823 globalize ? " GLOBAL" : "",
824 is_defn ? "DEF" : "DECL");
825 assemble_name_raw (file, name);
826 fputs ("\n", file);
827 }
828
829 /* Write a .func or .kernel declaration or definition along with
830 a helper comment for use by ld. S is the stream to write to, DECL
831 the decl for the function with name NAME. For definitions, emit
832 a declaration too. */
833
834 static const char *
835 write_fn_proto (std::stringstream &s, bool is_defn,
836 const char *name, const_tree decl)
837 {
838 if (is_defn)
839 /* Emit a declaration. The PTX assembler gets upset without it. */
840 name = write_fn_proto (s, false, name, decl);
841 else
842 {
843 /* Avoid repeating the name replacement. */
844 name = nvptx_name_replacement (name);
845 if (name[0] == '*')
846 name++;
847 }
848
849 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
850
851 /* PTX declaration. */
852 if (DECL_EXTERNAL (decl))
853 s << ".extern ";
854 else if (TREE_PUBLIC (decl))
855 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
856 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
857
858 tree fntype = TREE_TYPE (decl);
859 tree result_type = TREE_TYPE (fntype);
860
861 /* atomic_compare_exchange_$n builtins have an exceptional calling
862 convention. */
863 int not_atomic_weak_arg = -1;
864 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
865 switch (DECL_FUNCTION_CODE (decl))
866 {
867 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
868 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
869 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
870 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
871 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
872 /* These atomics skip the 'weak' parm in an actual library
873 call. We must skip it in the prototype too. */
874 not_atomic_weak_arg = 3;
875 break;
876
877 default:
878 break;
879 }
880
881 /* Declare the result. */
882 bool return_in_mem = write_return_type (s, true, result_type);
883
884 s << name;
885
886 int argno = 0;
887
888 /* Emit argument list. */
889 if (return_in_mem)
890 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
891
892 /* We get:
893 NULL in TYPE_ARG_TYPES, for old-style functions
894 NULL in DECL_ARGUMENTS, for builtin functions without another
895 declaration.
896 So we have to pick the best one we have. */
897 tree args = TYPE_ARG_TYPES (fntype);
898 bool prototyped = true;
899 if (!args)
900 {
901 args = DECL_ARGUMENTS (decl);
902 prototyped = false;
903 }
904
905 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
906 {
907 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
908
909 if (not_atomic_weak_arg)
910 argno = write_arg_type (s, -1, argno, type, prototyped);
911 else
912 gcc_assert (type == boolean_type_node);
913 }
914
915 if (stdarg_p (fntype))
916 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
917
918 if (DECL_STATIC_CHAIN (decl))
919 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
920
921 if (!argno && strcmp (name, "main") == 0)
922 {
923 argno = write_arg_type (s, -1, argno, integer_type_node, true);
924 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
925 }
926
927 if (argno)
928 s << ")";
929
930 s << (is_defn ? "\n" : ";\n");
931
932 return name;
933 }
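
/* Putting the pieces above together, a public function defined in this
   TU as "int foo (int)" would get a declaration along the lines of

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);

   (a sketch for illustration; the exact linkage directive and marker
   depend on the decl's visibility, weakness and attributes).  */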
934
935 /* Construct a function declaration from a call insn. This can be
936 necessary for two reasons - either we have an indirect call which
937 requires a .callprototype declaration, or we have a libcall
938 generated by emit_library_call for which no decl exists. */
939
940 static void
941 write_fn_proto_from_insn (std::stringstream &s, const char *name,
942 rtx result, rtx pat)
943 {
944 if (!name)
945 {
946 s << "\t.callprototype ";
947 name = "_";
948 }
949 else
950 {
951 name = nvptx_name_replacement (name);
952 write_fn_marker (s, false, true, name);
953 s << "\t.extern .func ";
954 }
955
956 if (result != NULL_RTX)
957 write_return_mode (s, true, GET_MODE (result));
958
959 s << name;
960
961 int arg_end = XVECLEN (pat, 0);
962 for (int i = 1; i < arg_end; i++)
963 {
964 /* We don't have to deal with mode splitting & promotion here,
965 as that was already done when generating the call
966 sequence. */
967 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
968
969 write_arg_mode (s, -1, i - 1, mode);
970 }
971 if (arg_end != 1)
972 s << ")";
973 s << ";\n";
974 }
975
976 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
977 table and write a ptx prototype. These are emitted at end of
978 compilation. */
979
980 static void
981 nvptx_record_fndecl (tree decl)
982 {
983 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
984 if (*slot == NULL)
985 {
986 *slot = decl;
987 const char *name = get_fnname_from_decl (decl);
988 write_fn_proto (func_decls, false, name, decl);
989 }
990 }
991
992 /* Record a libcall or unprototyped external function. CALLEE is the
993 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
994 declaration for it. */
995
996 static void
997 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
998 {
999 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1000 if (*slot == NULL)
1001 {
1002 *slot = callee;
1003
1004 const char *name = XSTR (callee, 0);
1005 write_fn_proto_from_insn (func_decls, name, retval, pat);
1006 }
1007 }
1008
1009 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
1010 is prototyped, record it now. Otherwise record it as needed at end
1011 of compilation, when we might have more information about it. */
1012
1013 void
1014 nvptx_record_needed_fndecl (tree decl)
1015 {
1016 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1017 {
1018 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1019 if (*slot == NULL)
1020 *slot = decl;
1021 }
1022 else
1023 nvptx_record_fndecl (decl);
1024 }
1025
1026 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1027 it as needed. */
1028
1029 static void
1030 nvptx_maybe_record_fnsym (rtx sym)
1031 {
1032 tree decl = SYMBOL_REF_DECL (sym);
1033
1034 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1035 nvptx_record_needed_fndecl (decl);
1036 }
1037
1038 /* Emit a local array to hold some part of a conventional stack frame
1039 and initialize REGNO to point to it. If the size is zero, it'll
1040 never be valid to dereference, so we can simply initialize to
1041 zero. */
1042
1043 static void
1044 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1045 {
1046 if (size)
1047 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1048 align, reg_names[regno], size);
1049 fprintf (file, "\t.reg.u%d %s;\n",
1050 POINTER_SIZE, reg_names[regno]);
1051 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1052 : "\tmov.u%d %s, 0;\n"),
1053 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1054 }
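
/* Sketch of init_frame's output for a 16-byte frame with 8-byte
   alignment, assuming 64-bit pointers and the "%frame" register name
   from nvptx.h:

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   A zero SIZE skips the array and initializes the register to 0
   instead.  */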
1055
1056 /* Emit soft stack frame setup sequence. */
1057
1058 static void
1059 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1060 {
1061 /* Maintain 64-bit stack alignment. */
1062 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1063 size = ROUND_UP (size, keep_align);
1064 int bits = POINTER_SIZE;
1065 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1066 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1067 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1068 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1069 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1070 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1071 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1072 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1073 fprintf (file, "\t{\n");
1074 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1075 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1076 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1077 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1078 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1079 bits == 64 ? ".wide" : ".lo", bits / 8);
1080 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1081
1082 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1083 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1084
1085 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1086 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1087 bits, reg_sspprev, reg_sspslot);
1088
1089 /* Initialize %frame = %sspprev - size. */
1090 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1091 bits, reg_frame, reg_sspprev, size);
1092
1093 /* Apply alignment, if larger than 64. */
1094 if (alignment > keep_align)
1095 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1096 bits, reg_frame, reg_frame, -alignment);
1097
1098 size = crtl->outgoing_args_size;
1099 gcc_assert (size % keep_align == 0);
1100
1101 /* Initialize %stack. */
1102 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1103 bits, reg_stack, reg_frame, size);
1104
1105 if (!crtl->is_leaf)
1106 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1107 bits, reg_sspslot, reg_stack);
1108 fprintf (file, "\t}\n");
1109 cfun->machine->has_softstack = true;
1110 need_softstack_decl = true;
1111 }
1112
1113 /* Emit code to initialize the REGNO predicate register to indicate
1114 whether we are not lane zero on the NAME axis. */
1115
1116 static void
1117 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1118 {
1119 fprintf (file, "\t{\n");
1120 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1121 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1122 {
1123 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1124 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1125 }
1126 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1127 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1128 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1129 {
1130 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1131 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1132 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1133 "// vector reduction buffer\n",
1134 REGNO (cfun->machine->red_partition),
1135 vector_red_partition);
1136 }
1137 /* Verify vector_red_size. */
1138 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1139 <= vector_red_size);
1140 fprintf (file, "\t}\n");
1141 }
1142
1143 /* Emit code to initialize OpenACC worker broadcast and synchronization
1144 registers. */
1145
1146 static void
1147 nvptx_init_oacc_workers (FILE *file)
1148 {
1149 fprintf (file, "\t{\n");
1150 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1151 if (cfun->machine->bcast_partition)
1152 {
1153 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1154 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1155 }
1156 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1157 if (cfun->machine->bcast_partition)
1158 {
1159 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1160 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1161 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1162 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1163 "// vector broadcast offset\n",
1164 REGNO (cfun->machine->bcast_partition),
1165 oacc_bcast_partition);
1166 }
1167 /* Verify oacc_bcast_size. */
1168 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1169 <= oacc_bcast_size);
1170 if (cfun->machine->sync_bar)
1171 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1172 "// vector synchronization barrier\n",
1173 REGNO (cfun->machine->sync_bar));
1174 fprintf (file, "\t}\n");
1175 }
1176
1177 /* Emit code to initialize predicate and master lane index registers for
1178 -muniform-simt code generation variant. */
1179
1180 static void
1181 nvptx_init_unisimt_predicate (FILE *file)
1182 {
1183 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1184 int loc = REGNO (cfun->machine->unisimt_location);
1185 int bits = POINTER_SIZE;
1186 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1187 fprintf (file, "\t{\n");
1188 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1189 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1190 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1191 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1192 bits == 64 ? ".wide" : ".lo");
1193 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1194 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1195 if (cfun->machine->unisimt_predicate)
1196 {
1197 int master = REGNO (cfun->machine->unisimt_master);
1198 int pred = REGNO (cfun->machine->unisimt_predicate);
1199 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1200 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1201 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1202 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1203 /* Compute predicate as 'tid.x == master'. */
1204 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1205 }
1206 fprintf (file, "\t}\n");
1207 need_unisimt_decl = true;
1208 }
1209
1210 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1211
1212 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1213 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1214 {
1215 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1216 __nvptx_uni[tid.y] = 0;
1217 gomp_nvptx_main (ORIG, arg);
1218 }
1219 ORIG itself should not be emitted as a PTX .entry function. */
1220
1221 static void
1222 write_omp_entry (FILE *file, const char *name, const char *orig)
1223 {
1224 static bool gomp_nvptx_main_declared;
1225 if (!gomp_nvptx_main_declared)
1226 {
1227 gomp_nvptx_main_declared = true;
1228 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1229 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1230 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1231 }
1232 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1233 #define NTID_Y "%ntid.y"
1234 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1235 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1236 {\n\
1237 .reg.u32 %r<3>;\n\
1238 .reg.u" PS " %R<4>;\n\
1239 mov.u32 %r0, %tid.y;\n\
1240 mov.u32 %r1, " NTID_Y ";\n\
1241 mov.u32 %r2, %ctaid.x;\n\
1242 cvt.u" PS ".u32 %R1, %r0;\n\
1243 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1244 mov.u" PS " %R0, __nvptx_stacks;\n\
1245 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1246 ld.param.u" PS " %R2, [%stack];\n\
1247 ld.param.u" PS " %R3, [%sz];\n\
1248 add.u" PS " %R2, %R2, %R3;\n\
1249 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1250 st.shared.u" PS " [%R0], %R2;\n\
1251 mov.u" PS " %R0, __nvptx_uni;\n\
1252 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1253 mov.u32 %r0, 0;\n\
1254 st.shared.u32 [%R0], %r0;\n\
1255 mov.u" PS " %R0, \0;\n\
1256 ld.param.u" PS " %R1, [%arg];\n\
1257 {\n\
1258 .param.u" PS " %P<2>;\n\
1259 st.param.u" PS " [%P0], %R0;\n\
1260 st.param.u" PS " [%P1], %R1;\n\
1261 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1262 }\n\
1263 ret.uni;\n\
1264 }\n"
1265 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1266 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1267 #undef ENTRY_TEMPLATE
1268 #undef NTID_Y
1269 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1270 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1271 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1272 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1273 need_softstack_decl = need_unisimt_decl = true;
1274 }
1275
1276 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1277 function, including local var decls and copies from the arguments to
1278 local regs. */
1279
1280 void
1281 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1282 {
1283 tree fntype = TREE_TYPE (decl);
1284 tree result_type = TREE_TYPE (fntype);
1285 int argno = 0;
1286
1287 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1288 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1289 {
1290 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1291 sprintf (buf, "%s$impl", name);
1292 write_omp_entry (file, name, buf);
1293 name = buf;
1294 }
1295 /* We construct the initial part of the function into a string
1296 stream, in order to share the prototype writing code. */
1297 std::stringstream s;
1298 write_fn_proto (s, true, name, decl);
1299 s << "{\n";
1300
1301 bool return_in_mem = write_return_type (s, false, result_type);
1302 if (return_in_mem)
1303 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1304
1305 /* Declare and initialize incoming arguments. */
1306 tree args = TYPE_ARG_TYPES (fntype);
1307 bool prototyped = true;
1308 if (!args)
1309 {
1310 args = DECL_ARGUMENTS (decl);
1311 prototyped = false;
1312 }
1313
1314 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1315 {
1316 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1317
1318 argno = write_arg_type (s, 0, argno, type, prototyped);
1319 }
1320
1321 if (stdarg_p (fntype))
1322 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1323 true);
1324
1325 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1326 write_arg_type (s, STATIC_CHAIN_REGNUM,
1327 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1328 true);
1329
1330 fprintf (file, "%s", s.str().c_str());
1331
1332 /* Usually 'crtl->is_leaf' is computed during register allocator
1333 initialization (which is not done on NVPTX) or for pressure-sensitive
1334 optimizations. Initialize it here, except if already set. */
1335 if (!crtl->is_leaf)
1336 crtl->is_leaf = leaf_function_p ();
1337
1338 HOST_WIDE_INT sz = get_frame_size ();
1339 bool need_frameptr = sz || cfun->machine->has_chain;
1340 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1341 if (!TARGET_SOFT_STACK)
1342 {
1343 /* Declare a local var for outgoing varargs. */
1344 if (cfun->machine->has_varadic)
1345 init_frame (file, STACK_POINTER_REGNUM,
1346 UNITS_PER_WORD, crtl->outgoing_args_size);
1347
1348 /* Declare a local variable for the frame. Force its size to be
1349 DImode-compatible. */
1350 if (need_frameptr)
1351 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1352 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1353 }
1354 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1355 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1356 init_softstack_frame (file, alignment, sz);
1357
1358 if (cfun->machine->has_simtreg)
1359 {
1360 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1361 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1362 align = MAX (align, GET_MODE_SIZE (DImode));
1363 if (!crtl->is_leaf || cfun->calls_alloca)
1364 simtsz = HOST_WIDE_INT_M1U;
1365 if (simtsz == HOST_WIDE_INT_M1U)
1366 simtsz = nvptx_softstack_size;
1367 if (cfun->machine->has_softstack)
1368 simtsz += POINTER_SIZE / 8;
1369 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1370 if (align > GET_MODE_SIZE (DImode))
1371 simtsz += align - GET_MODE_SIZE (DImode);
1372 if (simtsz)
1373 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1374 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1375 }
1376
1377 /* Restore the vector reduction partition register, if necessary.
1378 FIXME: Find out when and why this is necessary, and fix it. */
1379 if (cfun->machine->red_partition)
1380 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1381 = cfun->machine->red_partition;
1382
1383 /* Declare the pseudos we have as ptx registers. */
1384 int maxregs = max_reg_num ();
1385 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1386 {
1387 if (regno_reg_rtx[i] != const0_rtx)
1388 {
1389 machine_mode mode = PSEUDO_REGNO_MODE (i);
1390 machine_mode split = maybe_split_mode (mode);
1391
1392 if (split_mode_p (mode))
1393 mode = split;
1394 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1395 output_reg (file, i, split, -2);
1396 fprintf (file, ";\n");
1397 }
1398 }
1399
1400 /* Emit axis predicates. */
1401 if (cfun->machine->axis_predicate[0])
1402 nvptx_init_axis_predicate (file,
1403 REGNO (cfun->machine->axis_predicate[0]), "y");
1404 if (cfun->machine->axis_predicate[1])
1405 nvptx_init_axis_predicate (file,
1406 REGNO (cfun->machine->axis_predicate[1]), "x");
1407 if (cfun->machine->unisimt_predicate
1408 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1409 nvptx_init_unisimt_predicate (file);
1410 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1411 nvptx_init_oacc_workers (file);
1412 }
1413
1414 /* Output code for switching uniform-simt state. ENTERING indicates whether
1415 we are entering or leaving a non-uniform execution region. */
1416
1417 static void
1418 nvptx_output_unisimt_switch (FILE *file, bool entering)
1419 {
1420 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1421 return;
1422 fprintf (file, "\t{\n");
1423 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1424 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1425 if (!crtl->is_leaf)
1426 {
1427 int loc = REGNO (cfun->machine->unisimt_location);
1428 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1429 }
1430 if (cfun->machine->unisimt_predicate)
1431 {
1432 int master = REGNO (cfun->machine->unisimt_master);
1433 int pred = REGNO (cfun->machine->unisimt_predicate);
1434 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1435 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1436 master, entering ? "%ustmp2" : "0");
1437 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1438 }
1439 fprintf (file, "\t}\n");
1440 }
1441
1442 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1443 ENTERING indicates whether we are entering or leaving non-uniform execution.
1444 PTR is the register pointing to allocated storage, it is assigned to on
1445 entering and used to restore state on leaving. SIZE and ALIGN are used only
1446 on entering. */
1447
1448 static void
1449 nvptx_output_softstack_switch (FILE *file, bool entering,
1450 rtx ptr, rtx size, rtx align)
1451 {
1452 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1453 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1454 return;
1455 int bits = POINTER_SIZE, regno = REGNO (ptr);
1456 fprintf (file, "\t{\n");
1457 if (entering)
1458 {
1459 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1460 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1461 cfun->machine->simt_stack_size);
1462 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1463 if (CONST_INT_P (size))
1464 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1465 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1466 else
1467 output_reg (file, REGNO (size), VOIDmode);
1468 fputs (";\n", file);
1469 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1470 fprintf (file,
1471 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1472 bits, regno, regno, UINTVAL (align));
1473 }
1474 if (cfun->machine->has_softstack)
1475 {
1476 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1477 if (entering)
1478 {
1479 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1480 bits, regno, bits / 8, reg_stack);
1481 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1482 bits, reg_stack, regno, bits / 8);
1483 }
1484 else
1485 {
1486 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1487 bits, reg_stack, regno, bits / 8);
1488 }
1489 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1490 }
1491 fprintf (file, "\t}\n");
1492 }
1493
1494 /* Output code to enter non-uniform execution region. DEST is a register
1495 to hold a per-lane allocation given by SIZE and ALIGN. */
1496
1497 const char *
1498 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1499 {
1500 nvptx_output_unisimt_switch (asm_out_file, true);
1501 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1502 return "";
1503 }
1504
1505 /* Output code to leave non-uniform execution region. SRC is the register
1506 holding per-lane storage previously allocated by omp_simt_enter insn. */
1507
1508 const char *
1509 nvptx_output_simt_exit (rtx src)
1510 {
1511 nvptx_output_unisimt_switch (asm_out_file, false);
1512 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1513 return "";
1514 }
1515
1516 /* Output instruction that sets soft stack pointer in shared memory to the
1517 value in register given by SRC_REGNO. */
1518
1519 const char *
1520 nvptx_output_set_softstack (unsigned src_regno)
1521 {
1522 if (cfun->machine->has_softstack && !crtl->is_leaf)
1523 {
1524 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1525 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1526 output_reg (asm_out_file, src_regno, VOIDmode);
1527 fprintf (asm_out_file, ";\n");
1528 }
1529 return "";
1530 }
1531 /* Output a return instruction. Also copy the return value to its outgoing
1532 location. */
1533
1534 const char *
1535 nvptx_output_return (void)
1536 {
1537 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1538
1539 if (mode != VOIDmode)
1540 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1541 nvptx_ptx_type_from_mode (mode, false),
1542 reg_names[NVPTX_RETURN_REGNUM],
1543 reg_names[NVPTX_RETURN_REGNUM]);
1544
1545 return "ret;";
1546 }
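
/* For instance, a function returning an SImode value ends with
   something like

     st.param.u32 [%value_out], %value;
     ret;

   (illustrative; "%value"/"%value_out" again assume the register names
   from nvptx.h).  */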
1547
1548 /* Terminate a function by writing a closing brace to FILE. */
1549
1550 void
1551 nvptx_function_end (FILE *file)
1552 {
1553 fprintf (file, "}\n");
1554 }
1555 \f
1556 /* Decide whether we can make a sibling call to a function. For ptx, we
1557 can't. */
1558
1559 static bool
1560 nvptx_function_ok_for_sibcall (tree, tree)
1561 {
1562 return false;
1563 }
1564
1565 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1566
1567 static rtx
1568 nvptx_get_drap_rtx (void)
1569 {
1570 if (TARGET_SOFT_STACK && stack_realign_drap)
1571 return arg_pointer_rtx;
1572 return NULL_RTX;
1573 }
1574
1575 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1576 argument to the next call. */
1577
1578 static void
1579 nvptx_call_args (rtx arg, tree fntype)
1580 {
1581 if (!cfun->machine->doing_call)
1582 {
1583 cfun->machine->doing_call = true;
1584 cfun->machine->is_varadic = false;
1585 cfun->machine->num_args = 0;
1586
1587 if (fntype && stdarg_p (fntype))
1588 {
1589 cfun->machine->is_varadic = true;
1590 cfun->machine->has_varadic = true;
1591 cfun->machine->num_args++;
1592 }
1593 }
1594
1595 if (REG_P (arg) && arg != pc_rtx)
1596 {
1597 cfun->machine->num_args++;
1598 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1599 cfun->machine->call_args);
1600 }
1601 }
1602
1603 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1604 information we recorded. */
1605
1606 static void
1607 nvptx_end_call_args (void)
1608 {
1609 cfun->machine->doing_call = false;
1610 free_EXPR_LIST_list (&cfun->machine->call_args);
1611 }
1612
1613 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1614 track of whether calls involving static chains or varargs were seen
1615 in the current function.
1616 For libcalls, maintain a hash table of decls we have seen, and
1617 record a function decl for later when encountering a new one. */
1618
1619 void
1620 nvptx_expand_call (rtx retval, rtx address)
1621 {
1622 rtx callee = XEXP (address, 0);
1623 rtx varargs = NULL_RTX;
1624 unsigned parallel = 0;
1625
1626 if (!call_insn_operand (callee, Pmode))
1627 {
1628 callee = force_reg (Pmode, callee);
1629 address = change_address (address, QImode, callee);
1630 }
1631
1632 if (GET_CODE (callee) == SYMBOL_REF)
1633 {
1634 tree decl = SYMBOL_REF_DECL (callee);
1635 if (decl != NULL_TREE)
1636 {
1637 if (DECL_STATIC_CHAIN (decl))
1638 cfun->machine->has_chain = true;
1639
1640 tree attr = oacc_get_fn_attrib (decl);
1641 if (attr)
1642 {
1643 tree dims = TREE_VALUE (attr);
1644
1645 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1646 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1647 {
1648 if (TREE_PURPOSE (dims)
1649 && !integer_zerop (TREE_PURPOSE (dims)))
1650 break;
1651 /* Not on this axis. */
1652 parallel ^= GOMP_DIM_MASK (ix);
1653 dims = TREE_CHAIN (dims);
1654 }
1655 }
1656 }
1657 }
1658
1659 unsigned nargs = cfun->machine->num_args;
1660 if (cfun->machine->is_varadic)
1661 {
1662 varargs = gen_reg_rtx (Pmode);
1663 emit_move_insn (varargs, stack_pointer_rtx);
1664 }
1665
1666 rtvec vec = rtvec_alloc (nargs + 1);
1667 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1668 int vec_pos = 0;
1669
1670 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1671 rtx tmp_retval = retval;
1672 if (retval)
1673 {
1674 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1675 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1676 call = gen_rtx_SET (tmp_retval, call);
1677 }
1678 XVECEXP (pat, 0, vec_pos++) = call;
1679
1680 /* Construct the call insn, including a USE for each argument pseudo
1681 register. These will be used when printing the insn. */
1682 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1683 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1684
1685 if (varargs)
1686 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1687
1688 gcc_assert (vec_pos == XVECLEN (pat, 0));
1689
1690 nvptx_emit_forking (parallel, true);
1691 emit_call_insn (pat);
1692 nvptx_emit_joining (parallel, true);
1693
1694 if (tmp_retval != retval)
1695 emit_move_insn (retval, tmp_retval);
1696 }
1697
1698 /* Emit a comparison COMPARE, and return the new test to be used in the
1699 jump. */
1700
1701 rtx
1702 nvptx_expand_compare (rtx compare)
1703 {
1704 rtx pred = gen_reg_rtx (BImode);
1705 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1706 XEXP (compare, 0), XEXP (compare, 1));
1707 emit_insn (gen_rtx_SET (pred, cmp));
1708 return gen_rtx_NE (BImode, pred, const0_rtx);
1709 }
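
/* For example, a (gt:SI a b) comparison feeding a conditional jump is
   rewritten here into roughly

     (set pred (gt:BI a b))        ;; materialize the predicate
     ... branch on (ne:BI pred 0)  ;; returned test

   so the branch itself only ever tests a BImode predicate against zero
   (an informal sketch of the RTL produced above).  */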
1710
1711 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1712
1713 void
1714 nvptx_expand_oacc_fork (unsigned mode)
1715 {
1716 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1717 }
1718
1719 void
1720 nvptx_expand_oacc_join (unsigned mode)
1721 {
1722 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1723 }
1724
1725 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1726 objects. */
1727
1728 static rtx
1729 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1730 {
1731 rtx res;
1732
1733 switch (GET_MODE (src))
1734 {
1735 case E_DImode:
1736 res = gen_unpackdisi2 (dst0, dst1, src);
1737 break;
1738 case E_DFmode:
1739 res = gen_unpackdfsi2 (dst0, dst1, src);
1740 break;
1741 default: gcc_unreachable ();
1742 }
1743 return res;
1744 }
1745
1746 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1747 object. */
1748
1749 static rtx
1750 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1751 {
1752 rtx res;
1753
1754 switch (GET_MODE (dst))
1755 {
1756 case E_DImode:
1757 res = gen_packsidi2 (dst, src0, src1);
1758 break;
1759 case E_DFmode:
1760 res = gen_packsidf2 (dst, src0, src1);
1761 break;
1762 default: gcc_unreachable ();
1763 }
1764 return res;
1765 }
1766
1767 /* Generate an instruction or sequence to shuffle SRC into DST across
1768 the lanes of a single warp, according to index IDX and shuffle kind KIND. */
1769
1770 rtx
1771 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1772 {
1773 rtx res;
1774
1775 switch (GET_MODE (dst))
1776 {
1777 case E_SImode:
1778 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1779 break;
1780 case E_SFmode:
1781 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1782 break;
1783 case E_DImode:
1784 case E_DFmode:
1785 {
1786 rtx tmp0 = gen_reg_rtx (SImode);
1787 rtx tmp1 = gen_reg_rtx (SImode);
1788
1789 start_sequence ();
1790 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1791 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1792 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1793 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1794 res = get_insns ();
1795 end_sequence ();
1796 }
1797 break;
1798 case E_BImode:
1799 {
1800 rtx tmp = gen_reg_rtx (SImode);
1801
1802 start_sequence ();
1803 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1804 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1805 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1806 res = get_insns ();
1807 end_sequence ();
1808 }
1809 break;
1810 case E_QImode:
1811 case E_HImode:
1812 {
1813 rtx tmp = gen_reg_rtx (SImode);
1814
1815 start_sequence ();
1816 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1817 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1818 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1819 tmp)));
1820 res = get_insns ();
1821 end_sequence ();
1822 }
1823 break;
1824
1825 default:
1826 gcc_unreachable ();
1827 }
1828 return res;
1829 }
1830
1831 /* Generate an instruction or sequence to broadcast register REG
1832 across the vectors of a single warp. */
1833
1834 static rtx
1835 nvptx_gen_warp_bcast (rtx reg)
1836 {
1837 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1838 }
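
/* Illustrative sketch only: broadcasting an SImode register is an .idx
   shuffle reading lane 0, so the eventual PTX looks roughly like

     shfl.idx.b32 %r23, %r23, 0, 31;

   where the register number, and whether a .sync variant of the shuffle is
   used, depend on register allocation and the target ISA.  */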
1839
1840 /* Structure used when generating a worker-level spill or fill. */
1841
1842 struct broadcast_data_t
1843 {
1844 rtx base; /* Register holding base addr of buffer. */
1845 rtx ptr; /* Iteration var, if needed. */
1846 unsigned offset; /* Offset into worker buffer. */
1847 };
1848
1849 /* Direction of the spill/fill and looping setup/teardown indicator. */
1850
1851 enum propagate_mask
1852 {
1853 PM_read = 1 << 0,
1854 PM_write = 1 << 1,
1855 PM_loop_begin = 1 << 2,
1856 PM_loop_end = 1 << 3,
1857
1858 PM_read_write = PM_read | PM_write
1859 };
1860
1861 /* Generate instruction(s) to spill or fill register REG to/from the
1862 worker broadcast array. PM indicates what is to be done, REP
1863 how many loop iterations will be executed (0 for not a loop). */
1864
1865 static rtx
1866 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1867 broadcast_data_t *data, bool vector)
1868 {
1869 rtx res;
1870 machine_mode mode = GET_MODE (reg);
1871
1872 switch (mode)
1873 {
1874 case E_BImode:
1875 {
1876 rtx tmp = gen_reg_rtx (SImode);
1877
1878 start_sequence ();
1879 if (pm & PM_read)
1880 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1881 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1882 if (pm & PM_write)
1883 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1884 res = get_insns ();
1885 end_sequence ();
1886 }
1887 break;
1888
1889 default:
1890 {
1891 rtx addr = data->ptr;
1892
1893 if (!addr)
1894 {
1895 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1896
1897 oacc_bcast_align = MAX (oacc_bcast_align, align);
1898 data->offset = ROUND_UP (data->offset, align);
1899 addr = data->base;
1900 gcc_assert (data->base != NULL);
1901 if (data->offset)
1902 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1903 }
1904
1905 addr = gen_rtx_MEM (mode, addr);
1906 if (pm == PM_read)
1907 res = gen_rtx_SET (addr, reg);
1908 else if (pm == PM_write)
1909 res = gen_rtx_SET (reg, addr);
1910 else
1911 gcc_unreachable ();
1912
1913 if (data->ptr)
1914 {
1915 /* We're using a ptr, increment it. */
1916 start_sequence ();
1917
1918 emit_insn (res);
1919 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1920 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1921 res = get_insns ();
1922 end_sequence ();
1923 }
1924 else
1925 rep = 1;
1926 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1927 }
1928 break;
1929 }
1930 return res;
1931 }
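
/* In the common (non-BImode) case above, a spill is thus a single store of
   REG through the buffer base plus the naturally-aligned offset, and the
   matching fill is the corresponding load; when a pointer register is used
   for a loop, it is bumped by the register's size after each access.  (A
   restatement of the code, for orientation.)  */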
1932 \f
1933 /* Returns true if X is a valid address for use in a memory reference. */
1934
1935 static bool
1936 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1937 {
1938 enum rtx_code code = GET_CODE (x);
1939
1940 switch (code)
1941 {
1942 case REG:
1943 return true;
1944
1945 case PLUS:
1946 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1947 return true;
1948 return false;
1949
1950 case CONST:
1951 case SYMBOL_REF:
1952 case LABEL_REF:
1953 return true;
1954
1955 default:
1956 return false;
1957 }
1958 }
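
/* For illustration, and assuming the usual PTX addressing syntax, the forms
   accepted above correspond to memory operands such as

     [%r1]       REG
     [%r1+16]    PLUS (REG, CONST_INT)
     [sym]       SYMBOL_REF / LABEL_REF / CONST

   with 'sym' standing in for any symbol name; anything more complex has to
   be legitimized into one of these shapes first.  */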
1959 \f
1960 /* Machinery to output constant initializers. When beginning an
1961 initializer, we decide on a fragment size (which is visible in ptx
1962 in the type used), and then all initializer data is buffered until
1963 a fragment is filled and ready to be written out. */
1964
1965 static struct
1966 {
1967 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1968 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1969 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1970 out. */
1971 unsigned size; /* Fragment size to accumulate. */
1972 unsigned offset; /* Offset within current fragment. */
1973 bool started; /* Whether we've output any initializer. */
1974 } init_frag;
1975
1976 /* The current fragment is full, write it out. SYM may provide a
1977 symbolic reference we should output, in which case the fragment
1978 value is the addend. */
1979
1980 static void
1981 output_init_frag (rtx sym)
1982 {
1983 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1984 unsigned HOST_WIDE_INT val = init_frag.val;
1985
1986 init_frag.started = true;
1987 init_frag.val = 0;
1988 init_frag.offset = 0;
1989 init_frag.remaining--;
1990
1991 if (sym)
1992 {
1993 bool function = (SYMBOL_REF_DECL (sym)
1994 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1995 if (!function)
1996 fprintf (asm_out_file, "generic(");
1997 output_address (VOIDmode, sym);
1998 if (!function)
1999 fprintf (asm_out_file, ")");
2000 if (val)
2001 fprintf (asm_out_file, " + ");
2002 }
2003
2004 if (!sym || val)
2005 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2006 }
2007
2008 /* Add value VAL of size SIZE to the data we're emitting, and keep
2009 writing out chunks as they fill up. */
2010
2011 static void
2012 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2013 {
2014 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2015
2016 for (unsigned part = 0; size; size -= part)
2017 {
2018 val >>= part * BITS_PER_UNIT;
2019 part = init_frag.size - init_frag.offset;
2020 part = MIN (part, size);
2021
2022 unsigned HOST_WIDE_INT partial
2023 = val << (init_frag.offset * BITS_PER_UNIT);
2024 init_frag.val |= partial & init_frag.mask;
2025 init_frag.offset += part;
2026
2027 if (init_frag.offset == init_frag.size)
2028 output_init_frag (NULL);
2029 }
2030 }
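
/* Worked example (illustrative only): for SIZE == 2 the mask above is
   ((unsigned HOST_WIDE_INT)2 << 15) - 1 == 0xffff, i.e. SIZE * 8 one bits,
   written with a leading '2' so the shift stays defined even when SIZE is
   the full width of HOST_WIDE_INT.  If the current 4-byte fragment is half
   full (offset == 2), those two bytes are shifted into its upper half and
   the completed fragment is flushed by output_init_frag.  */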
2031
2032 /* Target hook for assembling integer object X of size SIZE. */
2033
2034 static bool
2035 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2036 {
2037 HOST_WIDE_INT val = 0;
2038
2039 switch (GET_CODE (x))
2040 {
2041 default:
2042 /* Let the generic machinery figure it out, usually for a
2043 CONST_WIDE_INT. */
2044 return false;
2045
2046 case CONST_INT:
2047 nvptx_assemble_value (INTVAL (x), size);
2048 break;
2049
2050 case CONST:
2051 x = XEXP (x, 0);
2052 gcc_assert (GET_CODE (x) == PLUS);
2053 val = INTVAL (XEXP (x, 1));
2054 x = XEXP (x, 0);
2055 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2056 /* FALLTHROUGH */
2057
2058 case SYMBOL_REF:
2059 gcc_assert (size == init_frag.size);
2060 if (init_frag.offset)
2061 sorry ("cannot emit unaligned pointers in ptx assembly");
2062
2063 nvptx_maybe_record_fnsym (x);
2064 init_frag.val = val;
2065 output_init_frag (x);
2066 break;
2067 }
2068
2069 return true;
2070 }
2071
2072 /* Output SIZE zero bytes. We ignore the FILE argument since the
2073 functions we're calling to perform the output just use
2074 asm_out_file. */
2075
2076 void
2077 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2078 {
2079 /* Finish the current fragment, if it's started. */
2080 if (init_frag.offset)
2081 {
2082 unsigned part = init_frag.size - init_frag.offset;
2083 part = MIN (part, (unsigned)size);
2084 size -= part;
2085 nvptx_assemble_value (0, part);
2086 }
2087
2088 /* If this skip doesn't terminate the initializer, write as many
2089 remaining pieces as possible directly. */
2090 if (size < init_frag.remaining * init_frag.size)
2091 {
2092 while (size >= init_frag.size)
2093 {
2094 size -= init_frag.size;
2095 output_init_frag (NULL_RTX);
2096 }
2097 if (size)
2098 nvptx_assemble_value (0, size);
2099 }
2100 }
2101
2102 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2103 ignore the FILE arg. */
2104
2105 void
2106 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2107 {
2108 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2109 nvptx_assemble_value (str[i], 1);
2110 }
2111
2112 /* Return true if TYPE is a record type where the last field is an array without
2113 a given dimension. */
2114
2115 static bool
2116 flexible_array_member_type_p (const_tree type)
2117 {
2118 if (TREE_CODE (type) != RECORD_TYPE)
2119 return false;
2120
2121 const_tree last_field = NULL_TREE;
2122 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2123 last_field = f;
2124
2125 if (!last_field)
2126 return false;
2127
2128 const_tree last_field_type = TREE_TYPE (last_field);
2129 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2130 return false;
2131
2132 return (! TYPE_DOMAIN (last_field_type)
2133 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2134 }
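
/* For example, the check above accepts a record ending in a C99 flexible
   array member, as in

     struct message { int len; char data[]; };

   where the array type of 'data' carries no upper bound (the TYPE_DOMAIN /
   TYPE_MAX_VALUE test), whereas a record whose last field is a fixed-size
   array is rejected.  */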
2135
2136 /* Emit a PTX variable decl and prepare for emission of its
2137 initializer. NAME is the symbol name and SECTION the PTX data
2138 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2139 The caller has already emitted any indentation and linkage
2140 specifier. It is responsible for any initializer, terminating ;
2141 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly,
2142 this is the opposite way round from how PTX wants them! */
2143
2144 static void
2145 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2146 const_tree type, HOST_WIDE_INT size, unsigned align,
2147 bool undefined = false)
2148 {
2149 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2150 && (TYPE_DOMAIN (type) == NULL_TREE);
2151
2152 if (undefined && flexible_array_member_type_p (type))
2153 {
2154 size = 0;
2155 atype = true;
2156 }
2157
2158 while (TREE_CODE (type) == ARRAY_TYPE)
2159 type = TREE_TYPE (type);
2160
2161 if (TREE_CODE (type) == VECTOR_TYPE
2162 || TREE_CODE (type) == COMPLEX_TYPE)
2163 /* Neither vector nor complex types can contain the other. */
2164 type = TREE_TYPE (type);
2165
2166 unsigned elt_size = int_size_in_bytes (type);
2167
2168 /* Largest mode we're prepared to accept. For BLKmode types we
2169 don't know if it'll contain pointer constants, so we have to choose
2170 pointer size; otherwise we can choose DImode. */
2171 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2172
2173 elt_size |= GET_MODE_SIZE (elt_mode);
2174 elt_size &= -elt_size; /* Extract LSB set. */
2175
2176 init_frag.size = elt_size;
2177 /* Avoid undefined shift behavior by using '2'. */
2178 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2179 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2180 init_frag.val = 0;
2181 init_frag.offset = 0;
2182 init_frag.started = false;
2183 /* Size might not be a multiple of elt size, if there's an
2184 initialized trailing struct array with smaller type than
2185 elt_size. */
2186 init_frag.remaining = (size + elt_size - 1) / elt_size;
2187
2188 fprintf (file, "%s .align %d .u%d ",
2189 section, align / BITS_PER_UNIT,
2190 elt_size * BITS_PER_UNIT);
2191 assemble_name (file, name);
2192
2193 if (size)
2194 /* We make everything an array, to simplify any initialization
2195 emission. */
2196 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2197 else if (atype)
2198 fprintf (file, "[]");
2199 }
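
/* As a rough illustration -- the exact output depends on the data area,
   element type and initializer -- a file-scope 'int x[4]' placed in the
   global area comes out as something like

     .global .align 4 .u32 x[4]

   with the caller then appending any initializer, the terminating ';' and
   the newline, as described above.  */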
2200
2201 /* Called when the initializer for a decl has been completely output through
2202 combinations of the three functions above. */
2203
2204 static void
2205 nvptx_assemble_decl_end (void)
2206 {
2207 if (init_frag.offset)
2208 /* This can happen with a packed struct with a trailing array member. */
2209 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2210 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2211 }
2212
2213 /* Output an uninitialized common or file-scope variable. */
2214
2215 void
2216 nvptx_output_aligned_decl (FILE *file, const char *name,
2217 const_tree decl, HOST_WIDE_INT size, unsigned align)
2218 {
2219 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2220
2221 /* If this is public, it is common. The nearest thing we have to
2222 common is weak. */
2223 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2224
2225 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2226 TREE_TYPE (decl), size, align);
2227 nvptx_assemble_decl_end ();
2228 }
2229
2230 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2231 writing a constant variable EXP with NAME and SIZE and its
2232 initializer to FILE. */
2233
2234 static void
2235 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2236 const_tree exp, HOST_WIDE_INT obj_size)
2237 {
2238 write_var_marker (file, true, false, name);
2239
2240 fprintf (file, "\t");
2241
2242 tree type = TREE_TYPE (exp);
2243 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2244 TYPE_ALIGN (type));
2245 }
2246
2247 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2248 a variable DECL with NAME to FILE. */
2249
2250 void
2251 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2252 {
2253 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2254
2255 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2256 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2257
2258 tree type = TREE_TYPE (decl);
2259 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2260 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2261 type, obj_size, DECL_ALIGN (decl));
2262 }
2263
2264 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2265
2266 static void
2267 nvptx_globalize_label (FILE *, const char *)
2268 {
2269 }
2270
2271 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2272 declaration only for variable DECL with NAME to FILE. */
2273
2274 static void
2275 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2276 {
2277 /* The middle end can place constant pool decls into the varpool as
2278 undefined. Until that is fixed, catch the problem here. */
2279 if (DECL_IN_CONSTANT_POOL (decl))
2280 return;
2281
2282 /* We support weak definitions, and hence have the right
2283 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2284 if (DECL_WEAK (decl))
2285 error_at (DECL_SOURCE_LOCATION (decl),
2286 "PTX does not support weak declarations"
2287 " (only weak definitions)");
2288 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2289
2290 fprintf (file, "\t.extern ");
2291 tree size = DECL_SIZE_UNIT (decl);
2292 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2293 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2294 DECL_ALIGN (decl), true);
2295 nvptx_assemble_decl_end ();
2296 }
2297
2298 /* Output a pattern for a move instruction. */
2299
2300 const char *
2301 nvptx_output_mov_insn (rtx dst, rtx src)
2302 {
2303 machine_mode dst_mode = GET_MODE (dst);
2304 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2305 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2306 machine_mode src_inner = (GET_CODE (src) == SUBREG
2307 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2308
2309 rtx sym = src;
2310 if (GET_CODE (sym) == CONST)
2311 sym = XEXP (XEXP (sym, 0), 0);
2312 if (SYMBOL_REF_P (sym))
2313 {
2314 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2315 return "%.\tcvta%D1%t0\t%0, %1;";
2316 nvptx_maybe_record_fnsym (sym);
2317 }
2318
2319 if (src_inner == dst_inner)
2320 return "%.\tmov%t0\t%0, %1;";
2321
2322 if (CONSTANT_P (src))
2323 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2324 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2325 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2326
2327 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2328 {
2329 if (GET_MODE_BITSIZE (dst_mode) == 128
2330 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2331 {
2332 /* mov.b128 is not supported. */
2333 if (dst_inner == V2DImode && src_inner == TImode)
2334 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2335 else if (dst_inner == TImode && src_inner == V2DImode)
2336 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2337
2338 gcc_unreachable ();
2339 }
2340 return "%.\tmov.b%T0\t%0, %1;";
2341 }
2342
2343 return "%.\tcvt%t0%t1\t%0, %1;";
2344 }
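
/* Purely illustrative expansions of the templates above (register numbers,
   type suffixes and the symbol name are made up):

     mov.u32          %r24, %r25;          same inner mode
     mov.b64          %r26, %r27;          equal size, different class
     cvt.u64.u32      %r28, %r29;          size-changing move
     cvta.shared.u64  %r30, shared_sym;    converting to a generic address

   The '%.' at the start of each template prints the instruction predicate,
   if any.  */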
2345
2346 static void nvptx_print_operand (FILE *, rtx, int);
2347
2348 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2349 involves writing .param declarations and in/out copies into them. For
2350 indirect calls, also write the .callprototype. */
2351
2352 const char *
2353 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2354 {
2355 char buf[16];
2356 static int labelno;
2357 bool needs_tgt = register_operand (callee, Pmode);
2358 rtx pat = PATTERN (insn);
2359 if (GET_CODE (pat) == COND_EXEC)
2360 pat = COND_EXEC_CODE (pat);
2361 int arg_end = XVECLEN (pat, 0);
2362 tree decl = NULL_TREE;
2363
2364 fprintf (asm_out_file, "\t{\n");
2365 if (result != NULL)
2366 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2367 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2368 reg_names[NVPTX_RETURN_REGNUM]);
2369
2370 /* Ensure we have a ptx declaration in the output if necessary. */
2371 if (GET_CODE (callee) == SYMBOL_REF)
2372 {
2373 decl = SYMBOL_REF_DECL (callee);
2374 if (!decl
2375 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2376 nvptx_record_libfunc (callee, result, pat);
2377 else if (DECL_EXTERNAL (decl))
2378 nvptx_record_fndecl (decl);
2379 }
2380
2381 if (needs_tgt)
2382 {
2383 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2384 labelno++;
2385 ASM_OUTPUT_LABEL (asm_out_file, buf);
2386 std::stringstream s;
2387 write_fn_proto_from_insn (s, NULL, result, pat);
2388 fputs (s.str().c_str(), asm_out_file);
2389 }
2390
2391 for (int argno = 1; argno < arg_end; argno++)
2392 {
2393 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2394 machine_mode mode = GET_MODE (t);
2395 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2396
2397 /* Mode splitting has already been done. */
2398 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2399 "\t\tst.param%s [%%out_arg%d], ",
2400 ptx_type, argno, ptx_type, argno);
2401 output_reg (asm_out_file, REGNO (t), VOIDmode);
2402 fprintf (asm_out_file, ";\n");
2403 }
2404
2405 /* The '.' stands for the call's predicate, if any. */
2406 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2407 fprintf (asm_out_file, "\t\tcall ");
2408 if (result != NULL_RTX)
2409 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2410
2411 if (decl)
2412 {
2413 const char *name = get_fnname_from_decl (decl);
2414 name = nvptx_name_replacement (name);
2415 assemble_name (asm_out_file, name);
2416 }
2417 else
2418 output_address (VOIDmode, callee);
2419
2420 const char *open = "(";
2421 for (int argno = 1; argno < arg_end; argno++)
2422 {
2423 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2424 open = "";
2425 }
2426 if (decl && DECL_STATIC_CHAIN (decl))
2427 {
2428 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2429 open = "";
2430 }
2431 if (!open[0])
2432 fprintf (asm_out_file, ")");
2433
2434 if (needs_tgt)
2435 {
2436 fprintf (asm_out_file, ", ");
2437 assemble_name (asm_out_file, buf);
2438 }
2439 fprintf (asm_out_file, ";\n");
2440
2441 if (find_reg_note (insn, REG_NORETURN, NULL))
2442 {
2443 /* No-return functions confuse the PTX JIT, as it doesn't realize
2444 the flow control barrier they imply. It can seg fault if it
2445 encounters what looks like an unexitable loop. Emit a trailing
2446 trap and exit, which it does grok. */
2447 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2448 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2449 }
2450
2451 if (result)
2452 {
2453 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2454
2455 if (!rval[0])
2456 /* We must escape the '%' that starts RETURN_REGNUM. */
2457 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2458 reg_names[NVPTX_RETURN_REGNUM]);
2459 return rval;
2460 }
2461
2462 return "}";
2463 }
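
/* A hedged sketch of the output for a direct call with one argument and a
   return value; the callee name and register numbers are invented, and the
   return parameter is named after NVPTX_RETURN_REGNUM:

     {
       .param.u32 %value_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r23;
       call (%value_in), foo, (%out_arg1);
       ld.param.u32 %r24, [%value_in];
     }

   Indirect calls additionally emit a .callprototype label, which is passed
   as the final operand of the call.  */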
2464
2465 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2466
2467 static bool
2468 nvptx_print_operand_punct_valid_p (unsigned char c)
2469 {
2470 return c == '.' || c == '#';
2471 }
2472
2473 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2474
2475 static void
2476 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2477 {
2478 rtx off;
2479 if (GET_CODE (x) == CONST)
2480 x = XEXP (x, 0);
2481 switch (GET_CODE (x))
2482 {
2483 case PLUS:
2484 off = XEXP (x, 1);
2485 output_address (VOIDmode, XEXP (x, 0));
2486 fprintf (file, "+");
2487 output_address (VOIDmode, off);
2488 break;
2489
2490 case SYMBOL_REF:
2491 case LABEL_REF:
2492 output_addr_const (file, x);
2493 break;
2494
2495 default:
2496 gcc_assert (GET_CODE (x) != MEM);
2497 nvptx_print_operand (file, x, 0);
2498 break;
2499 }
2500 }
2501
2502 /* Write assembly language output for the address ADDR to FILE. */
2503
2504 static void
2505 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2506 {
2507 nvptx_print_address_operand (file, addr, mode);
2508 }
2509
2510 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2511
2512 Meaning of CODE:
2513 . -- print the predicate for the instruction or an empty string for an
2514 unconditional one.
2515 # -- print a rounding mode for the instruction
2516
2517 A -- print a data area for a MEM
2518 c -- print an opcode suffix for a comparison operator, including a type code
2519 D -- print a data area for a MEM operand
2520 S -- print a shuffle kind specified by CONST_INT
2521 t -- print a type opcode suffix, promoting QImode to 32 bits
2522 T -- print a type size in bits
2523 u -- print a type opcode suffix without promotions. */
2524
2525 static void
2526 nvptx_print_operand (FILE *file, rtx x, int code)
2527 {
2528 if (code == '.')
2529 {
2530 x = current_insn_predicate;
2531 if (x)
2532 {
2533 fputs ("@", file);
2534 if (GET_CODE (x) == EQ)
2535 fputs ("!", file);
2536 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2537 }
2538 return;
2539 }
2540 else if (code == '#')
2541 {
2542 fputs (".rn", file);
2543 return;
2544 }
2545
2546 enum rtx_code x_code = GET_CODE (x);
2547 machine_mode mode = GET_MODE (x);
2548
2549 switch (code)
2550 {
2551 case 'A':
2552 x = XEXP (x, 0);
2553 /* FALLTHROUGH. */
2554
2555 case 'D':
2556 if (GET_CODE (x) == CONST)
2557 x = XEXP (x, 0);
2558 if (GET_CODE (x) == PLUS)
2559 x = XEXP (x, 0);
2560
2561 if (GET_CODE (x) == SYMBOL_REF)
2562 fputs (section_for_sym (x), file);
2563 break;
2564
2565 case 't':
2566 case 'u':
2567 if (x_code == SUBREG)
2568 {
2569 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2570 if (VECTOR_MODE_P (inner_mode)
2571 && (GET_MODE_SIZE (mode)
2572 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2573 mode = GET_MODE_INNER (inner_mode);
2574 else if (split_mode_p (inner_mode))
2575 mode = maybe_split_mode (inner_mode);
2576 else
2577 mode = inner_mode;
2578 }
2579 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2580 break;
2581
2582 case 'H':
2583 case 'L':
2584 {
2585 rtx inner_x = SUBREG_REG (x);
2586 machine_mode inner_mode = GET_MODE (inner_x);
2587 machine_mode split = maybe_split_mode (inner_mode);
2588
2589 output_reg (file, REGNO (inner_x), split,
2590 (code == 'H'
2591 ? GET_MODE_SIZE (inner_mode) / 2
2592 : 0));
2593 }
2594 break;
2595
2596 case 'S':
2597 {
2598 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2599 /* Same order as nvptx_shuffle_kind. */
2600 static const char *const kinds[] =
2601 {".up", ".down", ".bfly", ".idx"};
2602 fputs (kinds[kind], file);
2603 }
2604 break;
2605
2606 case 'T':
2607 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2608 break;
2609
2610 case 'j':
2611 fprintf (file, "@");
2612 goto common;
2613
2614 case 'J':
2615 fprintf (file, "@!");
2616 goto common;
2617
2618 case 'c':
2619 mode = GET_MODE (XEXP (x, 0));
2620 switch (x_code)
2621 {
2622 case EQ:
2623 fputs (".eq", file);
2624 break;
2625 case NE:
2626 if (FLOAT_MODE_P (mode))
2627 fputs (".neu", file);
2628 else
2629 fputs (".ne", file);
2630 break;
2631 case LE:
2632 case LEU:
2633 fputs (".le", file);
2634 break;
2635 case GE:
2636 case GEU:
2637 fputs (".ge", file);
2638 break;
2639 case LT:
2640 case LTU:
2641 fputs (".lt", file);
2642 break;
2643 case GT:
2644 case GTU:
2645 fputs (".gt", file);
2646 break;
2647 case LTGT:
2648 fputs (".ne", file);
2649 break;
2650 case UNEQ:
2651 fputs (".equ", file);
2652 break;
2653 case UNLE:
2654 fputs (".leu", file);
2655 break;
2656 case UNGE:
2657 fputs (".geu", file);
2658 break;
2659 case UNLT:
2660 fputs (".ltu", file);
2661 break;
2662 case UNGT:
2663 fputs (".gtu", file);
2664 break;
2665 case UNORDERED:
2666 fputs (".nan", file);
2667 break;
2668 case ORDERED:
2669 fputs (".num", file);
2670 break;
2671 default:
2672 gcc_unreachable ();
2673 }
2674 if (FLOAT_MODE_P (mode)
2675 || x_code == EQ || x_code == NE
2676 || x_code == GEU || x_code == GTU
2677 || x_code == LEU || x_code == LTU)
2678 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2679 else
2680 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2681 break;
2682 default:
2683 common:
2684 switch (x_code)
2685 {
2686 case SUBREG:
2687 {
2688 rtx inner_x = SUBREG_REG (x);
2689 machine_mode inner_mode = GET_MODE (inner_x);
2690 machine_mode split = maybe_split_mode (inner_mode);
2691
2692 if (VECTOR_MODE_P (inner_mode)
2693 && (GET_MODE_SIZE (mode)
2694 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2695 {
2696 output_reg (file, REGNO (inner_x), VOIDmode);
2697 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2698 }
2699 else if (split_mode_p (inner_mode)
2700 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2701 output_reg (file, REGNO (inner_x), split);
2702 else
2703 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2704 }
2705 break;
2706
2707 case REG:
2708 output_reg (file, REGNO (x), maybe_split_mode (mode));
2709 break;
2710
2711 case MEM:
2712 fputc ('[', file);
2713 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2714 fputc (']', file);
2715 break;
2716
2717 case CONST_INT:
2718 output_addr_const (file, x);
2719 break;
2720
2721 case CONST:
2722 case SYMBOL_REF:
2723 case LABEL_REF:
2724 /* We could use output_addr_const, but that can print things like
2725 "x-8", which breaks ptxas. Need to ensure it is output as
2726 "x+-8". */
2727 nvptx_print_address_operand (file, x, VOIDmode);
2728 break;
2729
2730 case CONST_DOUBLE:
2731 long vals[2];
2732 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2733 vals[0] &= 0xffffffff;
2734 vals[1] &= 0xffffffff;
2735 if (mode == SFmode)
2736 fprintf (file, "0f%08lx", vals[0]);
2737 else
2738 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2739 break;
2740
2741 case CONST_VECTOR:
2742 {
2743 unsigned n = CONST_VECTOR_NUNITS (x);
2744 fprintf (file, "{ ");
2745 for (unsigned i = 0; i < n; ++i)
2746 {
2747 if (i != 0)
2748 fprintf (file, ", ");
2749
2750 rtx elem = CONST_VECTOR_ELT (x, i);
2751 output_addr_const (file, elem);
2752 }
2753 fprintf (file, " }");
2754 }
2755 break;
2756
2757 default:
2758 output_addr_const (file, x);
2759 }
2760 }
2761 }
2762 \f
2763 /* Record replacement regs used to deal with subreg operands. */
2764 struct reg_replace
2765 {
2766 rtx replacement[MAX_RECOG_OPERANDS];
2767 machine_mode mode;
2768 int n_allocated;
2769 int n_in_use;
2770 };
2771
2772 /* Allocate or reuse a replacement in R and return the rtx. */
2773
2774 static rtx
2775 get_replacement (struct reg_replace *r)
2776 {
2777 if (r->n_allocated == r->n_in_use)
2778 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2779 return r->replacement[r->n_in_use++];
2780 }
2781
2782 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2783 the presence of subregs would break the rules for most instructions.
2784 Replace them with a suitable new register of the right size, plus
2785 conversion copyin/copyout instructions. */
2786
2787 static void
2788 nvptx_reorg_subreg (void)
2789 {
2790 struct reg_replace qiregs, hiregs, siregs, diregs;
2791 rtx_insn *insn, *next;
2792
2793 qiregs.n_allocated = 0;
2794 hiregs.n_allocated = 0;
2795 siregs.n_allocated = 0;
2796 diregs.n_allocated = 0;
2797 qiregs.mode = QImode;
2798 hiregs.mode = HImode;
2799 siregs.mode = SImode;
2800 diregs.mode = DImode;
2801
2802 for (insn = get_insns (); insn; insn = next)
2803 {
2804 next = NEXT_INSN (insn);
2805 if (!NONDEBUG_INSN_P (insn)
2806 || asm_noperands (PATTERN (insn)) >= 0
2807 || GET_CODE (PATTERN (insn)) == USE
2808 || GET_CODE (PATTERN (insn)) == CLOBBER)
2809 continue;
2810
2811 qiregs.n_in_use = 0;
2812 hiregs.n_in_use = 0;
2813 siregs.n_in_use = 0;
2814 diregs.n_in_use = 0;
2815 extract_insn (insn);
2816 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2817
2818 for (int i = 0; i < recog_data.n_operands; i++)
2819 {
2820 rtx op = recog_data.operand[i];
2821 if (GET_CODE (op) != SUBREG)
2822 continue;
2823
2824 rtx inner = SUBREG_REG (op);
2825
2826 machine_mode outer_mode = GET_MODE (op);
2827 machine_mode inner_mode = GET_MODE (inner);
2828 gcc_assert (s_ok);
2829 if (s_ok
2830 && (GET_MODE_PRECISION (inner_mode)
2831 >= GET_MODE_PRECISION (outer_mode)))
2832 continue;
2833 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2834 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2835 : outer_mode == HImode ? &hiregs
2836 : outer_mode == SImode ? &siregs
2837 : &diregs);
2838 rtx new_reg = get_replacement (r);
2839
2840 if (recog_data.operand_type[i] != OP_OUT)
2841 {
2842 enum rtx_code code;
2843 if (GET_MODE_PRECISION (inner_mode)
2844 < GET_MODE_PRECISION (outer_mode))
2845 code = ZERO_EXTEND;
2846 else
2847 code = TRUNCATE;
2848
2849 rtx pat = gen_rtx_SET (new_reg,
2850 gen_rtx_fmt_e (code, outer_mode, inner));
2851 emit_insn_before (pat, insn);
2852 }
2853
2854 if (recog_data.operand_type[i] != OP_IN)
2855 {
2856 enum rtx_code code;
2857 if (GET_MODE_PRECISION (inner_mode)
2858 < GET_MODE_PRECISION (outer_mode))
2859 code = TRUNCATE;
2860 else
2861 code = ZERO_EXTEND;
2862
2863 rtx pat = gen_rtx_SET (inner,
2864 gen_rtx_fmt_e (code, inner_mode, new_reg));
2865 emit_insn_after (pat, insn);
2866 }
2867 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2868 }
2869 }
2870 }
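
/* Schematic example of the rewrite above (illustrative RTL, made-up
   register numbers): an input operand (subreg:SI (reg:QI 23) 0) is given a
   fresh SImode register with a preceding copy-in

     (set (reg:SI 99) (zero_extend:SI (reg:QI 23)))

   and the operand itself becomes (reg:SI 99); an output operand gets the
   mirror-image truncating copy-out emitted after the insn instead.  */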
2871
2872 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2873 first use. */
2874
2875 static rtx
2876 nvptx_get_unisimt_master ()
2877 {
2878 rtx &master = cfun->machine->unisimt_master;
2879 return master ? master : master = gen_reg_rtx (SImode);
2880 }
2881
2882 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2883
2884 static rtx
2885 nvptx_get_unisimt_predicate ()
2886 {
2887 rtx &pred = cfun->machine->unisimt_predicate;
2888 return pred ? pred : pred = gen_reg_rtx (BImode);
2889 }
2890
2891 /* Return true if the given call insn references one of the functions provided by
2892 the CUDA runtime: malloc, free, vprintf. */
2893
2894 static bool
2895 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2896 {
2897 rtx pat = PATTERN (insn);
2898 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2899 pat = XVECEXP (pat, 0, 0);
2900 if (GET_CODE (pat) == SET)
2901 pat = SET_SRC (pat);
2902 gcc_checking_assert (GET_CODE (pat) == CALL
2903 && GET_CODE (XEXP (pat, 0)) == MEM);
2904 rtx addr = XEXP (XEXP (pat, 0), 0);
2905 if (GET_CODE (addr) != SYMBOL_REF)
2906 return false;
2907 const char *name = XSTR (addr, 0);
2908 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2909 references with forced assembler name refer to PTX syscalls. For vprintf,
2910 accept both normal and forced-assembler-name references. */
2911 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2912 || !strcmp (name, "*malloc")
2913 || !strcmp (name, "*free"));
2914 }
2915
2916 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2917 propagate its value from lane MASTER to current lane. */
2918
2919 static void
2920 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2921 {
2922 rtx reg;
2923 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2924 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2925 }
2926
2927 /* Adjust code for uniform-simt code generation variant by making atomics and
2928 "syscalls" conditionally executed, and inserting shuffle-based propagation
2929 for registers being set. */
2930
2931 static void
2932 nvptx_reorg_uniform_simt ()
2933 {
2934 rtx_insn *insn, *next;
2935
2936 for (insn = get_insns (); insn; insn = next)
2937 {
2938 next = NEXT_INSN (insn);
2939 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2940 && !(NONJUMP_INSN_P (insn)
2941 && GET_CODE (PATTERN (insn)) == PARALLEL
2942 && get_attr_atomic (insn)))
2943 continue;
2944 rtx pat = PATTERN (insn);
2945 rtx master = nvptx_get_unisimt_master ();
2946 for (int i = 0; i < XVECLEN (pat, 0); i++)
2947 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2948 rtx pred = nvptx_get_unisimt_predicate ();
2949 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2950 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2951 validate_change (insn, &PATTERN (insn), pat, false);
2952 }
2953 }
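
/* Sketch of the transformation above (schematic RTL only): an atomic or
   "syscall" pattern

     (parallel [(set (reg:SI 30) ...) ...])

   ends up predicated on the uniform-simt predicate,

     (cond_exec (ne (reg:BI pred) (const_int 0)) (parallel [...]))

   and every register it sets is followed by an .idx shuffle that copies the
   master lane's value to the other lanes of the warp.  */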
2954
2955 /* Offloading function attributes. */
2956
2957 struct offload_attrs
2958 {
2959 unsigned mask;
2960 int num_gangs;
2961 int num_workers;
2962 int vector_length;
2963 };
2964
2965 /* Define entries for cfun->machine->axis_dim. */
2966
2967 #define MACH_VECTOR_LENGTH 0
2968 #define MACH_MAX_WORKERS 1
2969
2970 static void populate_offload_attrs (offload_attrs *oa);
2971
2972 static void
2973 init_axis_dim (void)
2974 {
2975 offload_attrs oa;
2976 int max_workers;
2977
2978 populate_offload_attrs (&oa);
2979
2980 if (oa.num_workers == 0)
2981 max_workers = PTX_CTA_SIZE / oa.vector_length;
2982 else
2983 max_workers = oa.num_workers;
2984
2985 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2986 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2987 cfun->machine->axis_dim_init_p = true;
2988 }
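
/* Illustrative arithmetic: with the default vector length of one warp (32
   lanes) and no explicit num_workers, the code above yields
   PTX_CTA_SIZE / 32 workers -- i.e. 32 where a thread block is limited to
   1024 threads.  */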
2989
2990 static int ATTRIBUTE_UNUSED
2991 nvptx_mach_max_workers ()
2992 {
2993 if (!cfun->machine->axis_dim_init_p)
2994 init_axis_dim ();
2995 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
2996 }
2997
2998 static int ATTRIBUTE_UNUSED
2999 nvptx_mach_vector_length ()
3000 {
3001 if (!cfun->machine->axis_dim_init_p)
3002 init_axis_dim ();
3003 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3004 }
3005
3006 /* Loop structure of the function. The entire function is described as
3007 a NULL loop. */
3008
3009 struct parallel
3010 {
3011 /* Parent parallel. */
3012 parallel *parent;
3013
3014 /* Next sibling parallel. */
3015 parallel *next;
3016
3017 /* First child parallel. */
3018 parallel *inner;
3019
3020 /* Partitioning mask of the parallel. */
3021 unsigned mask;
3022
3023 /* Partitioning used within inner parallels. */
3024 unsigned inner_mask;
3025
3026 /* Location of parallel forked and join. The forked is the first
3027 block in the parallel and the join is the first block after
3028 the partition. */
3029 basic_block forked_block;
3030 basic_block join_block;
3031
3032 rtx_insn *forked_insn;
3033 rtx_insn *join_insn;
3034
3035 rtx_insn *fork_insn;
3036 rtx_insn *joining_insn;
3037
3038 /* Basic blocks in this parallel, but not in child parallels. The
3039 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3040 blocks are not. */
3041 auto_vec<basic_block> blocks;
3042
3043 public:
3044 parallel (parallel *parent, unsigned mode);
3045 ~parallel ();
3046 };
3047
3048 /* Constructor links the new parallel into its parent's chain of
3049 children. */
3050
3051 parallel::parallel (parallel *parent_, unsigned mask_)
3052 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3053 {
3054 forked_block = join_block = 0;
3055 forked_insn = join_insn = 0;
3056 fork_insn = joining_insn = 0;
3057
3058 if (parent)
3059 {
3060 next = parent->inner;
3061 parent->inner = this;
3062 }
3063 }
3064
3065 parallel::~parallel ()
3066 {
3067 delete inner;
3068 delete next;
3069 }
3070
3071 /* Map of basic blocks to insns */
3072 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3073
3074 /* A tuple of an insn of interest and the BB in which it resides. */
3075 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3076 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3077
3078 /* Split basic blocks such that each forked and join unspec is at
3079 the start of its basic block. Thus afterwards each block will
3080 have a single partitioning mode. We also split just before return
3081 insns, as they are executed by every thread but the blocks
3082 containing them probably are not. Populate MAP with
3083 head and tail blocks. We also clear the BB visited flag, which is
3084 used when finding partitions. */
3085
3086 static void
3087 nvptx_split_blocks (bb_insn_map_t *map)
3088 {
3089 insn_bb_vec_t worklist;
3090 basic_block block;
3091 rtx_insn *insn;
3092
3093 /* Locate all the reorg instructions of interest. */
3094 FOR_ALL_BB_FN (block, cfun)
3095 {
3096 bool seen_insn = false;
3097
3098 /* Clear visited flag, for use by parallel locator */
3099 block->flags &= ~BB_VISITED;
3100
3101 FOR_BB_INSNS (block, insn)
3102 {
3103 if (!INSN_P (insn))
3104 continue;
3105 switch (recog_memoized (insn))
3106 {
3107 default:
3108 seen_insn = true;
3109 continue;
3110 case CODE_FOR_nvptx_forked:
3111 case CODE_FOR_nvptx_join:
3112 break;
3113
3114 case CODE_FOR_return:
3115 /* We also need to split just before return insns, as
3116 that insn needs executing by all threads, but the
3117 block it is in probably does not. */
3118 break;
3119 }
3120
3121 if (seen_insn)
3122 /* We've found an instruction that must be at the start of
3123 a block, but isn't. Add it to the worklist. */
3124 worklist.safe_push (insn_bb_t (insn, block));
3125 else
3126 /* It was already the first instruction. Just add it to
3127 the map. */
3128 map->get_or_insert (block) = insn;
3129 seen_insn = true;
3130 }
3131 }
3132
3133 /* Split blocks on the worklist. */
3134 unsigned ix;
3135 insn_bb_t *elt;
3136 basic_block remap = 0;
3137 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3138 {
3139 if (remap != elt->second)
3140 {
3141 block = elt->second;
3142 remap = block;
3143 }
3144
3145 /* Split block before insn. The insn is in the new block */
3146 edge e = split_block (block, PREV_INSN (elt->first));
3147
3148 block = e->dest;
3149 map->get_or_insert (block) = elt->first;
3150 }
3151 }
3152
3153 /* Return true if MASK contains parallelism that requires shared
3154 memory to broadcast. */
3155
3156 static bool
3157 nvptx_needs_shared_bcast (unsigned mask)
3158 {
3159 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3160 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3161 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3162
3163 return worker || large_vector;
3164 }
3165
3166 /* BLOCK is a basic block containing a head or tail instruction.
3167 Locate the associated prehead or pretail instruction, which must be
3168 in the single predecessor block. */
3169
3170 static rtx_insn *
3171 nvptx_discover_pre (basic_block block, int expected)
3172 {
3173 gcc_assert (block->preds->length () == 1);
3174 basic_block pre_block = (*block->preds)[0]->src;
3175 rtx_insn *pre_insn;
3176
3177 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3178 pre_insn = PREV_INSN (pre_insn))
3179 gcc_assert (pre_insn != BB_HEAD (pre_block));
3180
3181 gcc_assert (recog_memoized (pre_insn) == expected);
3182 return pre_insn;
3183 }
3184
3185 /* Dump this parallel and all its inner parallels. */
3186
3187 static void
3188 nvptx_dump_pars (parallel *par, unsigned depth)
3189 {
3190 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3191 depth, par->mask,
3192 par->forked_block ? par->forked_block->index : -1,
3193 par->join_block ? par->join_block->index : -1);
3194
3195 fprintf (dump_file, " blocks:");
3196
3197 basic_block block;
3198 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3199 fprintf (dump_file, " %d", block->index);
3200 fprintf (dump_file, "\n");
3201 if (par->inner)
3202 nvptx_dump_pars (par->inner, depth + 1);
3203
3204 if (par->next)
3205 nvptx_dump_pars (par->next, depth);
3206 }
3207
3208 /* If BLOCK contains a fork/join marker, process it to create or
3209 terminate a loop structure. Add this block to the current loop,
3210 and then walk successor blocks. */
3211
3212 static parallel *
3213 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3214 {
3215 if (block->flags & BB_VISITED)
3216 return par;
3217 block->flags |= BB_VISITED;
3218
3219 if (rtx_insn **endp = map->get (block))
3220 {
3221 rtx_insn *end = *endp;
3222
3223 /* This is a block head or tail, or return instruction. */
3224 switch (recog_memoized (end))
3225 {
3226 case CODE_FOR_return:
3227 /* Return instructions are in their own block, and we
3228 don't need to do anything more. */
3229 return par;
3230
3231 case CODE_FOR_nvptx_forked:
3232 /* Loop head, create a new inner loop and add it into
3233 our parent's child list. */
3234 {
3235 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3236
3237 gcc_assert (mask);
3238 par = new parallel (par, mask);
3239 par->forked_block = block;
3240 par->forked_insn = end;
3241 if (nvptx_needs_shared_bcast (mask))
3242 par->fork_insn
3243 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3244 }
3245 break;
3246
3247 case CODE_FOR_nvptx_join:
3248 /* A loop tail. Finish the current loop and return to
3249 parent. */
3250 {
3251 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3252
3253 gcc_assert (par->mask == mask);
3254 par->join_block = block;
3255 par->join_insn = end;
3256 if (nvptx_needs_shared_bcast (mask))
3257 par->joining_insn
3258 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3259 par = par->parent;
3260 }
3261 break;
3262
3263 default:
3264 gcc_unreachable ();
3265 }
3266 }
3267
3268 if (par)
3269 /* Add this block onto the current loop's list of blocks. */
3270 par->blocks.safe_push (block);
3271 else
3272 /* This must be the entry block. Create a NULL parallel. */
3273 par = new parallel (0, 0);
3274
3275 /* Walk successor blocks. */
3276 edge e;
3277 edge_iterator ei;
3278
3279 FOR_EACH_EDGE (e, ei, block->succs)
3280 nvptx_find_par (map, par, e->dest);
3281
3282 return par;
3283 }
3284
3285 /* DFS walk the CFG looking for fork & join markers. Construct
3286 loop structures as we go. MAP is a mapping of basic blocks
3287 to head & tail markers, discovered when splitting blocks. This
3288 speeds up the discovery. We rely on the BB visited flag having
3289 been cleared when splitting blocks. */
3290
3291 static parallel *
3292 nvptx_discover_pars (bb_insn_map_t *map)
3293 {
3294 basic_block block;
3295
3296 /* Mark exit blocks as visited. */
3297 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3298 block->flags |= BB_VISITED;
3299
3300 /* And entry block as not. */
3301 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3302 block->flags &= ~BB_VISITED;
3303
3304 parallel *par = nvptx_find_par (map, 0, block);
3305
3306 if (dump_file)
3307 {
3308 fprintf (dump_file, "\nLoops\n");
3309 nvptx_dump_pars (par, 0);
3310 fprintf (dump_file, "\n");
3311 }
3312
3313 return par;
3314 }
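
/* For orientation (illustrative): a routine containing a worker-partitioned
   region nested inside a gang-partitioned region produces a tree like

     NULL function parallel
       gang parallel (forked/join blocks of the gang region)
         worker parallel (forked/join blocks of the worker region)

   which is what nvptx_dump_pars prints when dumping is enabled.  */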
3315
3316 /* Analyse a group of BBs within a partitioned region and create N
3317 Single-Entry-Single-Exit regions. Some of those regions will be
3318 trivial ones consisting of a single BB. The blocks of a
3319 partitioned region might form a set of disjoint graphs -- because
3320 the region encloses a differently partitioned sub-region.
3321
3322 We use the linear time algorithm described in 'Finding Regions Fast:
3323 Single Entry Single Exit and Control Regions in Linear Time'
3324 Johnson, Pearson & Pingali. That algorithm deals with complete
3325 CFGs, where a back edge is inserted from END to START, and thus the
3326 problem becomes one of finding equivalent loops.
3327
3328 In this case we have a partial CFG. We complete it by redirecting
3329 any incoming edge to the graph to be from an arbitrary external BB,
3330 and similarly redirecting any outgoing edge to be to that BB.
3331 Thus we end up with a closed graph.
3332
3333 The algorithm works by building a spanning tree of an undirected
3334 graph and keeping track of back edges from nodes further from the
3335 root in the tree to nodes nearer to the root in the tree. In the
3336 description below, the root is up and the tree grows downwards.
3337
3338 We avoid having to deal with degenerate back-edges to the same
3339 block, by splitting each BB into 3 -- one for input edges, one for
3340 the node itself and one for the output edges. Such back edges are
3341 referred to as 'Brackets'. Cycle equivalent nodes will have the
3342 same set of brackets.
3343
3344 Determining bracket equivalency is done by maintaining a list of
3345 brackets in such a manner that the list length and final bracket
3346 uniquely identify the set.
3347
3348 We use coloring to mark all BBs with cycle equivalency with the
3349 same color. This is the output of the 'Finding Regions Fast'
3350 algorithm. Notice it doesn't actually find the set of nodes within
3351 a particular region, just unordered sets of nodes that are the
3352 entries and exits of SESE regions.
3353
3354 After determining cycle equivalency, we need to find the minimal
3355 set of SESE regions. Do this with a DFS coloring walk of the
3356 complete graph. We're either 'looking' or 'coloring'. When
3357 looking, and we're in the subgraph, we start coloring the color of
3358 the current node, and remember that node as the start of the
3359 current color's SESE region. Every time we go to a new node, we
3360 decrement the count of nodes with that color. If it reaches zero,
3361 we remember that node as the end of the current color's SESE region
3362 and return to 'looking'. Otherwise we color the node the current
3363 color.
3364
3365 This way we end up with coloring the inside of non-trivial SESE
3366 regions with the color of that region. */
3367
3368 /* A pair of BBs. We use this to represent SESE regions. */
3369 typedef std::pair<basic_block, basic_block> bb_pair_t;
3370 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3371
3372 /* A node in the undirected CFG. The discriminator SECOND indicates just
3373 above or just below the BB indicated by FIRST. */
3374 typedef std::pair<basic_block, int> pseudo_node_t;
3375
3376 /* A bracket indicates an edge towards the root of the spanning tree of the
3377 undirected graph. Each bracket has a color, determined
3378 from the current set of brackets. */
3379 struct bracket
3380 {
3381 pseudo_node_t back; /* Back target */
3382
3383 /* Current color and size of set. */
3384 unsigned color;
3385 unsigned size;
3386
3387 bracket (pseudo_node_t back_)
3388 : back (back_), color (~0u), size (~0u)
3389 {
3390 }
3391
3392 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3393 {
3394 if (length != size)
3395 {
3396 size = length;
3397 color = color_counts.length ();
3398 color_counts.quick_push (0);
3399 }
3400 color_counts[color]++;
3401 return color;
3402 }
3403 };
3404
3405 typedef auto_vec<bracket> bracket_vec_t;
3406
3407 /* Basic block info for finding SESE regions. */
3408
3409 struct bb_sese
3410 {
3411 int node; /* Node number in spanning tree. */
3412 int parent; /* Parent node number. */
3413
3414 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3415 edges arrive at pseudo-node Ai and the outgoing edges leave at
3416 pseudo-node Ao. We have to remember which way we arrived at a
3417 particular node when generating the spanning tree. dir > 0 means
3418 we arrived at Ai, dir < 0 means we arrived at Ao. */
3419 int dir;
3420
3421 /* Lowest numbered pseudo-node reached via a backedge from this
3422 node, or any descendant. */
3423 pseudo_node_t high;
3424
3425 int color; /* Cycle-equivalence color */
3426
3427 /* Stack of brackets for this node. */
3428 bracket_vec_t brackets;
3429
3430 bb_sese (unsigned node_, unsigned p, int dir_)
3431 :node (node_), parent (p), dir (dir_)
3432 {
3433 }
3434 ~bb_sese ();
3435
3436 /* Push a bracket ending at BACK. */
3437 void push (const pseudo_node_t &back)
3438 {
3439 if (dump_file)
3440 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3441 back.first ? back.first->index : 0, back.second);
3442 brackets.safe_push (bracket (back));
3443 }
3444
3445 void append (bb_sese *child);
3446 void remove (const pseudo_node_t &);
3447
3448 /* Set node's color. */
3449 void set_color (auto_vec<unsigned> &color_counts)
3450 {
3451 color = brackets.last ().get_color (color_counts, brackets.length ());
3452 }
3453 };
3454
3455 bb_sese::~bb_sese ()
3456 {
3457 }
3458
3459 /* Destructively append CHILD's brackets. */
3460
3461 void
3462 bb_sese::append (bb_sese *child)
3463 {
3464 if (int len = child->brackets.length ())
3465 {
3466 int ix;
3467
3468 if (dump_file)
3469 {
3470 for (ix = 0; ix < len; ix++)
3471 {
3472 const pseudo_node_t &pseudo = child->brackets[ix].back;
3473 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3474 child->node, pseudo.first ? pseudo.first->index : 0,
3475 pseudo.second);
3476 }
3477 }
3478 if (!brackets.length ())
3479 std::swap (brackets, child->brackets);
3480 else
3481 {
3482 brackets.reserve (len);
3483 for (ix = 0; ix < len; ix++)
3484 brackets.quick_push (child->brackets[ix]);
3485 }
3486 }
3487 }
3488
3489 /* Remove brackets that terminate at PSEUDO. */
3490
3491 void
3492 bb_sese::remove (const pseudo_node_t &pseudo)
3493 {
3494 unsigned removed = 0;
3495 int len = brackets.length ();
3496
3497 for (int ix = 0; ix < len; ix++)
3498 {
3499 if (brackets[ix].back == pseudo)
3500 {
3501 if (dump_file)
3502 fprintf (dump_file, "Removing backedge %d:%+d\n",
3503 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3504 removed++;
3505 }
3506 else if (removed)
3507 brackets[ix-removed] = brackets[ix];
3508 }
3509 while (removed--)
3510 brackets.pop ();
3511 }
3512
3513 /* Accessors for BB's aux pointer. */
3514 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3515 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3516
3517 /* DFS walk creating SESE data structures. Only cover nodes with
3518 BB_VISITED set. Append discovered blocks to LIST. We number in
3519 increments of 3 so that the above and below pseudo nodes can be
3520 implicitly numbered too. */
3521
3522 static int
3523 nvptx_sese_number (int n, int p, int dir, basic_block b,
3524 auto_vec<basic_block> *list)
3525 {
3526 if (BB_GET_SESE (b))
3527 return n;
3528
3529 if (dump_file)
3530 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3531 b->index, n, p, dir);
3532
3533 BB_SET_SESE (b, new bb_sese (n, p, dir));
3534 p = n;
3535
3536 n += 3;
3537 list->quick_push (b);
3538
3539 /* First walk the nodes on the 'other side' of this node, then walk
3540 the nodes on the same side. */
3541 for (unsigned ix = 2; ix; ix--)
3542 {
3543 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3544 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3545 : offsetof (edge_def, src));
3546 edge e;
3547 edge_iterator (ei);
3548
3549 FOR_EACH_EDGE (e, ei, edges)
3550 {
3551 basic_block target = *(basic_block *)((char *)e + offset);
3552
3553 if (target->flags & BB_VISITED)
3554 n = nvptx_sese_number (n, p, dir, target, list);
3555 }
3556 dir = -dir;
3557 }
3558 return n;
3559 }
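
/* An illustrative reading of the numbering above: because node numbers
   advance in steps of 3 (starting at 2), a node numbered N leaves N-1 and
   N+1 free for its two pseudo-nodes, which is what the +/-1 adjustments in
   nvptx_sese_pseudo rely on.  */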
3560
3561 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3562 EDGES are the outgoing edges and OFFSET is the offset to the src
3563 or dst block on the edges. */
3564
3565 static void
3566 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3567 vec<edge, va_gc> *edges, size_t offset)
3568 {
3569 edge e;
3570 edge_iterator (ei);
3571 int hi_back = depth;
3572 pseudo_node_t node_back (0, depth);
3573 int hi_child = depth;
3574 pseudo_node_t node_child (0, depth);
3575 basic_block child = NULL;
3576 unsigned num_children = 0;
3577 int usd = -dir * sese->dir;
3578
3579 if (dump_file)
3580 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3581 me->index, sese->node, dir);
3582
3583 if (dir < 0)
3584 {
3585 /* This is the above pseudo-child. It has the BB itself as an
3586 additional child node. */
3587 node_child = sese->high;
3588 hi_child = node_child.second;
3589 if (node_child.first)
3590 hi_child += BB_GET_SESE (node_child.first)->node;
3591 num_children++;
3592 }
3593
3594 /* Examine each edge.
3595 - if it is a child (a) append its bracket list and (b) record
3596 whether it is the child with the highest reaching bracket.
3597 - if it is an edge to ancestor, record whether it's the highest
3598 reaching backlink. */
3599 FOR_EACH_EDGE (e, ei, edges)
3600 {
3601 basic_block target = *(basic_block *)((char *)e + offset);
3602
3603 if (bb_sese *t_sese = BB_GET_SESE (target))
3604 {
3605 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3606 {
3607 /* Child node. Append its bracket list. */
3608 num_children++;
3609 sese->append (t_sese);
3610
3611 /* Compare its hi value. */
3612 int t_hi = t_sese->high.second;
3613
3614 if (basic_block child_hi_block = t_sese->high.first)
3615 t_hi += BB_GET_SESE (child_hi_block)->node;
3616
3617 if (hi_child > t_hi)
3618 {
3619 hi_child = t_hi;
3620 node_child = t_sese->high;
3621 child = target;
3622 }
3623 }
3624 else if (t_sese->node < sese->node + dir
3625 && !(dir < 0 && sese->parent == t_sese->node))
3626 {
3627 /* Non-parental ancestor node -- a backlink. */
3628 int d = usd * t_sese->dir;
3629 int back = t_sese->node + d;
3630
3631 if (hi_back > back)
3632 {
3633 hi_back = back;
3634 node_back = pseudo_node_t (target, d);
3635 }
3636 }
3637 }
3638 else
3639 { /* Fallen off graph, backlink to entry node. */
3640 hi_back = 0;
3641 node_back = pseudo_node_t (0, 0);
3642 }
3643 }
3644
3645 /* Remove any brackets that terminate at this pseudo node. */
3646 sese->remove (pseudo_node_t (me, dir));
3647
3648 /* Now push any backlinks from this pseudo node. */
3649 FOR_EACH_EDGE (e, ei, edges)
3650 {
3651 basic_block target = *(basic_block *)((char *)e + offset);
3652 if (bb_sese *t_sese = BB_GET_SESE (target))
3653 {
3654 if (t_sese->node < sese->node + dir
3655 && !(dir < 0 && sese->parent == t_sese->node))
3656 /* Non-parental ancestor node - backedge from me. */
3657 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3658 }
3659 else
3660 {
3661 /* back edge to entry node */
3662 sese->push (pseudo_node_t (0, 0));
3663 }
3664 }
3665
3666 /* If this node leads directly or indirectly to a no-return region of
3667 the graph, then fake a backedge to entry node. */
3668 if (!sese->brackets.length () || !edges || !edges->length ())
3669 {
3670 hi_back = 0;
3671 node_back = pseudo_node_t (0, 0);
3672 sese->push (node_back);
3673 }
3674
3675 /* Record the highest reaching backedge from us or a descendant. */
3676 sese->high = hi_back < hi_child ? node_back : node_child;
3677
3678 if (num_children > 1)
3679 {
3680 /* There is more than one child -- this is a Y shaped piece of
3681 spanning tree. We have to insert a fake backedge from this
3682 node to the highest ancestor reached by not-the-highest
3683 reaching child. Note that there may be multiple children
3684 with backedges to the same highest node. That's ok and we
3685 insert the edge to that highest node. */
3686 hi_child = depth;
3687 if (dir < 0 && child)
3688 {
3689 node_child = sese->high;
3690 hi_child = node_child.second;
3691 if (node_child.first)
3692 hi_child += BB_GET_SESE (node_child.first)->node;
3693 }
3694
3695 FOR_EACH_EDGE (e, ei, edges)
3696 {
3697 basic_block target = *(basic_block *)((char *)e + offset);
3698
3699 if (target == child)
3700 /* Ignore the highest child. */
3701 continue;
3702
3703 bb_sese *t_sese = BB_GET_SESE (target);
3704 if (!t_sese)
3705 continue;
3706 if (t_sese->parent != sese->node)
3707 /* Not a child. */
3708 continue;
3709
3710 /* Compare its hi value. */
3711 int t_hi = t_sese->high.second;
3712
3713 if (basic_block child_hi_block = t_sese->high.first)
3714 t_hi += BB_GET_SESE (child_hi_block)->node;
3715
3716 if (hi_child > t_hi)
3717 {
3718 hi_child = t_hi;
3719 node_child = t_sese->high;
3720 }
3721 }
3722
3723 sese->push (node_child);
3724 }
3725 }
3726
3727
3728 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3729 proceed to successors. Set SESE entry and exit nodes of
3730 REGIONS. */
3731
3732 static void
3733 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3734 basic_block block, int coloring)
3735 {
3736 bb_sese *sese = BB_GET_SESE (block);
3737
3738 if (block->flags & BB_VISITED)
3739 {
3740 /* If we've already encountered this block, either we must not
3741 be coloring, or it must have been colored the current color. */
3742 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3743 return;
3744 }
3745
3746 block->flags |= BB_VISITED;
3747
3748 if (sese)
3749 {
3750 if (coloring < 0)
3751 {
3752 /* Start coloring a region. */
3753 regions[sese->color].first = block;
3754 coloring = sese->color;
3755 }
3756
3757 if (!--color_counts[sese->color] && sese->color == coloring)
3758 {
3759 /* Found final block of SESE region. */
3760 regions[sese->color].second = block;
3761 coloring = -1;
3762 }
3763 else
3764 /* Color the node, so we can assert on revisiting the node
3765 that the graph is indeed SESE. */
3766 sese->color = coloring;
3767 }
3768 else
3769 /* Fallen off the subgraph, we cannot be coloring. */
3770 gcc_assert (coloring < 0);
3771
3772 /* Walk each successor block. */
3773 if (block->succs && block->succs->length ())
3774 {
3775 edge e;
3776 edge_iterator ei;
3777
3778 FOR_EACH_EDGE (e, ei, block->succs)
3779 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3780 }
3781 else
3782 gcc_assert (coloring < 0);
3783 }
3784
3785 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3786 end up with NULL entries in it. */
3787
3788 static void
3789 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3790 {
3791 basic_block block;
3792 int ix;
3793
3794 /* First clear each BB of the whole function. */
3795 FOR_ALL_BB_FN (block, cfun)
3796 {
3797 block->flags &= ~BB_VISITED;
3798 BB_SET_SESE (block, 0);
3799 }
3800
3801 /* Mark blocks in the function that are in this graph. */
3802 for (ix = 0; blocks.iterate (ix, &block); ix++)
3803 block->flags |= BB_VISITED;
3804
3805 /* Counts of nodes assigned to each color. There cannot be more
3806 colors than blocks (and hopefully there will be fewer). */
3807 auto_vec<unsigned> color_counts;
3808 color_counts.reserve (blocks.length ());
3809
3810 /* Worklist of nodes in the spanning tree. Again, there cannot be
3811 more nodes in the tree than blocks (there will be fewer if the
3812 CFG of blocks is disjoint). */
3813 auto_vec<basic_block> spanlist;
3814 spanlist.reserve (blocks.length ());
3815
3816 /* Make sure every block has its cycle class determined. */
3817 for (ix = 0; blocks.iterate (ix, &block); ix++)
3818 {
3819 if (BB_GET_SESE (block))
3820 /* We already met this block in an earlier graph solve. */
3821 continue;
3822
3823 if (dump_file)
3824 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3825
3826       /* Number the nodes reachable from BLOCK in initial DFS order. */
3827 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3828
3829 /* Now walk in reverse DFS order to find cycle equivalents. */
3830 while (spanlist.length ())
3831 {
3832 block = spanlist.pop ();
3833 bb_sese *sese = BB_GET_SESE (block);
3834
3835 /* Do the pseudo node below. */
3836 nvptx_sese_pseudo (block, sese, depth, +1,
3837 sese->dir > 0 ? block->succs : block->preds,
3838 (sese->dir > 0 ? offsetof (edge_def, dest)
3839 : offsetof (edge_def, src)));
3840 sese->set_color (color_counts);
3841 /* Do the pseudo node above. */
3842 nvptx_sese_pseudo (block, sese, depth, -1,
3843 sese->dir < 0 ? block->succs : block->preds,
3844 (sese->dir < 0 ? offsetof (edge_def, dest)
3845 : offsetof (edge_def, src)));
3846 }
3847 if (dump_file)
3848 fprintf (dump_file, "\n");
3849 }
3850
3851 if (dump_file)
3852 {
3853 unsigned count;
3854 const char *comma = "";
3855
3856 fprintf (dump_file, "Found %d cycle equivalents\n",
3857 color_counts.length ());
3858 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3859 {
3860 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3861
3862 comma = "";
3863 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3864 if (BB_GET_SESE (block)->color == ix)
3865 {
3866 block->flags |= BB_VISITED;
3867 fprintf (dump_file, "%s%d", comma, block->index);
3868 comma=",";
3869 }
3870 fprintf (dump_file, "}");
3871 comma = ", ";
3872 }
3873 fprintf (dump_file, "\n");
3874 }
3875
3876 /* Now we've colored every block in the subgraph. We now need to
3877 determine the minimal set of SESE regions that cover that
3878 subgraph. Do this with a DFS walk of the complete function.
3879 During the walk we're either 'looking' or 'coloring'. When we
3880 reach the last node of a particular color, we stop coloring and
3881 return to looking. */
3882
3883 /* There cannot be more SESE regions than colors. */
3884 regions.reserve (color_counts.length ());
3885 for (ix = color_counts.length (); ix--;)
3886 regions.quick_push (bb_pair_t (0, 0));
3887
3888 for (ix = 0; blocks.iterate (ix, &block); ix++)
3889 block->flags &= ~BB_VISITED;
3890
3891 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3892
3893 if (dump_file)
3894 {
3895 const char *comma = "";
3896 int len = regions.length ();
3897
3898 fprintf (dump_file, "SESE regions:");
3899 for (ix = 0; ix != len; ix++)
3900 {
3901 basic_block from = regions[ix].first;
3902 basic_block to = regions[ix].second;
3903
3904 if (from)
3905 {
3906 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3907 if (to != from)
3908 fprintf (dump_file, "->%d", to->index);
3909
3910 int color = BB_GET_SESE (from)->color;
3911
3912 /* Print the blocks within the region (excluding ends). */
3913 FOR_EACH_BB_FN (block, cfun)
3914 {
3915 bb_sese *sese = BB_GET_SESE (block);
3916
3917 if (sese && sese->color == color
3918 && block != from && block != to)
3919 fprintf (dump_file, ".%d", block->index);
3920 }
3921 fprintf (dump_file, "}");
3922 }
3923 comma = ",";
3924 }
3925 fprintf (dump_file, "\n\n");
3926 }
3927
3928 for (ix = 0; blocks.iterate (ix, &block); ix++)
3929 delete BB_GET_SESE (block);
3930 }
3931
3932 #undef BB_SET_SESE
3933 #undef BB_GET_SESE
3934
3935 /* Propagate live state at the start of a partitioned region. IS_CALL
3936 indicates whether the propagation is for a (partitioned) call
3937 instruction. BLOCK provides the live register information, and
3938 might not contain INSN. Propagation is inserted just after INSN. RW
3939 indicates whether we are reading and/or writing state. This
3940    separation is needed for worker-level propagation where we
3941    essentially do a spill & fill.  FN is the underlying worker
3942    function to generate the propagation instructions for a single
3943 register. DATA is user data.
3944
3945 Returns true if we didn't emit any instructions.
3946
3947 We propagate the live register set for non-calls and the entire
3948 frame for calls and non-calls. We could do better by (a)
3949 propagating just the live set that is used within the partitioned
3950 regions and (b) only propagating stack entries that are used. The
3951 latter might be quite hard to determine. */
3952
3953 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
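/* A minimal reading of the callback, inferred from the two
   instantiations below (warp_prop_gen and shared_prop_gen): the rtx
   argument is the register (or DImode frame temporary) being
   propagated, the propagate_mask selects the phase (PM_loop_begin,
   PM_read, PM_write, PM_loop_end), the unsigned argument is the
   frame-copy loop count (zero for plain registers), the void *
   argument is the caller's cookie, and the bool selects vector-level
   rather than worker-level propagation.  */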
3954
3955 static bool
3956 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3957 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3958 {
3959 bitmap live = DF_LIVE_IN (block);
3960 bitmap_iterator iterator;
3961 unsigned ix;
3962 bool empty = true;
3963
3964 /* Copy the frame array. */
3965 HOST_WIDE_INT fs = get_frame_size ();
3966 if (fs)
3967 {
3968 rtx tmp = gen_reg_rtx (DImode);
3969 rtx idx = NULL_RTX;
3970 rtx ptr = gen_reg_rtx (Pmode);
3971 rtx pred = NULL_RTX;
3972 rtx_code_label *label = NULL;
3973
3974 empty = false;
3975 /* The frame size might not be DImode compatible, but the frame
3976 array's declaration will be. So it's ok to round up here. */
3977 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3978 /* Detect single iteration loop. */
3979 if (fs == 1)
3980 fs = 0;
3981
3982 start_sequence ();
3983 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3984 if (fs)
3985 {
3986 idx = gen_reg_rtx (SImode);
3987 pred = gen_reg_rtx (BImode);
3988 label = gen_label_rtx ();
3989
3990 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3991 /* Allow worker function to initialize anything needed. */
3992 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
3993 if (init)
3994 emit_insn (init);
3995 emit_label (label);
3996 LABEL_NUSES (label)++;
3997 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3998 }
3999 if (rw & PM_read)
4000 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4001 emit_insn (fn (tmp, rw, fs, data, vector));
4002 if (rw & PM_write)
4003 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4004 if (fs)
4005 {
4006 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4007 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4008 emit_insn (gen_br_true_uni (pred, label));
4009 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4010 if (fini)
4011 emit_insn (fini);
4012 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4013 }
4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4015 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4016 rtx cpy = get_insns ();
4017 end_sequence ();
4018 insn = emit_insn_after (cpy, insn);
4019 }
4020
4021 if (!is_call)
4022 /* Copy live registers. */
4023 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4024 {
4025 rtx reg = regno_reg_rtx[ix];
4026
4027 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4028 {
4029 rtx bcast = fn (reg, rw, 0, data, vector);
4030
4031 insn = emit_insn_after (bcast, insn);
4032 empty = false;
4033 }
4034 }
4035 return empty;
4036 }
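/* For illustration only -- a sketch of what the sequence built above
   does for an assumed 24-byte frame (three DImode words) with both
   PM_read and PM_write set; the fn () call sites are elided to
   comments:

       ptr = frame_pointer;
       idx = 3;
     loop:
       idx -= 1;
       tmp = *ptr;              // PM_read
       // fn (tmp, rw, ...)     // spill, fill or broadcast TMP
       *ptr = tmp;              // PM_write
       pred = (idx != 0);
       ptr += 8;
       if (pred) goto loop;     // uniform branch

   For a frame that fits a single DImode word the loop machinery is
   omitted entirely (the fs == 1 case above).  */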
4037
4038 /* Worker for nvptx_warp_propagate. */
4039
4040 static rtx
4041 warp_prop_gen (rtx reg, propagate_mask pm,
4042 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4043 bool ARG_UNUSED (vector))
4044 {
4045 if (!(pm & PM_read_write))
4046 return 0;
4047
4048 return nvptx_gen_warp_bcast (reg);
4049 }
4050
4051 /* Propagate state that is live at start of BLOCK across the vectors
4052 of a single warp. Propagation is inserted just after INSN.
4053 IS_CALL and return as for nvptx_propagate. */
4054
4055 static bool
4056 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4057 {
4058 return nvptx_propagate (is_call, block, insn, PM_read_write,
4059 warp_prop_gen, 0, false);
4060 }
4061
4062 /* Worker for nvptx_shared_propagate. */
4063
4064 static rtx
4065 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4066 bool vector)
4067 {
4068 broadcast_data_t *data = (broadcast_data_t *)data_;
4069
4070 if (pm & PM_loop_begin)
4071 {
4072 /* Starting a loop, initialize pointer. */
4073 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4074
4075 oacc_bcast_align = MAX (oacc_bcast_align, align);
4076 data->offset = ROUND_UP (data->offset, align);
4077
4078 data->ptr = gen_reg_rtx (Pmode);
4079
4080 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4081 }
4082 else if (pm & PM_loop_end)
4083 {
4084 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4085 data->ptr = NULL_RTX;
4086 return clobber;
4087 }
4088 else
4089 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4090 }
4091
4092 /* Spill or fill the state that is live at the start of BLOCK.  PRE_P
4093 indicates if this is just before partitioned mode (do spill), or
4094 just after it starts (do fill). Sequence is inserted just after
4095 INSN. IS_CALL and return as for nvptx_propagate. */
4096
4097 static bool
4098 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4099 rtx_insn *insn, bool vector)
4100 {
4101 broadcast_data_t data;
4102
4103 data.base = gen_reg_rtx (Pmode);
4104 data.offset = 0;
4105 data.ptr = NULL_RTX;
4106
4107 bool empty = nvptx_propagate (is_call, block, insn,
4108 pre_p ? PM_read : PM_write, shared_prop_gen,
4109 &data, vector);
4110 gcc_assert (empty == !data.offset);
4111 if (data.offset)
4112 {
4113 rtx bcast_sym = oacc_bcast_sym;
4114
4115 /* Stuff was emitted, initialize the base pointer now. */
4116 if (vector && nvptx_mach_max_workers () > 1)
4117 {
4118 if (!cfun->machine->bcast_partition)
4119 {
4120 /* It would be nice to place this register in
4121 DATA_AREA_SHARED. */
4122 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4123 }
4124 if (!cfun->machine->sync_bar)
4125 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4126
4127 bcast_sym = cfun->machine->bcast_partition;
4128 }
4129
4130 rtx init = gen_rtx_SET (data.base, bcast_sym);
4131 emit_insn_after (init, insn);
4132
4133 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4134 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4135 ? nvptx_mach_max_workers () + 1
4136 : 1);
4137
4138 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4139 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4140 }
4141 return empty;
4142 }
4143
4144 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4145 which is an integer or a register. THREADS is the number of threads
4146 controlled by the barrier. */
4147
4148 static rtx
4149 nvptx_cta_sync (rtx lock, int threads)
4150 {
4151 return gen_nvptx_barsync (lock, GEN_INT (threads));
4152 }
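/* Sketch of the PTX this typically becomes, assuming the barsync
   pattern in nvptx.md emits a plain barrier instruction:

     bar.sync  0;          // THREADS == 0: whole CTA
     bar.sync  %r1, 64;    // named barrier with 64 participating threads

   The barrier number and thread count come straight from LOCK and
   THREADS.  */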
4153
4154 #if WORKAROUND_PTXJIT_BUG
4155 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4156 real insns. */
4157
4158 static rtx_insn *
4159 bb_first_real_insn (basic_block bb)
4160 {
4161 rtx_insn *insn;
4162
4163   /* Find the first real insn of BB. */
4164 FOR_BB_INSNS (bb, insn)
4165 if (INSN_P (insn))
4166 return insn;
4167
4168 return 0;
4169 }
4170 #endif
4171
4172 /* Return true if INSN needs neutering. */
4173
4174 static bool
4175 needs_neutering_p (rtx_insn *insn)
4176 {
4177 if (!INSN_P (insn))
4178 return false;
4179
4180 switch (recog_memoized (insn))
4181 {
4182 case CODE_FOR_nvptx_fork:
4183 case CODE_FOR_nvptx_forked:
4184 case CODE_FOR_nvptx_joining:
4185 case CODE_FOR_nvptx_join:
4186 case CODE_FOR_nvptx_barsync:
4187 return false;
4188 default:
4189 return true;
4190 }
4191 }
4192
4193 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4194
4195 static bool
4196 verify_neutering_jumps (basic_block from,
4197 rtx_insn *vector_jump, rtx_insn *worker_jump,
4198 rtx_insn *vector_label, rtx_insn *worker_label)
4199 {
4200 basic_block bb = from;
4201 rtx_insn *insn = BB_HEAD (bb);
4202 bool seen_worker_jump = false;
4203 bool seen_vector_jump = false;
4204 bool seen_worker_label = false;
4205 bool seen_vector_label = false;
4206 bool worker_neutered = false;
4207 bool vector_neutered = false;
4208 while (true)
4209 {
4210 if (insn == worker_jump)
4211 {
4212 seen_worker_jump = true;
4213 worker_neutered = true;
4214 gcc_assert (!vector_neutered);
4215 }
4216 else if (insn == vector_jump)
4217 {
4218 seen_vector_jump = true;
4219 vector_neutered = true;
4220 }
4221 else if (insn == worker_label)
4222 {
4223 seen_worker_label = true;
4224 gcc_assert (worker_neutered);
4225 worker_neutered = false;
4226 }
4227 else if (insn == vector_label)
4228 {
4229 seen_vector_label = true;
4230 gcc_assert (vector_neutered);
4231 vector_neutered = false;
4232 }
4233 else if (INSN_P (insn))
4234 switch (recog_memoized (insn))
4235 {
4236 case CODE_FOR_nvptx_barsync:
4237 gcc_assert (!vector_neutered && !worker_neutered);
4238 break;
4239 default:
4240 break;
4241 }
4242
4243 if (insn != BB_END (bb))
4244 insn = NEXT_INSN (insn);
4245 else if (JUMP_P (insn) && single_succ_p (bb)
4246 && !seen_vector_jump && !seen_worker_jump)
4247 {
4248 bb = single_succ (bb);
4249 insn = BB_HEAD (bb);
4250 }
4251 else
4252 break;
4253 }
4254
4255 gcc_assert (!(vector_jump && !seen_vector_jump));
4256 gcc_assert (!(worker_jump && !seen_worker_jump));
4257
4258 if (seen_vector_label || seen_worker_label)
4259 {
4260 gcc_assert (!(vector_label && !seen_vector_label));
4261 gcc_assert (!(worker_label && !seen_worker_label));
4262
4263 return true;
4264 }
4265
4266 return false;
4267 }
4268
4269 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4270
4271 static void
4272 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4273 rtx_insn *worker_label)
4274 {
4275 basic_block bb = to;
4276 rtx_insn *insn = BB_END (bb);
4277 bool seen_worker_label = false;
4278 bool seen_vector_label = false;
4279 while (true)
4280 {
4281 if (insn == worker_label)
4282 {
4283 seen_worker_label = true;
4284 gcc_assert (!seen_vector_label);
4285 }
4286 else if (insn == vector_label)
4287 seen_vector_label = true;
4288 else if (INSN_P (insn))
4289 switch (recog_memoized (insn))
4290 {
4291 case CODE_FOR_nvptx_barsync:
4292 gcc_assert (!seen_vector_label && !seen_worker_label);
4293 break;
4294 }
4295
4296 if (insn != BB_HEAD (bb))
4297 insn = PREV_INSN (insn);
4298 else
4299 break;
4300 }
4301
4302 gcc_assert (!(vector_label && !seen_vector_label));
4303 gcc_assert (!(worker_label && !seen_worker_label));
4304 }
4305
4306 /* Single neutering according to MASK. FROM is the incoming block and
4307 TO is the outgoing block. These may be the same block. Insert at
4308 start of FROM:
4309
4310 if (tid.<axis>) goto end.
4311
4312 and insert before ending branch of TO (if there is such an insn):
4313
4314 end:
4315 <possibly-broadcast-cond>
4316 <branch>
4317
4318    We currently only use different FROM and TO when skipping an entire
4319 loop. We could do more if we detected superblocks. */
4320
4321 static void
4322 nvptx_single (unsigned mask, basic_block from, basic_block to)
4323 {
4324 rtx_insn *head = BB_HEAD (from);
4325 rtx_insn *tail = BB_END (to);
4326 unsigned skip_mask = mask;
4327
4328 while (true)
4329 {
4330 /* Find first insn of from block. */
4331 while (head != BB_END (from) && !needs_neutering_p (head))
4332 head = NEXT_INSN (head);
4333
4334 if (from == to)
4335 break;
4336
4337 if (!(JUMP_P (head) && single_succ_p (from)))
4338 break;
4339
4340 basic_block jump_target = single_succ (from);
4341 if (!single_pred_p (jump_target))
4342 break;
4343
4344 from = jump_target;
4345 head = BB_HEAD (from);
4346 }
4347
4348   /* Find last insn of to block. */
4349 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4350 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4351 tail = PREV_INSN (tail);
4352
4353 /* Detect if tail is a branch. */
4354 rtx tail_branch = NULL_RTX;
4355 rtx cond_branch = NULL_RTX;
4356 if (tail && INSN_P (tail))
4357 {
4358 tail_branch = PATTERN (tail);
4359 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4360 tail_branch = NULL_RTX;
4361 else
4362 {
4363 cond_branch = SET_SRC (tail_branch);
4364 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4365 cond_branch = NULL_RTX;
4366 }
4367 }
4368
4369 if (tail == head)
4370 {
4371 /* If this is empty, do nothing. */
4372 if (!head || !needs_neutering_p (head))
4373 return;
4374
4375 if (cond_branch)
4376 {
4377 /* If we're only doing vector single, there's no need to
4378 emit skip code because we'll not insert anything. */
4379 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4380 skip_mask = 0;
4381 }
4382 else if (tail_branch)
4383 /* Block with only unconditional branch. Nothing to do. */
4384 return;
4385 }
4386
4387 /* Insert the vector test inside the worker test. */
4388 unsigned mode;
4389 rtx_insn *before = tail;
4390 rtx_insn *neuter_start = NULL;
4391 rtx_insn *worker_label = NULL, *vector_label = NULL;
4392 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4393 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4394 if (GOMP_DIM_MASK (mode) & skip_mask)
4395 {
4396 rtx_code_label *label = gen_label_rtx ();
4397 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4398 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4399 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4400
4401 if (!pred)
4402 {
4403 pred = gen_reg_rtx (BImode);
4404 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4405 }
4406
4407 rtx br;
4408 if (mode == GOMP_DIM_VECTOR)
4409 br = gen_br_true (pred, label);
4410 else
4411 br = gen_br_true_uni (pred, label);
4412 if (neuter_start)
4413 neuter_start = emit_insn_after (br, neuter_start);
4414 else
4415 neuter_start = emit_insn_before (br, head);
4416 *mode_jump = neuter_start;
4417
4418 LABEL_NUSES (label)++;
4419 rtx_insn *label_insn;
4420 if (tail_branch)
4421 {
4422 label_insn = emit_label_before (label, before);
4423 before = label_insn;
4424 }
4425 else
4426 {
4427 label_insn = emit_label_after (label, tail);
4428 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4429 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4430 emit_insn_after (gen_exit (), label_insn);
4431 }
4432
4433 if (mode == GOMP_DIM_VECTOR)
4434 vector_label = label_insn;
4435 else
4436 worker_label = label_insn;
4437 }
4438
4439 /* Now deal with propagating the branch condition. */
4440 if (cond_branch)
4441 {
4442 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4443
4444 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4445 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4446 {
4447 /* Vector mode only, do a shuffle. */
4448 #if WORKAROUND_PTXJIT_BUG
4449 /* The branch condition %rcond is propagated like this:
4450
4451 {
4452 .reg .u32 %x;
4453 mov.u32 %x,%tid.x;
4454 setp.ne.u32 %rnotvzero,%x,0;
4455 }
4456
4457 @%rnotvzero bra Lskip;
4458 setp.<op>.<type> %rcond,op1,op2;
4459 Lskip:
4460 selp.u32 %rcondu32,1,0,%rcond;
4461 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4462 setp.ne.u32 %rcond,%rcondu32,0;
4463
4464 There seems to be a bug in the ptx JIT compiler (observed at driver
4465 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4466 unless %rcond is initialized to something before 'bra Lskip'. The
4467 bug is not observed with ptxas from cuda 8.0.61.
4468
4469 It is true that the code is non-trivial: at Lskip, %rcond is
4470 uninitialized in threads 1-31, and after the selp the same holds
4471 for %rcondu32. But shfl propagates the defined value in thread 0
4472 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4473 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4474
4475 There is nothing in the PTX spec to suggest that this is wrong, or
4476 to explain why the extra initialization is needed. So, we classify
4477 it as a JIT bug, and the extra initialization as workaround:
4478
4479 {
4480 .reg .u32 %x;
4481 mov.u32 %x,%tid.x;
4482 setp.ne.u32 %rnotvzero,%x,0;
4483 }
4484
4485 +.reg .pred %rcond2;
4486 +setp.eq.u32 %rcond2, 1, 0;
4487
4488 @%rnotvzero bra Lskip;
4489 setp.<op>.<type> %rcond,op1,op2;
4490 +mov.pred %rcond2, %rcond;
4491 Lskip:
4492 +mov.pred %rcond, %rcond2;
4493 selp.u32 %rcondu32,1,0,%rcond;
4494 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4495 setp.ne.u32 %rcond,%rcondu32,0;
4496 */
4497 rtx_insn *label = PREV_INSN (tail);
4498 gcc_assert (label && LABEL_P (label));
4499 rtx tmp = gen_reg_rtx (BImode);
4500 emit_insn_before (gen_movbi (tmp, const0_rtx),
4501 bb_first_real_insn (from));
4502 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4503 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4504 #endif
4505 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4506 }
4507 else
4508 {
4509 /* Includes worker mode, do spill & fill. By construction
4510 we should never have worker mode only. */
4511 broadcast_data_t data;
4512 unsigned size = GET_MODE_SIZE (SImode);
4513 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4514 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4515 rtx barrier = GEN_INT (0);
4516 int threads = 0;
4517
4518 data.base = oacc_bcast_sym;
4519 data.ptr = 0;
4520
4521 bool use_partitioning_p = (vector && !worker
4522 && nvptx_mach_max_workers () > 1
4523 && cfun->machine->bcast_partition);
4524 if (use_partitioning_p)
4525 {
4526 data.base = cfun->machine->bcast_partition;
4527 barrier = cfun->machine->sync_bar;
4528 threads = nvptx_mach_vector_length ();
4529 }
4530 gcc_assert (data.base != NULL);
4531 gcc_assert (barrier);
4532
4533 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4534 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4535 ? nvptx_mach_max_workers () + 1
4536 : 1);
4537
4538 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4539 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4540
4541 data.offset = 0;
4542 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4543 vector),
4544 before);
4545
4546 /* Barrier so other workers can see the write. */
4547 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4548 data.offset = 0;
4549 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4550 vector),
4551 tail);
4552 /* This barrier is needed to avoid worker zero clobbering
4553 the broadcast buffer before all the other workers have
4554 had a chance to read this instance of it. */
4555 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4556 }
4557
4558 extract_insn (tail);
4559 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4560 UNSPEC_BR_UNIFIED);
4561 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4562 }
4563
4564 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4565 vector_label, worker_label);
4566 if (!seen_label)
4567 verify_neutering_labels (to, vector_label, worker_label);
4568 }
4569
4570 /* PAR is a parallel that is being skipped in its entirety according to
4571 MASK. Treat this as skipping a superblock starting at forked
4572 and ending at joining. */
4573
4574 static void
4575 nvptx_skip_par (unsigned mask, parallel *par)
4576 {
4577 basic_block tail = par->join_block;
4578 gcc_assert (tail->preds->length () == 1);
4579
4580 basic_block pre_tail = (*tail->preds)[0]->src;
4581 gcc_assert (pre_tail->succs->length () == 1);
4582
4583 nvptx_single (mask, par->forked_block, pre_tail);
4584 }
4585
4586 /* If PAR has a single inner parallel and PAR itself only contains
4587 empty entry and exit blocks, swallow the inner PAR. */
4588
4589 static void
4590 nvptx_optimize_inner (parallel *par)
4591 {
4592 parallel *inner = par->inner;
4593
4594 /* We mustn't be the outer dummy par. */
4595 if (!par->mask)
4596 return;
4597
4598 /* We must have a single inner par. */
4599 if (!inner || inner->next)
4600 return;
4601
4602 /* We must only contain 2 blocks ourselves -- the head and tail of
4603 the inner par. */
4604 if (par->blocks.length () != 2)
4605 return;
4606
4607   /* The two partitionings must be disjoint.  As we only have vector and
4608 worker partitioning, this is sufficient to guarantee the pars
4609 have adjacent partitioning. */
4610 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4611 /* This indicates malformed code generation. */
4612 return;
4613
4614 /* The outer forked insn should be immediately followed by the inner
4615 fork insn. */
4616 rtx_insn *forked = par->forked_insn;
4617 rtx_insn *fork = BB_END (par->forked_block);
4618
4619 if (NEXT_INSN (forked) != fork)
4620 return;
4621 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4622
4623 /* The outer joining insn must immediately follow the inner join
4624 insn. */
4625 rtx_insn *joining = par->joining_insn;
4626 rtx_insn *join = inner->join_insn;
4627 if (NEXT_INSN (join) != joining)
4628 return;
4629
4630 /* Preconditions met. Swallow the inner par. */
4631 if (dump_file)
4632 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4633 inner->mask, inner->forked_block->index,
4634 inner->join_block->index,
4635 par->mask, par->forked_block->index, par->join_block->index);
4636
4637 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4638
4639 par->blocks.reserve (inner->blocks.length ());
4640 while (inner->blocks.length ())
4641 par->blocks.quick_push (inner->blocks.pop ());
4642
4643 par->inner = inner->inner;
4644 inner->inner = NULL;
4645
4646 delete inner;
4647 }
4648
4649 /* Process the parallel PAR and all its contained
4650 parallels. We do everything but the neutering. Return mask of
4651 partitioned modes used within this parallel. */
4652
4653 static unsigned
4654 nvptx_process_pars (parallel *par)
4655 {
4656 if (nvptx_optimize)
4657 nvptx_optimize_inner (par);
4658
4659 unsigned inner_mask = par->mask;
4660
4661 /* Do the inner parallels first. */
4662 if (par->inner)
4663 {
4664 par->inner_mask = nvptx_process_pars (par->inner);
4665 inner_mask |= par->inner_mask;
4666 }
4667
4668 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4669 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4670 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4671 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4672
4673 if (worker || large_vector)
4674 {
4675 nvptx_shared_propagate (false, is_call, par->forked_block,
4676 par->forked_insn, !worker);
4677 bool no_prop_p
4678 = nvptx_shared_propagate (true, is_call, par->forked_block,
4679 par->fork_insn, !worker);
4680 bool empty_loop_p
4681 = !is_call && (NEXT_INSN (par->forked_insn)
4682 && NEXT_INSN (par->forked_insn) == par->joining_insn);
4683 rtx barrier = GEN_INT (0);
4684 int threads = 0;
4685
4686 if (!worker && cfun->machine->sync_bar)
4687 {
4688 barrier = cfun->machine->sync_bar;
4689 threads = nvptx_mach_vector_length ();
4690 }
4691
4692 if (no_prop_p && empty_loop_p)
4693 ;
4694 else if (no_prop_p && is_call)
4695 ;
4696 else
4697 {
4698 /* Insert begin and end synchronizations. */
4699 emit_insn_before (nvptx_cta_sync (barrier, threads),
4700 par->forked_insn);
4701 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4702 }
4703 }
4704 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4705 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4706
4707 /* Now do siblings. */
4708 if (par->next)
4709 inner_mask |= nvptx_process_pars (par->next);
4710 return inner_mask;
4711 }
4712
4713 /* Neuter the parallel described by PAR. We recurse in depth-first
4714 order. MODES are the partitioning of the execution and OUTER is
4715 the partitioning of the parallels we are contained in. */
4716
4717 static void
4718 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4719 {
4720 unsigned me = (par->mask
4721 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4722 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4723 unsigned skip_mask = 0, neuter_mask = 0;
4724
4725 if (par->inner)
4726 nvptx_neuter_pars (par->inner, modes, outer | me);
4727
4728 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4729 {
4730 if ((outer | me) & GOMP_DIM_MASK (mode))
4731 {} /* Mode is partitioned: no neutering. */
4732 else if (!(modes & GOMP_DIM_MASK (mode)))
4733 {} /* Mode is not used: nothing to do. */
4734 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4735 || !par->forked_insn)
4736 	/* Partitioned in inner parallels, or we're not partitioned
4737 at all: neuter individual blocks. */
4738 neuter_mask |= GOMP_DIM_MASK (mode);
4739 else if (!par->parent || !par->parent->forked_insn
4740 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4741 	/* Parent isn't a parallel, or already contains this partitioning:
4742 	   skip the parallel at this level. */
4743 skip_mask |= GOMP_DIM_MASK (mode);
4744 else
4745 {} /* Parent will skip this parallel itself. */
4746 }
4747
4748 if (neuter_mask)
4749 {
4750 int ix, len;
4751
4752 if (nvptx_optimize)
4753 {
4754 /* Neuter whole SESE regions. */
4755 bb_pair_vec_t regions;
4756
4757 nvptx_find_sese (par->blocks, regions);
4758 len = regions.length ();
4759 for (ix = 0; ix != len; ix++)
4760 {
4761 basic_block from = regions[ix].first;
4762 basic_block to = regions[ix].second;
4763
4764 if (from)
4765 nvptx_single (neuter_mask, from, to);
4766 else
4767 gcc_assert (!to);
4768 }
4769 }
4770 else
4771 {
4772 /* Neuter each BB individually. */
4773 len = par->blocks.length ();
4774 for (ix = 0; ix != len; ix++)
4775 {
4776 basic_block block = par->blocks[ix];
4777
4778 nvptx_single (neuter_mask, block, block);
4779 }
4780 }
4781 }
4782
4783 if (skip_mask)
4784 nvptx_skip_par (skip_mask, par);
4785
4786 if (par->next)
4787 nvptx_neuter_pars (par->next, modes, outer);
4788 }
4789
4790 static void
4791 populate_offload_attrs (offload_attrs *oa)
4792 {
4793 tree attr = oacc_get_fn_attrib (current_function_decl);
4794 tree dims = TREE_VALUE (attr);
4795 unsigned ix;
4796
4797 oa->mask = 0;
4798
4799 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4800 {
4801 tree t = TREE_VALUE (dims);
4802 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4803 tree allowed = TREE_PURPOSE (dims);
4804
4805 if (size != 1 && !(allowed && integer_zerop (allowed)))
4806 oa->mask |= GOMP_DIM_MASK (ix);
4807
4808 switch (ix)
4809 {
4810 case GOMP_DIM_GANG:
4811 oa->num_gangs = size;
4812 break;
4813
4814 case GOMP_DIM_WORKER:
4815 oa->num_workers = size;
4816 break;
4817
4818 case GOMP_DIM_VECTOR:
4819 oa->vector_length = size;
4820 break;
4821 }
4822 }
4823 }
4824
4825 #if WORKAROUND_PTXJIT_BUG_2
4826 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4827 is needed in the nvptx target because the branches generated for
4828    partitioning are NONJUMP_INSN_P, not JUMP_P. */
4829
4830 static rtx
4831 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4832 {
4833 rtx pat;
4834 if ((strict && !JUMP_P (insn))
4835 || (!strict && !INSN_P (insn)))
4836 return NULL_RTX;
4837 pat = PATTERN (insn);
4838
4839 /* The set is allowed to appear either as the insn pattern or
4840 the first set in a PARALLEL. */
4841 if (GET_CODE (pat) == PARALLEL)
4842 pat = XVECEXP (pat, 0, 0);
4843 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4844 return pat;
4845
4846 return NULL_RTX;
4847 }
4848
4849 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4850
4851 static rtx
4852 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4853 {
4854 rtx x = nvptx_pc_set (insn, strict);
4855
4856 if (!x)
4857 return NULL_RTX;
4858 x = SET_SRC (x);
4859 if (GET_CODE (x) == LABEL_REF)
4860 return x;
4861 if (GET_CODE (x) != IF_THEN_ELSE)
4862 return NULL_RTX;
4863 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4864 return XEXP (x, 1);
4865 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4866 return XEXP (x, 2);
4867 return NULL_RTX;
4868 }
4869
4870 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4871    insn between the branch and the label.  This works around a JIT bug
4872 observed at driver version 384.111, at -O0 for sm_50. */
4873
4874 static void
4875 prevent_branch_around_nothing (void)
4876 {
4877 rtx_insn *seen_label = NULL;
4878 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4879 {
4880 if (INSN_P (insn) && condjump_p (insn))
4881 {
4882 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4883 continue;
4884 }
4885
4886 if (seen_label == NULL)
4887 continue;
4888
4889 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4890 continue;
4891
4892 if (INSN_P (insn))
4893 switch (recog_memoized (insn))
4894 {
4895 case CODE_FOR_nvptx_fork:
4896 case CODE_FOR_nvptx_forked:
4897 case CODE_FOR_nvptx_joining:
4898 case CODE_FOR_nvptx_join:
4899 continue;
4900 default:
4901 seen_label = NULL;
4902 continue;
4903 }
4904
4905 if (LABEL_P (insn) && insn == seen_label)
4906 emit_insn_before (gen_fake_nop (), insn);
4907
4908 seen_label = NULL;
4909 }
4910 }
4911 #endif
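/* Illustration of the situation the workaround above avoids (register
   and label names invented; the exact expansion of gen_fake_nop lives
   in nvptx.md):

       @%r25   bra     $L5;
     $L5:                        // nothing between branch and label

   After prevent_branch_around_nothing a dummy insn sits in between:

       @%r25   bra     $L5;
       <fake nop>
     $L5:
*/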
4912
4913 #ifdef WORKAROUND_PTXJIT_BUG_3
4914 /* Insert two membar.cta insns between two consecutive bar.sync insns.  This
4915 works around a hang observed at driver version 390.48 for sm_50. */
4916
4917 static void
4918 workaround_barsyncs (void)
4919 {
4920 bool seen_barsync = false;
4921 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4922 {
4923 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4924 {
4925 if (seen_barsync)
4926 {
4927 emit_insn_before (gen_nvptx_membar_cta (), insn);
4928 emit_insn_before (gen_nvptx_membar_cta (), insn);
4929 }
4930
4931 seen_barsync = true;
4932 continue;
4933 }
4934
4935 if (!seen_barsync)
4936 continue;
4937
4938 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4939 continue;
4940 else if (INSN_P (insn))
4941 switch (recog_memoized (insn))
4942 {
4943 case CODE_FOR_nvptx_fork:
4944 case CODE_FOR_nvptx_forked:
4945 case CODE_FOR_nvptx_joining:
4946 case CODE_FOR_nvptx_join:
4947 continue;
4948 default:
4949 break;
4950 }
4951
4952 seen_barsync = false;
4953 }
4954 }
4955 #endif
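/* Schematic effect of the workaround above, assuming the membar_cta
   pattern emits a plain "membar.cta;": the sequence

       bar.sync   0;
       bar.sync   0;

   becomes

       bar.sync   0;
       membar.cta;
       membar.cta;
       bar.sync   0;
*/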
4956
4957 /* PTX-specific reorganization
4958 - Split blocks at fork and join instructions
4959 - Compute live registers
4960 - Mark now-unused registers, so function begin doesn't declare
4961 unused registers.
4962 - Insert state propagation when entering partitioned mode
4963 - Insert neutering instructions when in single mode
4964 - Replace subregs with suitable sequences.
4965 */
4966
4967 static void
4968 nvptx_reorg (void)
4969 {
4970 /* We are freeing block_for_insn in the toplev to keep compatibility
4971 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4972 compute_bb_for_insn ();
4973
4974 thread_prologue_and_epilogue_insns ();
4975
4976 /* Split blocks and record interesting unspecs. */
4977 bb_insn_map_t bb_insn_map;
4978
4979 nvptx_split_blocks (&bb_insn_map);
4980
4981 /* Compute live regs */
4982 df_clear_flags (DF_LR_RUN_DCE);
4983 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4984 df_live_add_problem ();
4985 df_live_set_all_dirty ();
4986 df_analyze ();
4987 regstat_init_n_sets_and_refs ();
4988
4989 if (dump_file)
4990 df_dump (dump_file);
4991
4992 /* Mark unused regs as unused. */
4993 int max_regs = max_reg_num ();
4994 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4995 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4996 regno_reg_rtx[i] = const0_rtx;
4997
4998 /* Determine launch dimensions of the function. If it is not an
4999      offloaded function (i.e. this is a regular compilation), the
5000 function has no neutering. */
5001 tree attr = oacc_get_fn_attrib (current_function_decl);
5002 if (attr)
5003 {
5004 /* If we determined this mask before RTL expansion, we could
5005 elide emission of some levels of forks and joins. */
5006 offload_attrs oa;
5007
5008 populate_offload_attrs (&oa);
5009
5010 /* If there is worker neutering, there must be vector
5011 neutering. Otherwise the hardware will fail. */
5012 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5013 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5014
5015 /* Discover & process partitioned regions. */
5016 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5017 nvptx_process_pars (pars);
5018 nvptx_neuter_pars (pars, oa.mask, 0);
5019 delete pars;
5020 }
5021
5022 /* Replace subregs. */
5023 nvptx_reorg_subreg ();
5024
5025 if (TARGET_UNIFORM_SIMT)
5026 nvptx_reorg_uniform_simt ();
5027
5028 #if WORKAROUND_PTXJIT_BUG_2
5029 prevent_branch_around_nothing ();
5030 #endif
5031
5032 #ifdef WORKAROUND_PTXJIT_BUG_3
5033 workaround_barsyncs ();
5034 #endif
5035
5036 regstat_free_n_sets_and_refs ();
5037
5038 df_finish_pass (true);
5039 }
5040 \f
5041 /* Handle a "kernel" attribute; arguments as in
5042 struct attribute_spec.handler. */
5043
5044 static tree
5045 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5046 int ARG_UNUSED (flags), bool *no_add_attrs)
5047 {
5048 tree decl = *node;
5049
5050 if (TREE_CODE (decl) != FUNCTION_DECL)
5051 {
5052 error ("%qE attribute only applies to functions", name);
5053 *no_add_attrs = true;
5054 }
5055 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5056 {
5057 error ("%qE attribute requires a void return type", name);
5058 *no_add_attrs = true;
5059 }
5060
5061 return NULL_TREE;
5062 }
5063
5064 /* Handle a "shared" attribute; arguments as in
5065 struct attribute_spec.handler. */
5066
5067 static tree
5068 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5069 int ARG_UNUSED (flags), bool *no_add_attrs)
5070 {
5071 tree decl = *node;
5072
5073 if (TREE_CODE (decl) != VAR_DECL)
5074 {
5075 error ("%qE attribute only applies to variables", name);
5076 *no_add_attrs = true;
5077 }
5078 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5079 {
5080 error ("%qE attribute not allowed with auto storage class", name);
5081 *no_add_attrs = true;
5082 }
5083
5084 return NULL_TREE;
5085 }
5086
5087 /* Table of valid machine attributes. */
5088 static const struct attribute_spec nvptx_attribute_table[] =
5089 {
5090 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5091 affects_type_identity, handler, exclude } */
5092 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5093 NULL },
5094 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5095 NULL },
5096 { NULL, 0, 0, false, false, false, false, NULL, NULL }
5097 };
5098 \f
5099 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5100
5101 static HOST_WIDE_INT
5102 nvptx_vector_alignment (const_tree type)
5103 {
5104 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5105
5106 return MIN (align, BIGGEST_ALIGNMENT);
5107 }
5108
5109 /* Indicate that INSN cannot be duplicated. */
5110
5111 static bool
5112 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5113 {
5114 switch (recog_memoized (insn))
5115 {
5116 case CODE_FOR_nvptx_shufflesi:
5117 case CODE_FOR_nvptx_shufflesf:
5118 case CODE_FOR_nvptx_barsync:
5119 case CODE_FOR_nvptx_fork:
5120 case CODE_FOR_nvptx_forked:
5121 case CODE_FOR_nvptx_joining:
5122 case CODE_FOR_nvptx_join:
5123 return true;
5124 default:
5125 return false;
5126 }
5127 }
5128
5129 /* Section anchors do not work. Initialization for flag_section_anchor
5130 probes the existence of the anchoring target hooks and prevents
5131    anchoring if they don't exist.  However, we may be used with
5132 a host-side compiler that does support anchoring, and hence see
5133 the anchor flag set (as it's not recalculated). So provide an
5134 implementation denying anchoring. */
5135
5136 static bool
5137 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5138 {
5139 return false;
5140 }
5141 \f
5142 /* Record a symbol for mkoffload to enter into the mapping table. */
5143
5144 static void
5145 nvptx_record_offload_symbol (tree decl)
5146 {
5147 switch (TREE_CODE (decl))
5148 {
5149 case VAR_DECL:
5150 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5151 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5152 break;
5153
5154 case FUNCTION_DECL:
5155 {
5156 tree attr = oacc_get_fn_attrib (decl);
5157 /* OpenMP offloading does not set this attribute. */
5158 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5159
5160 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5161 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5162
5163 for (; dims; dims = TREE_CHAIN (dims))
5164 {
5165 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5166
5167 gcc_assert (!TREE_PURPOSE (dims));
5168 fprintf (asm_out_file, ", %#x", size);
5169 }
5170
5171 fprintf (asm_out_file, "\n");
5172 }
5173 break;
5174
5175 default:
5176 gcc_unreachable ();
5177 }
5178 }
5179
5180 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5181 at the start of a file. */
5182
5183 static void
5184 nvptx_file_start (void)
5185 {
5186 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5187 fputs ("\t.version\t3.1\n", asm_out_file);
5188 if (TARGET_SM35)
5189 fputs ("\t.target\tsm_35\n", asm_out_file);
5190 else
5191 fputs ("\t.target\tsm_30\n", asm_out_file);
5192 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5193 fputs ("// END PREAMBLE\n", asm_out_file);
5194 }
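/* For example, with TARGET_SM35 and a 64-bit Pmode the preamble
   emitted above reads:

     // BEGIN PREAMBLE
             .version        3.1
             .target sm_35
             .address_size 64
     // END PREAMBLE
*/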
5195
5196 /* Emit a declaration for a worker and vector-level buffer in .shared
5197 memory. */
5198
5199 static void
5200 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5201 {
5202 const char *name = XSTR (sym, 0);
5203
5204 write_var_marker (file, true, false, name);
5205 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5206 align, name, size);
5207 }
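/* E.g. a 128-byte buffer with 8-byte alignment yields a line of the
   form below (the marker comment from write_var_marker precedes it;
   the symbol name here is made up):

     .shared .align 8 .u8 __oacc_bcast[128];
*/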
5208
5209 /* Write out the function declarations we've collected and declare storage
5210 for the broadcast buffer. */
5211
5212 static void
5213 nvptx_file_end (void)
5214 {
5215 hash_table<tree_hasher>::iterator iter;
5216 tree decl;
5217 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5218 nvptx_record_fndecl (decl);
5219 fputs (func_decls.str().c_str(), asm_out_file);
5220
5221 if (oacc_bcast_size)
5222 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5223 oacc_bcast_align, oacc_bcast_size);
5224
5225 if (worker_red_size)
5226 write_shared_buffer (asm_out_file, worker_red_sym,
5227 worker_red_align, worker_red_size);
5228
5229 if (vector_red_size)
5230 write_shared_buffer (asm_out_file, vector_red_sym,
5231 vector_red_align, vector_red_size);
5232
5233 if (need_softstack_decl)
5234 {
5235 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5236 /* 32 is the maximum number of warps in a block. Even though it's an
5237 external declaration, emit the array size explicitly; otherwise, it
5238 may fail at PTX JIT time if the definition is later in link order. */
5239 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5240 POINTER_SIZE);
5241 }
5242 if (need_unisimt_decl)
5243 {
5244 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5245 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5246 }
5247 }
5248
5249 /* Expander for the shuffle builtins. */
5250
5251 static rtx
5252 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5253 {
5254 if (ignore)
5255 return target;
5256
5257 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5258 NULL_RTX, mode, EXPAND_NORMAL);
5259 if (!REG_P (src))
5260 src = copy_to_mode_reg (mode, src);
5261
5262 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5263 NULL_RTX, SImode, EXPAND_NORMAL);
5264 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5265 NULL_RTX, SImode, EXPAND_NORMAL);
5266
5267 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5268 idx = copy_to_mode_reg (SImode, idx);
5269
5270 rtx pat = nvptx_gen_shuffle (target, src, idx,
5271 (nvptx_shuffle_kind) INTVAL (op));
5272 if (pat)
5273 emit_insn (pat);
5274
5275 return target;
5276 }
5277
5278 const char *
5279 nvptx_output_red_partition (rtx dst, rtx offset)
5280 {
5281 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5282 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5283
5284 if (offset == const0_rtx)
5285 fprintf (asm_out_file, zero_offset, REGNO (dst),
5286 REGNO (cfun->machine->red_partition));
5287 else
5288 fprintf (asm_out_file, with_offset, REGNO (dst),
5289 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5290
5291 return "";
5292 }
5293
5294 /* Shared-memory reduction address expander. */
5295
5296 static rtx
5297 nvptx_expand_shared_addr (tree exp, rtx target,
5298 machine_mode ARG_UNUSED (mode), int ignore,
5299 int vector)
5300 {
5301 if (ignore)
5302 return target;
5303
5304 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5305 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5306 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5307 rtx addr = worker_red_sym;
5308
5309 if (vector)
5310 {
5311 offload_attrs oa;
5312
5313 populate_offload_attrs (&oa);
5314
5315 unsigned int psize = ROUND_UP (size + offset, align);
5316 unsigned int pnum = nvptx_mach_max_workers ();
5317 vector_red_partition = MAX (vector_red_partition, psize);
5318 vector_red_size = MAX (vector_red_size, psize * pnum);
5319 vector_red_align = MAX (vector_red_align, align);
5320
5321 if (cfun->machine->red_partition == NULL)
5322 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5323
5324 addr = gen_reg_rtx (Pmode);
5325 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5326 }
5327 else
5328 {
5329 worker_red_align = MAX (worker_red_align, align);
5330 worker_red_size = MAX (worker_red_size, size + offset);
5331
5332 if (offset)
5333 {
5334 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5335 addr = gen_rtx_CONST (Pmode, addr);
5336 }
5337 }
5338
5339 emit_move_insn (target, addr);
5340 return target;
5341 }
5342
5343 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5344 not require taking the address of any object, other than the memory
5345 cell being operated on. */
5346
5347 static rtx
5348 nvptx_expand_cmp_swap (tree exp, rtx target,
5349 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5350 {
5351 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5352
5353 if (!target)
5354 target = gen_reg_rtx (mode);
5355
5356 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5357 NULL_RTX, Pmode, EXPAND_NORMAL);
5358 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5359 NULL_RTX, mode, EXPAND_NORMAL);
5360 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5361 NULL_RTX, mode, EXPAND_NORMAL);
5362 rtx pat;
5363
5364 mem = gen_rtx_MEM (mode, mem);
5365 if (!REG_P (cmp))
5366 cmp = copy_to_mode_reg (mode, cmp);
5367 if (!REG_P (src))
5368 src = copy_to_mode_reg (mode, src);
5369
5370 if (mode == SImode)
5371 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5372 else
5373 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5374
5375 emit_insn (pat);
5376
5377 return target;
5378 }
5379
5380
5381 /* Codes for all the NVPTX builtins. */
5382 enum nvptx_builtins
5383 {
5384 NVPTX_BUILTIN_SHUFFLE,
5385 NVPTX_BUILTIN_SHUFFLELL,
5386 NVPTX_BUILTIN_WORKER_ADDR,
5387 NVPTX_BUILTIN_VECTOR_ADDR,
5388 NVPTX_BUILTIN_CMP_SWAP,
5389 NVPTX_BUILTIN_CMP_SWAPLL,
5390 NVPTX_BUILTIN_MAX
5391 };
5392
5393 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5394
5395 /* Return the NVPTX builtin for CODE. */
5396
5397 static tree
5398 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5399 {
5400 if (code >= NVPTX_BUILTIN_MAX)
5401 return error_mark_node;
5402
5403 return nvptx_builtin_decls[code];
5404 }
5405
5406 /* Set up all builtin functions for this target. */
5407
5408 static void
5409 nvptx_init_builtins (void)
5410 {
5411 #define DEF(ID, NAME, T) \
5412 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5413 = add_builtin_function ("__builtin_nvptx_" NAME, \
5414 build_function_type_list T, \
5415 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5416 #define ST sizetype
5417 #define UINT unsigned_type_node
5418 #define LLUINT long_long_unsigned_type_node
5419 #define PTRVOID ptr_type_node
5420
5421 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5422 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5423 DEF (WORKER_ADDR, "worker_addr",
5424 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5425 DEF (VECTOR_ADDR, "vector_addr",
5426 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5427 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5428 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5429
5430 #undef DEF
5431 #undef ST
5432 #undef UINT
5433 #undef LLUINT
5434 #undef PTRVOID
5435 }
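/* These builtins are normally emitted internally by the OpenACC/OpenMP
   transforms, but for illustration a warp-level broadcast of a value
   from lane 0 could be spelled directly in nvptx target code as below.
   The literal 3 assumes the current position of SHUFFLE_IDX in
   nvptx_shuffle_kind:

     unsigned v = ...;
     v = __builtin_nvptx_shuffle (v, 0, 3);
*/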
5436
5437 /* Expand an expression EXP that calls a built-in function,
5438 with result going to TARGET if that's convenient
5439 (and in mode MODE if that's convenient).
5440 SUBTARGET may be used as the target for computing one of EXP's operands.
5441 IGNORE is nonzero if the value is to be ignored. */
5442
5443 static rtx
5444 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5445 machine_mode mode, int ignore)
5446 {
5447 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5448 switch (DECL_FUNCTION_CODE (fndecl))
5449 {
5450 case NVPTX_BUILTIN_SHUFFLE:
5451 case NVPTX_BUILTIN_SHUFFLELL:
5452 return nvptx_expand_shuffle (exp, target, mode, ignore);
5453
5454 case NVPTX_BUILTIN_WORKER_ADDR:
5455 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5456
5457 case NVPTX_BUILTIN_VECTOR_ADDR:
5458 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5459
5460 case NVPTX_BUILTIN_CMP_SWAP:
5461 case NVPTX_BUILTIN_CMP_SWAPLL:
5462 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5463
5464 default: gcc_unreachable ();
5465 }
5466 }
5467
5468 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5469
5470 static int
5471 nvptx_simt_vf ()
5472 {
5473 return PTX_WARP_SIZE;
5474 }
5475
5476 static bool
5477 nvptx_welformed_vector_length_p (int l)
5478 {
5479 gcc_assert (l > 0);
5480 return l % PTX_WARP_SIZE == 0;
5481 }
5482
5483 static void
5484 nvptx_apply_dim_limits (int dims[])
5485 {
5486 /* Check that the vector_length is not too large. */
5487 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5488 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5489
5490 /* Check that the number of workers is not too large. */
5491 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5492 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5493
5494   /* Ensure that num_workers * vector_length <= cta size. */
5495 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5496 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5497 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5498 }
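/* Worked example, using the limits defined at the top of this file
   (vector_length <= 32, num_workers <= 32, CTA size 1024): a request
   of num_workers (64) vector_length (128) is first clamped to
   32 x 32 = 1024 threads, which exactly fits a CTA; only if the
   product still exceeded PTX_CTA_SIZE would vector_length be forced
   back to PTX_WARP_SIZE.  */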
5499
5500 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5501
5502 static bool
5503 has_vector_partitionable_routine_calls_p (tree fndecl)
5504 {
5505 if (!fndecl)
5506 return false;
5507
5508 basic_block bb;
5509 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5510 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5511 gsi_next_nondebug (&i))
5512 {
5513 gimple *stmt = gsi_stmt (i);
5514 if (gimple_code (stmt) != GIMPLE_CALL)
5515 continue;
5516
5517 tree callee = gimple_call_fndecl (stmt);
5518 if (!callee)
5519 continue;
5520
5521 tree attrs = oacc_get_fn_attrib (callee);
5522 if (attrs == NULL_TREE)
5523 return false;
5524
5525 int partition_level = oacc_fn_attrib_level (attrs);
5526 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5527 if (!seq_routine_p)
5528 return true;
5529 }
5530
5531 return false;
5532 }
5533
5534 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5535 DIMS has changed. */
5536
5537 static void
5538 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level)
5539 {
5540 bool oacc_default_dims_p = false;
5541 bool oacc_min_dims_p = false;
5542 bool offload_region_p = false;
5543 bool routine_p = false;
5544 bool routine_seq_p = false;
5545
5546 if (decl == NULL_TREE)
5547 {
5548 if (fn_level == -1)
5549 oacc_default_dims_p = true;
5550 else if (fn_level == -2)
5551 oacc_min_dims_p = true;
5552 else
5553 gcc_unreachable ();
5554 }
5555 else if (fn_level == -1)
5556 offload_region_p = true;
5557 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5558 {
5559 routine_p = true;
5560 routine_seq_p = fn_level == GOMP_DIM_MAX;
5561 }
5562 else
5563 gcc_unreachable ();
5564
5565 if (routine_p)
5566 {
5567 /* OpenACC routines in C arrive here with the following attributes
5568 (omitting the 'omp declare target'):
5569 seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
5570 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
5571 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
5572 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5573
5574 If we take f.i. the oacc function attribute of the worker routine
5575 (0 1, 1 0, 1 0), then:
5576 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
5577 meaning: worker routine, that is:
5578 - can't contain gang loop (0),
5579 - can contain worker loop (1),
5580 - can contain vector loop (1).
5581 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
5582 dimensions: gang: 1, worker: 0, vector: 0.
5583
5584 OTOH, routines in Fortran arrive here with these attributes:
5585 seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
5586 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
5587 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
5588 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5589 that is, the same as for C but with the dimensions set to 0.
5590
5591 This is due to a bug in the Fortran front-end: PR72741. Work around
5592 this bug by forcing the dimensions to be the same in Fortran as for C,
5593 to be able to handle C and Fortran routines uniformly in this
5594 function. */
5595 dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
5596 dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
5597 dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
5598 }
5599
5600 if (oacc_min_dims_p)
5601 {
5602 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5603 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5604 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5605
5606 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5607 return;
5608 }
5609
5610 if (routine_p)
5611 {
5612 if (!routine_seq_p)
5613 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5614
5615 return;
5616 }
5617
5618 if (oacc_default_dims_p)
5619 {
5620 /* -1 : not set
5621 0 : set at runtime, f.i. -fopenacc-dims=-
5622 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5623 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5624 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5625 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5626
5627 /* But -fopenacc-dims=- is not yet supported on trunk. */
5628 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5629 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5630 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5631 }
5632
5633 if (offload_region_p)
5634 {
5635 /* -1 : not set
5636 0 : set using variable, f.i. num_gangs (n)
5637 >= 1: set using constant, f.i. num_gangs (1). */
5638 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5639 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5640 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5641 }
5642
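/* Remember the dimensions as given, so that any adjustments made below
   can be diagnosed.  */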
5643 int old_dims[GOMP_DIM_MAX];
5644 unsigned int i;
5645 for (i = 0; i < GOMP_DIM_MAX; ++i)
5646 old_dims[i] = dims[i];
5647
5648 const char *vector_reason = NULL;
5649 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5650 {
5651 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5652 {
5653 vector_reason = G_("using vector_length (%d) due to call to"
5654 " vector-partitionable routine, ignoring %d");
5655 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5656 }
5657 }
5658
5659 if (dims[GOMP_DIM_VECTOR] == 0)
5660 {
5661 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5662 dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
5663 }
5664
5665 if (dims[GOMP_DIM_VECTOR] > 0
5666 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5667 dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
5668
5669 nvptx_apply_dim_limits (dims);
5670
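/* Warn about any dimensions that were overridden above.  */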
5671 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5672 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5673 vector_reason != NULL
5674 ? vector_reason
5675 : G_("using vector_length (%d), ignoring %d"),
5676 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5677
5678 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5679 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5680 G_("using num_workers (%d), ignoring %d"),
5681 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5682
5683 if (oacc_default_dims_p)
5684 {
5685 dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
5686 if (dims[GOMP_DIM_WORKER] < 0)
5687 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5688 if (dims[GOMP_DIM_GANG] < 0)
5689 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5690 nvptx_apply_dim_limits (dims);
5691 }
5692 }
5693
5694 /* Validate compute dimensions of an OpenACC offload or routine, fill
5695 in non-unity defaults. FN_LEVEL indicates the level at which a
5696 routine might spawn a loop. It is negative for non-routines. If
5697 DECL is null, we are validating the default dimensions. */
5698
5699 static bool
5700 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5701 {
5702 int old_dims[GOMP_DIM_MAX];
5703 unsigned int i;
5704
5705 for (i = 0; i < GOMP_DIM_MAX; ++i)
5706 old_dims[i] = dims[i];
5707
5708 nvptx_goacc_validate_dims_1 (decl, dims, fn_level);
5709
5710 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5711 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5712 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5713
5714 for (i = 0; i < GOMP_DIM_MAX; ++i)
5715 if (old_dims[i] != dims[i])
5716 return true;
5717
5718 return false;
5719 }
5720
5721 /* Return maximum dimension size, or zero for unbounded. */
5722
5723 static int
5724 nvptx_dim_limit (int axis)
5725 {
5726 switch (axis)
5727 {
5728 case GOMP_DIM_VECTOR:
5729 return PTX_MAX_VECTOR_LENGTH;
5730
5731 default:
5732 break;
5733 }
5734 return 0;
5735 }
5736
5737 /* Determine whether fork & joins are needed. */
5738
5739 static bool
5740 nvptx_goacc_fork_join (gcall *call, const int dims[],
5741 bool ARG_UNUSED (is_fork))
5742 {
5743 tree arg = gimple_call_arg (call, 2);
5744 unsigned axis = TREE_INT_CST_LOW (arg);
5745
5746 /* We only care about worker and vector partitioning. */
5747 if (axis < GOMP_DIM_WORKER)
5748 return false;
5749
5750 /* If the size is 1, there's no partitioning. */
5751 if (dims[axis] == 1)
5752 return false;
5753
5754 return true;
5755 }
5756
5757 /* Generate a PTX builtin function call that returns the address in
5758 the worker (or vector, if VECTOR is true) reduction buffer at OFFSET.
5759 TYPE is the type of the data at that location. */
5760
5761 static tree
5762 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5763 {
5764 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5765 if (vector)
5766 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5767 machine_mode mode = TYPE_MODE (type);
5768 tree fndecl = nvptx_builtin_decl (addr_dim, true);
5769 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5770 tree align = build_int_cst (unsigned_type_node,
5771 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5772 tree call = build_call_expr (fndecl, 3, offset, size, align);
5773
5774 return fold_convert (build_pointer_type (type), call);
5775 }
5776
5777 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR. This function
5778 will cast the variable if necessary. */
5779
5780 static void
5781 nvptx_generate_vector_shuffle (location_t loc,
5782 tree dest_var, tree var, unsigned shift,
5783 gimple_seq *seq)
5784 {
5785 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5786 tree_code code = NOP_EXPR;
5787 tree arg_type = unsigned_type_node;
5788 tree var_type = TREE_TYPE (var);
5789 tree dest_type = var_type;
5790
5791 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5792 var_type = TREE_TYPE (var_type);
5793
5794 if (TREE_CODE (var_type) == REAL_TYPE)
5795 code = VIEW_CONVERT_EXPR;
5796
5797 if (TYPE_SIZE (var_type)
5798 == TYPE_SIZE (long_long_unsigned_type_node))
5799 {
5800 fn = NVPTX_BUILTIN_SHUFFLELL;
5801 arg_type = long_long_unsigned_type_node;
5802 }
5803
5804 tree call = nvptx_builtin_decl (fn, true);
5805 tree bits = build_int_cst (unsigned_type_node, shift);
5806 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5807 tree expr;
5808
5809 if (var_type != dest_type)
5810 {
5811 /* Do real and imaginary parts separately. */
5812 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5813 real = fold_build1 (code, arg_type, real);
5814 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5815 real = fold_build1 (code, var_type, real);
5816
5817 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5818 imag = fold_build1 (code, arg_type, imag);
5819 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5820 imag = fold_build1 (code, var_type, imag);
5821
5822 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5823 }
5824 else
5825 {
5826 expr = fold_build1 (code, arg_type, var);
5827 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5828 expr = fold_build1 (code, dest_type, expr);
5829 }
5830
5831 gimplify_assign (dest_var, expr, seq);
5832 }
5833
5834 /* Lazily generate the global lock var decl and return its address. */
5835
5836 static tree
5837 nvptx_global_lock_addr ()
5838 {
5839 tree v = global_lock_var;
5840
5841 if (!v)
5842 {
5843 tree name = get_identifier ("__reduction_lock");
5844 tree type = build_qualified_type (unsigned_type_node,
5845 TYPE_QUAL_VOLATILE);
5846 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5847 global_lock_var = v;
5848 DECL_ARTIFICIAL (v) = 1;
5849 DECL_EXTERNAL (v) = 1;
5850 TREE_STATIC (v) = 1;
5851 TREE_PUBLIC (v) = 1;
5852 TREE_USED (v) = 1;
5853 mark_addressable (v);
5854 mark_decl_referenced (v);
5855 }
5856
5857 return build_fold_addr_expr (v);
5858 }
5859
5860 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5861 GSI. We use a lockless scheme for nearly all cases, which looks
5862 like:
5863 actual = initval(OP);
5864 do {
5865 guess = actual;
5866 write = guess OP myval;
5867 actual = cmp&swap (ptr, guess, write)
5868 } while (actual bit-different-to guess);
5869 return write;
5870
5871 This relies on a cmp&swap instruction, which is available for 32-
5872 and 64-bit types. Larger types must use a locking scheme. */
5873
5874 static tree
5875 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5876 tree ptr, tree var, tree_code op)
5877 {
5878 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5879 tree_code code = NOP_EXPR;
5880 tree arg_type = unsigned_type_node;
5881 tree var_type = TREE_TYPE (var);
5882
5883 if (TREE_CODE (var_type) == COMPLEX_TYPE
5884 || TREE_CODE (var_type) == REAL_TYPE)
5885 code = VIEW_CONVERT_EXPR;
5886
5887 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5888 {
5889 arg_type = long_long_unsigned_type_node;
5890 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5891 }
5892
5893 tree swap_fn = nvptx_builtin_decl (fn, true);
5894
5895 gimple_seq init_seq = NULL;
5896 tree init_var = make_ssa_name (arg_type);
5897 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5898 init_expr = fold_build1 (code, arg_type, init_expr);
5899 gimplify_assign (init_var, init_expr, &init_seq);
5900 gimple *init_end = gimple_seq_last (init_seq);
5901
5902 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5903
5904 /* Split the block just after the init stmts. */
5905 basic_block pre_bb = gsi_bb (*gsi);
5906 edge pre_edge = split_block (pre_bb, init_end);
5907 basic_block loop_bb = pre_edge->dest;
5908 pre_bb = pre_edge->src;
5909 /* Reset the iterator. */
5910 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5911
5912 tree expect_var = make_ssa_name (arg_type);
5913 tree actual_var = make_ssa_name (arg_type);
5914 tree write_var = make_ssa_name (arg_type);
5915
5916 /* Build and insert the reduction calculation. */
5917 gimple_seq red_seq = NULL;
5918 tree write_expr = fold_build1 (code, var_type, expect_var);
5919 write_expr = fold_build2 (op, var_type, write_expr, var);
5920 write_expr = fold_build1 (code, arg_type, write_expr);
5921 gimplify_assign (write_var, write_expr, &red_seq);
5922
5923 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5924
5925 /* Build & insert the cmp&swap sequence. */
5926 gimple_seq latch_seq = NULL;
5927 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5928 ptr, expect_var, write_var);
5929 gimplify_assign (actual_var, swap_expr, &latch_seq);
5930
5931 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5932 NULL_TREE, NULL_TREE);
5933 gimple_seq_add_stmt (&latch_seq, cond);
5934
5935 gimple *latch_end = gimple_seq_last (latch_seq);
5936 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5937
5938 /* Split the block just after the latch stmts. */
5939 edge post_edge = split_block (loop_bb, latch_end);
5940 basic_block post_bb = post_edge->dest;
5941 loop_bb = post_edge->src;
5942 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5943
5944 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5945 post_edge->probability = profile_probability::even ();
5946 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5947 loop_edge->probability = profile_probability::even ();
5948 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5949 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5950
5951 gphi *phi = create_phi_node (expect_var, loop_bb);
5952 add_phi_arg (phi, init_var, pre_edge, loc);
5953 add_phi_arg (phi, actual_var, loop_edge, loc);
5954
5955 loop *loop = alloc_loop ();
5956 loop->header = loop_bb;
5957 loop->latch = loop_bb;
5958 add_loop (loop, loop_bb->loop_father);
5959
5960 return fold_build1 (code, var_type, write_var);
5961 }
5962
5963 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5964 GSI. This is necessary for types larger than 64 bits, where there
5965 is no cmp&swap instruction to implement a lockless scheme. We use
5966 a lock variable in global memory.
5967
5968 while (cmp&swap (&lock_var, 0, 1))
5969 continue;
5970 T accum = *ptr;
5971 accum = accum OP var;
5972 *ptr = accum;
5973 cmp&swap (&lock_var, 1, 0);
5974 return accum;
5975
5976 A lock in global memory is necessary to force execution engine
5977 descheduling and avoid resource starvation that can occur if the
5978 lock is in .shared memory. */
5979
5980 static tree
5981 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5982 tree ptr, tree var, tree_code op)
5983 {
5984 tree var_type = TREE_TYPE (var);
5985 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5986 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5987 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5988
5989 /* Split the block just before the gsi. Insert a gimple nop to make
5990 this easier. */
5991 gimple *nop = gimple_build_nop ();
5992 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5993 basic_block entry_bb = gsi_bb (*gsi);
5994 edge entry_edge = split_block (entry_bb, nop);
5995 basic_block lock_bb = entry_edge->dest;
5996 /* Reset the iterator. */
5997 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5998
5999 /* Build and insert the locking sequence. */
6000 gimple_seq lock_seq = NULL;
6001 tree lock_var = make_ssa_name (unsigned_type_node);
6002 tree lock_expr = nvptx_global_lock_addr ();
6003 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6004 uns_unlocked, uns_locked);
6005 gimplify_assign (lock_var, lock_expr, &lock_seq);
6006 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6007 NULL_TREE, NULL_TREE);
6008 gimple_seq_add_stmt (&lock_seq, cond);
6009 gimple *lock_end = gimple_seq_last (lock_seq);
6010 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6011
6012 /* Split the block just after the lock sequence. */
6013 edge locked_edge = split_block (lock_bb, lock_end);
6014 basic_block update_bb = locked_edge->dest;
6015 lock_bb = locked_edge->src;
6016 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6017
6018 /* Create the lock loop ... */
6019 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6020 locked_edge->probability = profile_probability::even ();
6021 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6022 loop_edge->probability = profile_probability::even ();
6023 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6024 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6025
6026 /* ... and the loop structure. */
6027 loop *lock_loop = alloc_loop ();
6028 lock_loop->header = lock_bb;
6029 lock_loop->latch = lock_bb;
6030 lock_loop->nb_iterations_estimate = 1;
6031 lock_loop->any_estimate = true;
6032 add_loop (lock_loop, entry_bb->loop_father);
6033
6034 /* Build and insert the reduction calculation. */
6035 gimple_seq red_seq = NULL;
6036 tree acc_in = make_ssa_name (var_type);
6037 tree ref_in = build_simple_mem_ref (ptr);
6038 TREE_THIS_VOLATILE (ref_in) = 1;
6039 gimplify_assign (acc_in, ref_in, &red_seq);
6040
6041 tree acc_out = make_ssa_name (var_type);
6042 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6043 gimplify_assign (acc_out, update_expr, &red_seq);
6044
6045 tree ref_out = build_simple_mem_ref (ptr);
6046 TREE_THIS_VOLATILE (ref_out) = 1;
6047 gimplify_assign (ref_out, acc_out, &red_seq);
6048
6049 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6050
6051 /* Build & insert the unlock sequence. */
6052 gimple_seq unlock_seq = NULL;
6053 tree unlock_expr = nvptx_global_lock_addr ();
6054 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6055 uns_locked, uns_unlocked);
6056 gimplify_and_add (unlock_expr, &unlock_seq);
6057 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6058
6059 return acc_out;
6060 }
6061
6062 /* Emit a sequence to update a reduction accumulator at *PTR with the
6063 value held in VAR using operator OP. Return the updated value.
6064
6065 TODO: optimize for atomic ops and independent complex ops. */
6066
6067 static tree
6068 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6069 tree ptr, tree var, tree_code op)
6070 {
6071 tree type = TREE_TYPE (var);
6072 tree size = TYPE_SIZE (type);
6073
6074 if (size == TYPE_SIZE (unsigned_type_node)
6075 || size == TYPE_SIZE (long_long_unsigned_type_node))
6076 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6077 else
6078 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6079 }
6080
6081 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6082
6083 static void
6084 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6085 {
6086 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6087 tree lhs = gimple_call_lhs (call);
6088 tree var = gimple_call_arg (call, 2);
6089 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6090 gimple_seq seq = NULL;
6091
6092 push_gimplify_context (true);
6093
6094 if (level != GOMP_DIM_GANG)
6095 {
6096 /* Copy the receiver object. */
6097 tree ref_to_res = gimple_call_arg (call, 1);
6098
6099 if (!integer_zerop (ref_to_res))
6100 var = build_simple_mem_ref (ref_to_res);
6101 }
6102
6103 if (level == GOMP_DIM_WORKER
6104 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6105 {
6106 /* Store incoming value to worker reduction buffer. */
6107 tree offset = gimple_call_arg (call, 5);
6108 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6109 level == GOMP_DIM_VECTOR);
6110 tree ptr = make_ssa_name (TREE_TYPE (call));
6111
6112 gimplify_assign (ptr, call, &seq);
6113 tree ref = build_simple_mem_ref (ptr);
6114 TREE_THIS_VOLATILE (ref) = 1;
6115 gimplify_assign (ref, var, &seq);
6116 }
6117
6118 if (lhs)
6119 gimplify_assign (lhs, var, &seq);
6120
6121 pop_gimplify_context (NULL);
6122 gsi_replace_with_seq (&gsi, seq, true);
6123 }
6124
6125 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6126
6127 static void
6128 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6129 {
6130 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6131 tree lhs = gimple_call_lhs (call);
6132 tree var = gimple_call_arg (call, 2);
6133 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6134 enum tree_code rcode
6135 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6136 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6137 TREE_TYPE (var));
6138 gimple_seq seq = NULL;
6139
6140 push_gimplify_context (true);
6141
6142 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6143 {
6144 /* Initialize threads with a non-zero vector position to INIT_VAL (OP). */
6145 tree tid = make_ssa_name (integer_type_node);
6146 tree dim_vector = gimple_call_arg (call, 3);
6147 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6148 dim_vector);
6149 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6150 NULL_TREE, NULL_TREE);
6151
6152 gimple_call_set_lhs (tid_call, tid);
6153 gimple_seq_add_stmt (&seq, tid_call);
6154 gimple_seq_add_stmt (&seq, cond_stmt);
6155
6156 /* Split the block just after the call. */
6157 edge init_edge = split_block (gsi_bb (gsi), call);
6158 basic_block init_bb = init_edge->dest;
6159 basic_block call_bb = init_edge->src;
6160
6161 /* Fixup flags from call_bb to init_bb. */
6162 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6163 init_edge->probability = profile_probability::even ();
6164
6165 /* Set the initialization stmts. */
6166 gimple_seq init_seq = NULL;
6167 tree init_var = make_ssa_name (TREE_TYPE (var));
6168 gimplify_assign (init_var, init, &init_seq);
6169 gsi = gsi_start_bb (init_bb);
6170 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6171
6172 /* Split block just after the init stmt. */
6173 gsi_prev (&gsi);
6174 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6175 basic_block dst_bb = inited_edge->dest;
6176
6177 /* Create false edge from call_bb to dst_bb. */
6178 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6179 nop_edge->probability = profile_probability::even ();
6180
6181 /* Create phi node in dst block. */
6182 gphi *phi = create_phi_node (lhs, dst_bb);
6183 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6184 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6185
6186 /* Reset dominator of dst bb. */
6187 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6188
6189 /* Reset the gsi. */
6190 gsi = gsi_for_stmt (call);
6191 }
6192 else
6193 {
6194 if (level == GOMP_DIM_GANG)
6195 {
6196 /* If there's no receiver object, propagate the incoming VAR. */
6197 tree ref_to_res = gimple_call_arg (call, 1);
6198 if (integer_zerop (ref_to_res))
6199 init = var;
6200 }
6201
6202 gimplify_assign (lhs, init, &seq);
6203 }
6204
6205 pop_gimplify_context (NULL);
6206 gsi_replace_with_seq (&gsi, seq, true);
6207 }
6208
6209 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
6210
6211 static void
6212 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6213 {
6214 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6215 tree lhs = gimple_call_lhs (call);
6216 tree ref_to_res = gimple_call_arg (call, 1);
6217 tree var = gimple_call_arg (call, 2);
6218 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6219 enum tree_code op
6220 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6221 gimple_seq seq = NULL;
6222 tree r = NULL_TREE;
6223
6224 push_gimplify_context (true);
6225
6226 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6227 {
6228 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6229 but that requires a method of emitting a unified jump at the
6230 gimple level. */
6231 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6232 {
6233 tree other_var = make_ssa_name (TREE_TYPE (var));
6234 nvptx_generate_vector_shuffle (gimple_location (call),
6235 other_var, var, shfl, &seq);
6236
6237 r = make_ssa_name (TREE_TYPE (var));
6238 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6239 var, other_var), &seq);
6240 var = r;
6241 }
6242 }
6243 else
6244 {
6245 tree accum = NULL_TREE;
6246
6247 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6248 {
6249 /* Get reduction buffer address. */
6250 tree offset = gimple_call_arg (call, 5);
6251 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6252 level == GOMP_DIM_VECTOR);
6253 tree ptr = make_ssa_name (TREE_TYPE (call));
6254
6255 gimplify_assign (ptr, call, &seq);
6256 accum = ptr;
6257 }
6258 else if (integer_zerop (ref_to_res))
6259 r = var;
6260 else
6261 accum = ref_to_res;
6262
6263 if (accum)
6264 {
6265 /* UPDATE the accumulator. */
6266 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6267 seq = NULL;
6268 r = nvptx_reduction_update (gimple_location (call), &gsi,
6269 accum, var, op);
6270 }
6271 }
6272
6273 if (lhs)
6274 gimplify_assign (lhs, r, &seq);
6275 pop_gimplify_context (NULL);
6276
6277 gsi_replace_with_seq (&gsi, seq, true);
6278 }
6279
6280 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6281
6282 static void
6283 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6284 {
6285 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6286 tree lhs = gimple_call_lhs (call);
6287 tree var = gimple_call_arg (call, 2);
6288 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6289 gimple_seq seq = NULL;
6290
6291 push_gimplify_context (true);
6292 if (level == GOMP_DIM_WORKER
6293 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6294 {
6295 /* Read the worker reduction buffer. */
6296 tree offset = gimple_call_arg (call, 5);
6297 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6298 level == GOMP_DIM_VECTOR);
6299 tree ptr = make_ssa_name (TREE_TYPE (call));
6300
6301 gimplify_assign (ptr, call, &seq);
6302 var = build_simple_mem_ref (ptr);
6303 TREE_THIS_VOLATILE (var) = 1;
6304 }
6305
6306 if (level != GOMP_DIM_GANG)
6307 {
6308 /* Write to the receiver object. */
6309 tree ref_to_res = gimple_call_arg (call, 1);
6310
6311 if (!integer_zerop (ref_to_res))
6312 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6313 }
6314
6315 if (lhs)
6316 gimplify_assign (lhs, var, &seq);
6317
6318 pop_gimplify_context (NULL);
6319
6320 gsi_replace_with_seq (&gsi, seq, true);
6321 }
6322
6323 /* NVPTX reduction expander. */
6324
6325 static void
6326 nvptx_goacc_reduction (gcall *call)
6327 {
6328 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6329 offload_attrs oa;
6330
6331 populate_offload_attrs (&oa);
6332
6333 switch (code)
6334 {
6335 case IFN_GOACC_REDUCTION_SETUP:
6336 nvptx_goacc_reduction_setup (call, &oa);
6337 break;
6338
6339 case IFN_GOACC_REDUCTION_INIT:
6340 nvptx_goacc_reduction_init (call, &oa);
6341 break;
6342
6343 case IFN_GOACC_REDUCTION_FINI:
6344 nvptx_goacc_reduction_fini (call, &oa);
6345 break;
6346
6347 case IFN_GOACC_REDUCTION_TEARDOWN:
6348 nvptx_goacc_reduction_teardown (call, &oa);
6349 break;
6350
6351 default:
6352 gcc_unreachable ();
6353 }
6354 }
6355
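/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Constants are never forced
   into memory.  */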
6356 static bool
6357 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6358 rtx x ATTRIBUTE_UNUSED)
6359 {
6360 return true;
6361 }
6362
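/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SImode and V2DImode
   vectors are supported.  */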
6363 static bool
6364 nvptx_vector_mode_supported (machine_mode mode)
6365 {
6366 return (mode == V2SImode
6367 || mode == V2DImode);
6368 }
6369
6370 /* Return the preferred mode for vectorizing scalar MODE. */
6371
6372 static machine_mode
6373 nvptx_preferred_simd_mode (scalar_mode mode)
6374 {
6375 switch (mode)
6376 {
6377 case E_DImode:
6378 return V2DImode;
6379 case E_SImode:
6380 return V2SImode;
6381
6382 default:
6383 return default_preferred_simd_mode (mode);
6384 }
6385 }
6386
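/* Return the alignment, in bits, to use for objects of TYPE, given the
   suggested alignment BASIC_ALIGN.  TImode-sized integer types are given
   the alignment of the mode TImode is split into.  */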
6387 unsigned int
6388 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6389 {
6390 if (TREE_CODE (type) == INTEGER_TYPE)
6391 {
6392 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6393 if (size == GET_MODE_SIZE (TImode))
6394 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6395 }
6396
6397 return basic_align;
6398 }
6399
6400 /* Implement TARGET_MODES_TIEABLE_P. */
6401
6402 static bool
6403 nvptx_modes_tieable_p (machine_mode, machine_mode)
6404 {
6405 return false;
6406 }
6407
6408 /* Implement TARGET_HARD_REGNO_NREGS. */
6409
6410 static unsigned int
6411 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6412 {
6413 return 1;
6414 }
6415
6416 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6417
6418 static bool
6419 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6420 {
6421 return false;
6422 }
6423
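/* The last function decl that was made the current function, used to
   avoid redundant per-function resets below.  */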
6424 static GTY(()) tree nvptx_previous_fndecl;
6425
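/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function state
   (vector_red_partition, oacc_bcast_partition) when FNDECL becomes the
   current function.  */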
6426 static void
6427 nvptx_set_current_function (tree fndecl)
6428 {
6429 if (!fndecl || fndecl == nvptx_previous_fndecl)
6430 return;
6431
6432 nvptx_previous_fndecl = fndecl;
6433 vector_red_partition = 0;
6434 oacc_bcast_partition = 0;
6435 }
6436
6437 #undef TARGET_OPTION_OVERRIDE
6438 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6439
6440 #undef TARGET_ATTRIBUTE_TABLE
6441 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6442
6443 #undef TARGET_LRA_P
6444 #define TARGET_LRA_P hook_bool_void_false
6445
6446 #undef TARGET_LEGITIMATE_ADDRESS_P
6447 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6448
6449 #undef TARGET_PROMOTE_FUNCTION_MODE
6450 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6451
6452 #undef TARGET_FUNCTION_ARG
6453 #define TARGET_FUNCTION_ARG nvptx_function_arg
6454 #undef TARGET_FUNCTION_INCOMING_ARG
6455 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6456 #undef TARGET_FUNCTION_ARG_ADVANCE
6457 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6458 #undef TARGET_FUNCTION_ARG_BOUNDARY
6459 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6460 #undef TARGET_PASS_BY_REFERENCE
6461 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6462 #undef TARGET_FUNCTION_VALUE_REGNO_P
6463 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6464 #undef TARGET_FUNCTION_VALUE
6465 #define TARGET_FUNCTION_VALUE nvptx_function_value
6466 #undef TARGET_LIBCALL_VALUE
6467 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6468 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6469 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6470 #undef TARGET_GET_DRAP_RTX
6471 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6472 #undef TARGET_SPLIT_COMPLEX_ARG
6473 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6474 #undef TARGET_RETURN_IN_MEMORY
6475 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6476 #undef TARGET_OMIT_STRUCT_RETURN_REG
6477 #define TARGET_OMIT_STRUCT_RETURN_REG true
6478 #undef TARGET_STRICT_ARGUMENT_NAMING
6479 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6480 #undef TARGET_CALL_ARGS
6481 #define TARGET_CALL_ARGS nvptx_call_args
6482 #undef TARGET_END_CALL_ARGS
6483 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6484
6485 #undef TARGET_ASM_FILE_START
6486 #define TARGET_ASM_FILE_START nvptx_file_start
6487 #undef TARGET_ASM_FILE_END
6488 #define TARGET_ASM_FILE_END nvptx_file_end
6489 #undef TARGET_ASM_GLOBALIZE_LABEL
6490 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6491 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6492 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6493 #undef TARGET_PRINT_OPERAND
6494 #define TARGET_PRINT_OPERAND nvptx_print_operand
6495 #undef TARGET_PRINT_OPERAND_ADDRESS
6496 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6497 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6498 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6499 #undef TARGET_ASM_INTEGER
6500 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6501 #undef TARGET_ASM_DECL_END
6502 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6503 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6504 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6505 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6506 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6507 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6508 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6509
6510 #undef TARGET_MACHINE_DEPENDENT_REORG
6511 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6512 #undef TARGET_NO_REGISTER_ALLOCATION
6513 #define TARGET_NO_REGISTER_ALLOCATION true
6514
6515 #undef TARGET_ENCODE_SECTION_INFO
6516 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6517 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6518 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6519
6520 #undef TARGET_VECTOR_ALIGNMENT
6521 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6522
6523 #undef TARGET_CANNOT_COPY_INSN_P
6524 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6525
6526 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6527 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6528
6529 #undef TARGET_INIT_BUILTINS
6530 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6531 #undef TARGET_EXPAND_BUILTIN
6532 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6533 #undef TARGET_BUILTIN_DECL
6534 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6535
6536 #undef TARGET_SIMT_VF
6537 #define TARGET_SIMT_VF nvptx_simt_vf
6538
6539 #undef TARGET_GOACC_VALIDATE_DIMS
6540 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6541
6542 #undef TARGET_GOACC_DIM_LIMIT
6543 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6544
6545 #undef TARGET_GOACC_FORK_JOIN
6546 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6547
6548 #undef TARGET_GOACC_REDUCTION
6549 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6550
6551 #undef TARGET_CANNOT_FORCE_CONST_MEM
6552 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6553
6554 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6555 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6556
6557 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6558 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6559 nvptx_preferred_simd_mode
6560
6561 #undef TARGET_MODES_TIEABLE_P
6562 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6563
6564 #undef TARGET_HARD_REGNO_NREGS
6565 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6566
6567 #undef TARGET_CAN_CHANGE_MODE_CLASS
6568 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6569
6570 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6571 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6572
6573 #undef TARGET_SET_CURRENT_FUNCTION
6574 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6575
6576 struct gcc_target targetm = TARGET_INITIALIZER;
6577
6578 #include "gt-nvptx.h"