b2b150f11d7cb8a6f5d1667f7af7a1a6e7286da7
[gcc.git] / gcc/config/nvptx/nvptx.c
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "gomp-constants.h"
63 #include "dumpfile.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
67 #include "attribs.h"
68 #include "tree-vrp.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
71 #include "gimplify.h"
72 #include "tree-phinodes.h"
73 #include "cfgloop.h"
74 #include "fold-const.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 #define WORKAROUND_PTXJIT_BUG 1
81 #define WORKAROUND_PTXJIT_BUG_2 1
82
83 /* The various PTX memory areas an object might reside in. */
84 enum nvptx_data_area
85 {
86 DATA_AREA_GENERIC,
87 DATA_AREA_GLOBAL,
88 DATA_AREA_SHARED,
89 DATA_AREA_LOCAL,
90 DATA_AREA_CONST,
91 DATA_AREA_PARAM,
92 DATA_AREA_MAX
93 };
94
95 /* We record the data area in the target symbol flags. */
96 #define SYMBOL_DATA_AREA(SYM) \
97 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
98 & 7)
99 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
100 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
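/* A minimal illustrative sketch of how these two macros cooperate (the
   symbol name here is hypothetical):

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "__my_shared_buf");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   SET_SYMBOL_DATA_AREA ors the area into the machine-dependent flag
   bits, and SYMBOL_DATA_AREA masks it back out.  */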
101
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls;
105
106 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
107 {
108 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
109 static bool equal (rtx a, rtx b) { return a == b; }
110 };
111
112 static GTY((cache))
113 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
114
115 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
116 {
117 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
118 static bool equal (tree a, tree b) { return a == b; }
119 };
120
121 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
122 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
123
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size;
130 static unsigned worker_bcast_align;
131 static GTY(()) rtx worker_bcast_sym;
132
133 /* Buffer needed for worker reductions. This has to be distinct from
134 the worker broadcast array, as both may be live concurrently. */
135 static unsigned worker_red_size;
136 static unsigned worker_red_align;
137 static GTY(()) rtx worker_red_sym;
138
139 /* Global lock variable, needed for 128bit worker & gang reductions. */
140 static GTY(()) tree global_lock_var;
141
142 /* True if any function references __nvptx_stacks. */
143 static bool need_softstack_decl;
144
145 /* True if any function references __nvptx_uni. */
146 static bool need_unisimt_decl;
147
148 /* Allocate a new, cleared machine_function structure. */
149
150 static struct machine_function *
151 nvptx_init_machine_status (void)
152 {
153 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
154 p->return_mode = VOIDmode;
155 return p;
156 }
157
158 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
159 and -fopenacc is also enabled. */
160
161 static void
162 diagnose_openacc_conflict (bool optval, const char *optname)
163 {
164 if (flag_openacc && optval)
165 error ("option %s is not supported together with -fopenacc", optname);
166 }
167
168 /* Implement TARGET_OPTION_OVERRIDE. */
169
170 static void
171 nvptx_option_override (void)
172 {
173 init_machine_status = nvptx_init_machine_status;
174
175 /* Set toplevel_reorder, unless explicitly disabled. We need
176 reordering so that we emit necessary assembler decls of
177 undeclared variables. */
178 if (!global_options_set.x_flag_toplevel_reorder)
179 flag_toplevel_reorder = 1;
180
181 debug_nonbind_markers_p = 0;
182
183 /* Set flag_no_common, unless explicitly disabled. We fake common
184 using .weak, and that's not entirely accurate, so avoid it
185 unless forced. */
186 if (!global_options_set.x_flag_no_common)
187 flag_no_common = 1;
188
189 /* The patch area requires nops, which we don't have. */
190 if (function_entry_patch_area_size > 0)
191 sorry ("not generating patch area, nops not supported");
192
193 /* Assumes that it will see only hard registers. */
194 flag_var_tracking = 0;
195
196 if (nvptx_optimize < 0)
197 nvptx_optimize = optimize > 0;
198
199 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
200 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
201 declared_libfuncs_htab
202 = hash_table<declared_libfunc_hasher>::create_ggc (17);
203
204 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
205 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
206 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
207
208 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
209 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
210 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
211
212 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
213 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
214 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
215
216 if (TARGET_GOMP)
217 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
218 }
219
220 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
 221 	   deal with ptx idiosyncrasies.  */
222
223 const char *
224 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
225 {
226 switch (mode)
227 {
228 case E_BLKmode:
229 return ".b8";
230 case E_BImode:
231 return ".pred";
232 case E_QImode:
233 if (promote)
234 return ".u32";
235 else
236 return ".u8";
237 case E_HImode:
238 return ".u16";
239 case E_SImode:
240 return ".u32";
241 case E_DImode:
242 return ".u64";
243
244 case E_SFmode:
245 return ".f32";
246 case E_DFmode:
247 return ".f64";
248
249 case E_V2SImode:
250 return ".v2.u32";
251 case E_V2DImode:
252 return ".v2.u64";
253
254 default:
255 gcc_unreachable ();
256 }
257 }
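/* For instance, a QImode value requested with PROMOTE set comes out as a
   ".u32" register (e.g. ".reg.u32 %r22;"), since PTX lacks a full
   complement of 8-bit operations; without PROMOTE the natural ".u8" is
   used.  (Register number illustrative.)  */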
258
259 /* Encode the PTX data area that DECL (which might not actually be a
260 _DECL) should reside in. */
261
262 static void
263 nvptx_encode_section_info (tree decl, rtx rtl, int first)
264 {
265 default_encode_section_info (decl, rtl, first);
266 if (first && MEM_P (rtl))
267 {
268 nvptx_data_area area = DATA_AREA_GENERIC;
269
270 if (TREE_CONSTANT (decl))
271 area = DATA_AREA_CONST;
272 else if (TREE_CODE (decl) == VAR_DECL)
273 {
274 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
275 {
276 area = DATA_AREA_SHARED;
277 if (DECL_INITIAL (decl))
278 error ("static initialization of variable %q+D in %<.shared%>"
279 " memory is not supported", decl);
280 }
281 else
282 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
283 }
284
285 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
286 }
287 }
288
289 /* Return the PTX name of the data area in which SYM should be
290 placed. The symbol must have already been processed by
 291 	   nvptx_encode_section_info, or equivalent.  */
292
293 static const char *
294 section_for_sym (rtx sym)
295 {
296 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
297 /* Same order as nvptx_data_area enum. */
298 static char const *const areas[] =
299 {"", ".global", ".shared", ".local", ".const", ".param"};
300
301 return areas[area];
302 }
303
304 /* Similarly for a decl. */
305
306 static const char *
307 section_for_decl (const_tree decl)
308 {
309 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
310 }
311
312 /* Check NAME for special function names and redirect them by returning a
313 replacement. This applies to malloc, free and realloc, for which we
314 want to use libgcc wrappers, and call, which triggers a bug in
315 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
316 not active in an offload compiler -- the names are all set by the
317 host-side compiler. */
318
319 static const char *
320 nvptx_name_replacement (const char *name)
321 {
322 if (strcmp (name, "call") == 0)
323 return "__nvptx_call";
324 if (strcmp (name, "malloc") == 0)
325 return "__nvptx_malloc";
326 if (strcmp (name, "free") == 0)
327 return "__nvptx_free";
328 if (strcmp (name, "realloc") == 0)
329 return "__nvptx_realloc";
330 return name;
331 }
332
333 /* If MODE should be treated as two registers of an inner mode, return
334 that inner mode. Otherwise return VOIDmode. */
335
336 static machine_mode
337 maybe_split_mode (machine_mode mode)
338 {
339 if (COMPLEX_MODE_P (mode))
340 return GET_MODE_INNER (mode);
341
342 if (mode == TImode)
343 return DImode;
344
345 return VOIDmode;
346 }
347
348 /* Return true if mode should be treated as two registers. */
349
350 static bool
351 split_mode_p (machine_mode mode)
352 {
353 return maybe_split_mode (mode) != VOIDmode;
354 }
355
356 /* Output a register, subreg, or register pair (with optional
357 enclosing braces). */
358
359 static void
360 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
361 int subreg_offset = -1)
362 {
363 if (inner_mode == VOIDmode)
364 {
365 if (HARD_REGISTER_NUM_P (regno))
366 fprintf (file, "%s", reg_names[regno]);
367 else
368 fprintf (file, "%%r%d", regno);
369 }
370 else if (subreg_offset >= 0)
371 {
372 output_reg (file, regno, VOIDmode);
373 fprintf (file, "$%d", subreg_offset);
374 }
375 else
376 {
377 if (subreg_offset == -1)
378 fprintf (file, "{");
379 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
380 fprintf (file, ",");
381 output_reg (file, regno, inner_mode, 0);
382 if (subreg_offset == -1)
383 fprintf (file, "}");
384 }
385 }
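/* As an example, a TImode pseudo 35 is printed as a braced DImode pair
   "{%r35$8,%r35$0}": the recursive calls emit the high half at byte
   offset 8 and the low half at offset 0.  (Register number
   illustrative.)  */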
386
387 /* Emit forking instructions for MASK. */
388
389 static void
390 nvptx_emit_forking (unsigned mask, bool is_call)
391 {
392 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
393 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
394 if (mask)
395 {
396 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
397
398 /* Emit fork at all levels. This helps form SESE regions, as
399 it creates a block with a single successor before entering a
 400 	   partitioned region.  That is a good candidate for the end of
401 an SESE region. */
402 if (!is_call)
403 emit_insn (gen_nvptx_fork (op));
404 emit_insn (gen_nvptx_forked (op));
405 }
406 }
407
408 /* Emit joining instructions for MASK. */
409
410 static void
411 nvptx_emit_joining (unsigned mask, bool is_call)
412 {
413 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
414 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
415 if (mask)
416 {
417 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
418
419 /* Emit joining for all non-call pars to ensure there's a single
420 predecessor for the block the join insn ends up in. This is
421 needed for skipping entire loops. */
422 if (!is_call)
423 emit_insn (gen_nvptx_joining (op));
424 emit_insn (gen_nvptx_join (op));
425 }
426 }
427
428 \f
429 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
430 returned in memory. Integer and floating types supported by the
 431 	   machine are passed in registers; everything else is passed in
432 memory. Complex types are split. */
433
434 static bool
435 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
436 {
437 if (type)
438 {
439 if (AGGREGATE_TYPE_P (type))
440 return true;
441 if (TREE_CODE (type) == VECTOR_TYPE)
442 return true;
443 }
444
445 if (!for_return && COMPLEX_MODE_P (mode))
446 /* Complex types are passed as two underlying args. */
447 mode = GET_MODE_INNER (mode);
448
449 if (GET_MODE_CLASS (mode) != MODE_INT
450 && GET_MODE_CLASS (mode) != MODE_FLOAT)
451 return true;
452
453 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
454 return true;
455
456 return false;
457 }
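/* Consequently structs, unions and vector types always travel in memory,
   as does anything wider than a machine word (e.g. TImode); with the
   64-bit ABI a plain int or double is passed in a register.  */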
458
459 /* A non-memory argument of mode MODE is being passed, determine the mode it
460 should be promoted to. This is also used for determining return
461 type promotion. */
462
463 static machine_mode
464 promote_arg (machine_mode mode, bool prototyped)
465 {
466 if (!prototyped && mode == SFmode)
467 /* K&R float promotion for unprototyped functions. */
468 mode = DFmode;
469 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
470 mode = SImode;
471
472 return mode;
473 }
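/* E.g. QImode and HImode arguments are widened to SImode, and an SFmode
   argument to an unprototyped function is widened to DFmode, mirroring
   the K&R default argument promotions.  */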
474
475 /* A non-memory return type of MODE is being returned. Determine the
476 mode it should be promoted to. */
477
478 static machine_mode
479 promote_return (machine_mode mode)
480 {
481 return promote_arg (mode, true);
482 }
483
484 /* Implement TARGET_FUNCTION_ARG. */
485
486 static rtx
487 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
488 const_tree, bool named)
489 {
490 if (mode == VOIDmode || !named)
491 return NULL_RTX;
492
493 return gen_reg_rtx (mode);
494 }
495
496 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
497
498 static rtx
499 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
500 const_tree, bool named)
501 {
502 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
503
504 if (mode == VOIDmode || !named)
505 return NULL_RTX;
506
507 /* No need to deal with split modes here, the only case that can
508 happen is complex modes and those are dealt with by
509 TARGET_SPLIT_COMPLEX_ARG. */
510 return gen_rtx_UNSPEC (mode,
511 gen_rtvec (1, GEN_INT (cum->count)),
512 UNSPEC_ARG_REG);
513 }
514
515 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
516
517 static void
518 nvptx_function_arg_advance (cumulative_args_t cum_v,
519 machine_mode ARG_UNUSED (mode),
520 const_tree ARG_UNUSED (type),
521 bool ARG_UNUSED (named))
522 {
523 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
524
525 cum->count++;
526 }
527
528 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
529
 530 	   For nvptx this is only used for variadic args.  The type has already
531 been promoted and/or converted to invisible reference. */
532
533 static unsigned
534 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
535 {
536 return GET_MODE_ALIGNMENT (mode);
537 }
538
539 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
540
541 For nvptx, we know how to handle functions declared as stdarg: by
542 passing an extra pointer to the unnamed arguments. However, the
543 Fortran frontend can produce a different situation, where a
544 function pointer is declared with no arguments, but the actual
545 function and calls to it take more arguments. In that case, we
546 want to ensure the call matches the definition of the function. */
547
548 static bool
549 nvptx_strict_argument_naming (cumulative_args_t cum_v)
550 {
551 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
552
553 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
554 }
555
556 /* Implement TARGET_LIBCALL_VALUE. */
557
558 static rtx
559 nvptx_libcall_value (machine_mode mode, const_rtx)
560 {
561 if (!cfun || !cfun->machine->doing_call)
562 /* Pretend to return in a hard reg for early uses before pseudos can be
563 generated. */
564 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
565
566 return gen_reg_rtx (mode);
567 }
568
569 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
570 where function FUNC returns or receives a value of data type TYPE. */
571
572 static rtx
573 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
574 bool outgoing)
575 {
576 machine_mode mode = promote_return (TYPE_MODE (type));
577
578 if (outgoing)
579 {
580 gcc_assert (cfun);
581 cfun->machine->return_mode = mode;
582 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
583 }
584
585 return nvptx_libcall_value (mode, NULL_RTX);
586 }
587
588 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
589
590 static bool
591 nvptx_function_value_regno_p (const unsigned int regno)
592 {
593 return regno == NVPTX_RETURN_REGNUM;
594 }
595
596 /* Types with a mode other than those supported by the machine are passed by
597 reference in memory. */
598
599 static bool
600 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
601 machine_mode mode, const_tree type,
602 bool ARG_UNUSED (named))
603 {
604 return pass_in_memory (mode, type, false);
605 }
606
607 /* Implement TARGET_RETURN_IN_MEMORY. */
608
609 static bool
610 nvptx_return_in_memory (const_tree type, const_tree)
611 {
612 return pass_in_memory (TYPE_MODE (type), type, true);
613 }
614
615 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
616
617 static machine_mode
618 nvptx_promote_function_mode (const_tree type, machine_mode mode,
619 int *ARG_UNUSED (punsignedp),
620 const_tree funtype, int for_return)
621 {
622 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
623 }
624
625 /* Helper for write_arg. Emit a single PTX argument of MODE, either
626 in a prototype, or as copy in a function prologue. ARGNO is the
627 index of this argument in the PTX function. FOR_REG is negative,
628 if we're emitting the PTX prototype. It is zero if we're copying
629 to an argument register and it is greater than zero if we're
630 copying to a specific hard register. */
631
632 static int
633 write_arg_mode (std::stringstream &s, int for_reg, int argno,
634 machine_mode mode)
635 {
636 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
637
638 if (for_reg < 0)
639 {
640 /* Writing PTX prototype. */
641 s << (argno ? ", " : " (");
642 s << ".param" << ptx_type << " %in_ar" << argno;
643 }
644 else
645 {
646 s << "\t.reg" << ptx_type << " ";
647 if (for_reg)
648 s << reg_names[for_reg];
649 else
650 s << "%ar" << argno;
651 s << ";\n";
652 if (argno >= 0)
653 {
654 s << "\tld.param" << ptx_type << " ";
655 if (for_reg)
656 s << reg_names[for_reg];
657 else
658 s << "%ar" << argno;
659 s << ", [%in_ar" << argno << "];\n";
660 }
661 }
662 return argno + 1;
663 }
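/* To illustrate, for an SImode argument number 0 this emits
     ".param.u32 %in_ar0"
   into a prototype (FOR_REG < 0), or
     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];
   into a prologue that copies to argument register %ar0 (FOR_REG == 0).  */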
664
665 /* Process function parameter TYPE to emit one or more PTX
666 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
667 is true, if this is a prototyped function, rather than an old-style
668 C declaration. Returns the next argument number to use.
669
670 The promotion behavior here must match the regular GCC function
671 parameter marshalling machinery. */
672
673 static int
674 write_arg_type (std::stringstream &s, int for_reg, int argno,
675 tree type, bool prototyped)
676 {
677 machine_mode mode = TYPE_MODE (type);
678
679 if (mode == VOIDmode)
680 return argno;
681
682 if (pass_in_memory (mode, type, false))
683 mode = Pmode;
684 else
685 {
686 bool split = TREE_CODE (type) == COMPLEX_TYPE;
687
688 if (split)
689 {
690 /* Complex types are sent as two separate args. */
691 type = TREE_TYPE (type);
692 mode = TYPE_MODE (type);
693 prototyped = true;
694 }
695
696 mode = promote_arg (mode, prototyped);
697 if (split)
698 argno = write_arg_mode (s, for_reg, argno, mode);
699 }
700
701 return write_arg_mode (s, for_reg, argno, mode);
702 }
703
704 /* Emit a PTX return as a prototype or function prologue declaration
705 for MODE. */
706
707 static void
708 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
709 {
710 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
711 const char *pfx = "\t.reg";
712 const char *sfx = ";\n";
713
714 if (for_proto)
715 pfx = "(.param", sfx = "_out) ";
716
717 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
718 }
719
720 /* Process a function return TYPE to emit a PTX return as a prototype
721 or function prologue declaration. Returns true if return is via an
722 additional pointer parameter. The promotion behavior here must
 723 	   match the regular GCC function return marshalling.  */
724
725 static bool
726 write_return_type (std::stringstream &s, bool for_proto, tree type)
727 {
728 machine_mode mode = TYPE_MODE (type);
729
730 if (mode == VOIDmode)
731 return false;
732
733 bool return_in_mem = pass_in_memory (mode, type, true);
734
735 if (return_in_mem)
736 {
737 if (for_proto)
738 return return_in_mem;
739
740 /* Named return values can cause us to return a pointer as well
741 as expect an argument for the return location. This is
742 optimization-level specific, so no caller can make use of
743 this data, but more importantly for us, we must ensure it
744 doesn't change the PTX prototype. */
745 mode = (machine_mode) cfun->machine->return_mode;
746
747 if (mode == VOIDmode)
748 return return_in_mem;
749
750 /* Clear return_mode to inhibit copy of retval to non-existent
751 retval parameter. */
752 cfun->machine->return_mode = VOIDmode;
753 }
754 else
755 mode = promote_return (mode);
756
757 write_return_mode (s, for_proto, mode);
758
759 return return_in_mem;
760 }
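/* For an SImode return value this produces "(.param.u32 %value_out) " in
   a prototype and ".reg.u32 %value;" in a prologue, assuming the
   conventional "%value" name for NVPTX_RETURN_REGNUM from nvptx.h.  */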
761
762 /* Look for attributes in ATTRS that would indicate we must write a function
763 as a .entry kernel rather than a .func. Return true if one is found. */
764
765 static bool
766 write_as_kernel (tree attrs)
767 {
768 return (lookup_attribute ("kernel", attrs) != NULL_TREE
769 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
770 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
771 /* For OpenMP target regions, the corresponding kernel entry is emitted from
772 write_omp_entry as a separate function. */
773 }
774
775 /* Emit a linker marker for a function decl or defn. */
776
777 static void
778 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
779 const char *name)
780 {
781 s << "\n// BEGIN";
782 if (globalize)
783 s << " GLOBAL";
784 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
785 s << name << "\n";
786 }
787
788 /* Emit a linker marker for a variable decl or defn. */
789
790 static void
791 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
792 {
793 fprintf (file, "\n// BEGIN%s VAR %s: ",
794 globalize ? " GLOBAL" : "",
795 is_defn ? "DEF" : "DECL");
796 assemble_name_raw (file, name);
797 fputs ("\n", file);
798 }
799
800 /* Write a .func or .kernel declaration or definition along with
801 a helper comment for use by ld. S is the stream to write to, DECL
802 the decl for the function with name NAME. For definitions, emit
803 a declaration too. */
804
805 static const char *
806 write_fn_proto (std::stringstream &s, bool is_defn,
807 const char *name, const_tree decl)
808 {
809 if (is_defn)
810 /* Emit a declaration. The PTX assembler gets upset without it. */
811 name = write_fn_proto (s, false, name, decl);
812 else
813 {
814 /* Avoid repeating the name replacement. */
815 name = nvptx_name_replacement (name);
816 if (name[0] == '*')
817 name++;
818 }
819
820 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
821
822 /* PTX declaration. */
823 if (DECL_EXTERNAL (decl))
824 s << ".extern ";
825 else if (TREE_PUBLIC (decl))
826 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
827 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
828
829 tree fntype = TREE_TYPE (decl);
830 tree result_type = TREE_TYPE (fntype);
831
832 /* atomic_compare_exchange_$n builtins have an exceptional calling
833 convention. */
834 int not_atomic_weak_arg = -1;
835 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
836 switch (DECL_FUNCTION_CODE (decl))
837 {
838 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
839 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
840 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
841 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
842 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
843 /* These atomics skip the 'weak' parm in an actual library
844 call. We must skip it in the prototype too. */
845 not_atomic_weak_arg = 3;
846 break;
847
848 default:
849 break;
850 }
851
852 /* Declare the result. */
853 bool return_in_mem = write_return_type (s, true, result_type);
854
855 s << name;
856
857 int argno = 0;
858
859 /* Emit argument list. */
860 if (return_in_mem)
861 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
862
863 /* We get:
864 NULL in TYPE_ARG_TYPES, for old-style functions
865 NULL in DECL_ARGUMENTS, for builtin functions without another
866 declaration.
867 So we have to pick the best one we have. */
868 tree args = TYPE_ARG_TYPES (fntype);
869 bool prototyped = true;
870 if (!args)
871 {
872 args = DECL_ARGUMENTS (decl);
873 prototyped = false;
874 }
875
876 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
877 {
878 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
879
880 if (not_atomic_weak_arg)
881 argno = write_arg_type (s, -1, argno, type, prototyped);
882 else
883 gcc_assert (type == boolean_type_node);
884 }
885
886 if (stdarg_p (fntype))
887 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
888
889 if (DECL_STATIC_CHAIN (decl))
890 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
891
892 if (!argno && strcmp (name, "main") == 0)
893 {
894 argno = write_arg_type (s, -1, argno, integer_type_node, true);
895 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
896 }
897
898 if (argno)
899 s << ")";
900
901 s << (is_defn ? "\n" : ";\n");
902
903 return name;
904 }
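/* Putting the pieces together, a public definition of "int foo (int)" is
   preceded by a declaration along the lines of:
     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);
   (Illustrative; the linkage directive varies with .extern/.weak, and
   kernels use .entry instead of .func.)  */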
905
906 /* Construct a function declaration from a call insn. This can be
907 necessary for two reasons - either we have an indirect call which
908 requires a .callprototype declaration, or we have a libcall
909 generated by emit_library_call for which no decl exists. */
910
911 static void
912 write_fn_proto_from_insn (std::stringstream &s, const char *name,
913 rtx result, rtx pat)
914 {
915 if (!name)
916 {
917 s << "\t.callprototype ";
918 name = "_";
919 }
920 else
921 {
922 name = nvptx_name_replacement (name);
923 write_fn_marker (s, false, true, name);
924 s << "\t.extern .func ";
925 }
926
927 if (result != NULL_RTX)
928 write_return_mode (s, true, GET_MODE (result));
929
930 s << name;
931
932 int arg_end = XVECLEN (pat, 0);
933 for (int i = 1; i < arg_end; i++)
934 {
935 /* We don't have to deal with mode splitting & promotion here,
936 as that was already done when generating the call
937 sequence. */
938 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
939
940 write_arg_mode (s, -1, i - 1, mode);
941 }
942 if (arg_end != 1)
943 s << ")";
944 s << ";\n";
945 }
946
 947 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
 948    table and write a ptx prototype.  These are emitted at end of
949 compilation. */
950
951 static void
952 nvptx_record_fndecl (tree decl)
953 {
954 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
955 if (*slot == NULL)
956 {
957 *slot = decl;
958 const char *name = get_fnname_from_decl (decl);
959 write_fn_proto (func_decls, false, name, decl);
960 }
961 }
962
963 /* Record a libcall or unprototyped external function. CALLEE is the
964 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
965 declaration for it. */
966
967 static void
968 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
969 {
970 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
971 if (*slot == NULL)
972 {
973 *slot = callee;
974
975 const char *name = XSTR (callee, 0);
976 write_fn_proto_from_insn (func_decls, name, retval, pat);
977 }
978 }
979
980 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
981 is prototyped, record it now. Otherwise record it as needed at end
982 of compilation, when we might have more information about it. */
983
984 void
985 nvptx_record_needed_fndecl (tree decl)
986 {
987 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
988 {
989 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
990 if (*slot == NULL)
991 *slot = decl;
992 }
993 else
994 nvptx_record_fndecl (decl);
995 }
996
997 /* SYM is a SYMBOL_REF. If it refers to an external function, record
998 it as needed. */
999
1000 static void
1001 nvptx_maybe_record_fnsym (rtx sym)
1002 {
1003 tree decl = SYMBOL_REF_DECL (sym);
1004
1005 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1006 nvptx_record_needed_fndecl (decl);
1007 }
1008
1009 /* Emit a local array to hold some part of a conventional stack frame
1010 and initialize REGNO to point to it. If the size is zero, it'll
1011 never be valid to dereference, so we can simply initialize to
1012 zero. */
1013
1014 static void
1015 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1016 {
1017 if (size)
1018 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1019 align, reg_names[regno], size);
1020 fprintf (file, "\t.reg.u%d %s;\n",
1021 POINTER_SIZE, reg_names[regno]);
1022 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1023 : "\tmov.u%d %s, 0;\n"),
1024 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1025 }
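/* For a 16-byte frame with 8-byte alignment this would emit, assuming
   64-bit pointers and the "%frame" register name from nvptx.h:
     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;
   For a zero-sized frame the array and cvta are replaced by
   "mov.u64 %frame, 0;".  */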
1026
1027 /* Emit soft stack frame setup sequence. */
1028
1029 static void
1030 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1031 {
1032 /* Maintain 64-bit stack alignment. */
1033 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1034 size = ROUND_UP (size, keep_align);
1035 int bits = POINTER_SIZE;
1036 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1037 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1038 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1039 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1040 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1041 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1042 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1043 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1044 fprintf (file, "\t{\n");
1045 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1046 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1047 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1048 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1049 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1050 bits == 64 ? ".wide" : ".lo", bits / 8);
1051 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1052
1053 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1054 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1055
1056 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1057 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1058 bits, reg_sspprev, reg_sspslot);
1059
1060 /* Initialize %frame = %sspprev - size. */
1061 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1062 bits, reg_frame, reg_sspprev, size);
1063
1064 /* Apply alignment, if larger than 64. */
1065 if (alignment > keep_align)
1066 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1067 bits, reg_frame, reg_frame, -alignment);
1068
1069 size = crtl->outgoing_args_size;
1070 gcc_assert (size % keep_align == 0);
1071
1072 /* Initialize %stack. */
1073 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1074 bits, reg_stack, reg_frame, size);
1075
1076 if (!crtl->is_leaf)
1077 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1078 bits, reg_sspslot, reg_stack);
1079 fprintf (file, "\t}\n");
1080 cfun->machine->has_softstack = true;
1081 need_softstack_decl = true;
1082 }
1083
1084 /* Emit code to initialize the REGNO predicate register to indicate
1085 whether we are not lane zero on the NAME axis. */
1086
1087 static void
1088 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1089 {
1090 fprintf (file, "\t{\n");
1091 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1092 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1093 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1094 fprintf (file, "\t}\n");
1095 }
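/* E.g. for the "y" axis with predicate register 42 this emits:
     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r42, %y, 0;
     }
   so %r42 is true for every thread that is not lane zero on that axis.
   (Register number illustrative.)  */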
1096
1097 /* Emit code to initialize predicate and master lane index registers for
1098 -muniform-simt code generation variant. */
1099
1100 static void
1101 nvptx_init_unisimt_predicate (FILE *file)
1102 {
1103 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1104 int loc = REGNO (cfun->machine->unisimt_location);
1105 int bits = POINTER_SIZE;
1106 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1107 fprintf (file, "\t{\n");
1108 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1109 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1110 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1111 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1112 bits == 64 ? ".wide" : ".lo");
1113 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1114 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1115 if (cfun->machine->unisimt_predicate)
1116 {
1117 int master = REGNO (cfun->machine->unisimt_master);
1118 int pred = REGNO (cfun->machine->unisimt_predicate);
1119 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1120 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1121 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1122 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1123 /* Compute predicate as 'tid.x == master'. */
1124 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1125 }
1126 fprintf (file, "\t}\n");
1127 need_unisimt_decl = true;
1128 }
1129
1130 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1131
1132 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1133 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1134 {
1135 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1136 __nvptx_uni[tid.y] = 0;
1137 gomp_nvptx_main (ORIG, arg);
1138 }
1139 ORIG itself should not be emitted as a PTX .entry function. */
1140
1141 static void
1142 write_omp_entry (FILE *file, const char *name, const char *orig)
1143 {
1144 static bool gomp_nvptx_main_declared;
1145 if (!gomp_nvptx_main_declared)
1146 {
1147 gomp_nvptx_main_declared = true;
1148 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1149 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1150 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1151 }
1152 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1153 #define NTID_Y "%ntid.y"
1154 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1155 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1156 {\n\
1157 .reg.u32 %r<3>;\n\
1158 .reg.u" PS " %R<4>;\n\
1159 mov.u32 %r0, %tid.y;\n\
1160 mov.u32 %r1, " NTID_Y ";\n\
1161 mov.u32 %r2, %ctaid.x;\n\
1162 cvt.u" PS ".u32 %R1, %r0;\n\
1163 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1164 mov.u" PS " %R0, __nvptx_stacks;\n\
1165 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1166 ld.param.u" PS " %R2, [%stack];\n\
1167 ld.param.u" PS " %R3, [%sz];\n\
1168 add.u" PS " %R2, %R2, %R3;\n\
1169 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1170 st.shared.u" PS " [%R0], %R2;\n\
1171 mov.u" PS " %R0, __nvptx_uni;\n\
1172 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1173 mov.u32 %r0, 0;\n\
1174 st.shared.u32 [%R0], %r0;\n\
1175 mov.u" PS " %R0, \0;\n\
1176 ld.param.u" PS " %R1, [%arg];\n\
1177 {\n\
1178 .param.u" PS " %P<2>;\n\
1179 st.param.u" PS " [%P0], %R0;\n\
1180 st.param.u" PS " [%P1], %R1;\n\
1181 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1182 }\n\
1183 ret.uni;\n\
1184 }\n"
1185 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1186 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1187 #undef ENTRY_TEMPLATE
1188 #undef NTID_Y
1189 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1190 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1191 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1192 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1193 need_softstack_decl = need_unisimt_decl = true;
1194 }
1195
1196 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1197 function, including local var decls and copies from the arguments to
1198 local regs. */
1199
1200 void
1201 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1202 {
1203 tree fntype = TREE_TYPE (decl);
1204 tree result_type = TREE_TYPE (fntype);
1205 int argno = 0;
1206
1207 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1208 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1209 {
1210 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1211 sprintf (buf, "%s$impl", name);
1212 write_omp_entry (file, name, buf);
1213 name = buf;
1214 }
1215 /* We construct the initial part of the function into a string
1216 stream, in order to share the prototype writing code. */
1217 std::stringstream s;
1218 write_fn_proto (s, true, name, decl);
1219 s << "{\n";
1220
1221 bool return_in_mem = write_return_type (s, false, result_type);
1222 if (return_in_mem)
1223 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1224
1225 /* Declare and initialize incoming arguments. */
1226 tree args = TYPE_ARG_TYPES (fntype);
1227 bool prototyped = true;
1228 if (!args)
1229 {
1230 args = DECL_ARGUMENTS (decl);
1231 prototyped = false;
1232 }
1233
1234 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1235 {
1236 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1237
1238 argno = write_arg_type (s, 0, argno, type, prototyped);
1239 }
1240
1241 if (stdarg_p (fntype))
1242 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1243 true);
1244
1245 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1246 write_arg_type (s, STATIC_CHAIN_REGNUM,
1247 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1248 true);
1249
1250 fprintf (file, "%s", s.str().c_str());
1251
1252 /* Usually 'crtl->is_leaf' is computed during register allocator
1253 initialization (which is not done on NVPTX) or for pressure-sensitive
1254 optimizations. Initialize it here, except if already set. */
1255 if (!crtl->is_leaf)
1256 crtl->is_leaf = leaf_function_p ();
1257
1258 HOST_WIDE_INT sz = get_frame_size ();
1259 bool need_frameptr = sz || cfun->machine->has_chain;
1260 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1261 if (!TARGET_SOFT_STACK)
1262 {
1263 /* Declare a local var for outgoing varargs. */
1264 if (cfun->machine->has_varadic)
1265 init_frame (file, STACK_POINTER_REGNUM,
1266 UNITS_PER_WORD, crtl->outgoing_args_size);
1267
1268 /* Declare a local variable for the frame. Force its size to be
1269 DImode-compatible. */
1270 if (need_frameptr)
1271 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1272 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1273 }
1274 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1275 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1276 init_softstack_frame (file, alignment, sz);
1277
1278 if (cfun->machine->has_simtreg)
1279 {
1280 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1281 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1282 align = MAX (align, GET_MODE_SIZE (DImode));
1283 if (!crtl->is_leaf || cfun->calls_alloca)
1284 simtsz = HOST_WIDE_INT_M1U;
1285 if (simtsz == HOST_WIDE_INT_M1U)
1286 simtsz = nvptx_softstack_size;
1287 if (cfun->machine->has_softstack)
1288 simtsz += POINTER_SIZE / 8;
1289 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1290 if (align > GET_MODE_SIZE (DImode))
1291 simtsz += align - GET_MODE_SIZE (DImode);
1292 if (simtsz)
1293 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1294 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1295 }
1296 /* Declare the pseudos we have as ptx registers. */
1297 int maxregs = max_reg_num ();
1298 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1299 {
1300 if (regno_reg_rtx[i] != const0_rtx)
1301 {
1302 machine_mode mode = PSEUDO_REGNO_MODE (i);
1303 machine_mode split = maybe_split_mode (mode);
1304
1305 if (split_mode_p (mode))
1306 mode = split;
1307 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1308 output_reg (file, i, split, -2);
1309 fprintf (file, ";\n");
1310 }
1311 }
1312
1313 /* Emit axis predicates. */
1314 if (cfun->machine->axis_predicate[0])
1315 nvptx_init_axis_predicate (file,
1316 REGNO (cfun->machine->axis_predicate[0]), "y");
1317 if (cfun->machine->axis_predicate[1])
1318 nvptx_init_axis_predicate (file,
1319 REGNO (cfun->machine->axis_predicate[1]), "x");
1320 if (cfun->machine->unisimt_predicate
1321 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1322 nvptx_init_unisimt_predicate (file);
1323 }
1324
1325 /* Output code for switching uniform-simt state. ENTERING indicates whether
1326 we are entering or leaving non-uniform execution region. */
1327
1328 static void
1329 nvptx_output_unisimt_switch (FILE *file, bool entering)
1330 {
1331 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1332 return;
1333 fprintf (file, "\t{\n");
1334 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1335 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1336 if (!crtl->is_leaf)
1337 {
1338 int loc = REGNO (cfun->machine->unisimt_location);
1339 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1340 }
1341 if (cfun->machine->unisimt_predicate)
1342 {
1343 int master = REGNO (cfun->machine->unisimt_master);
1344 int pred = REGNO (cfun->machine->unisimt_predicate);
1345 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1346 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1347 master, entering ? "%ustmp2" : "0");
1348 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1349 }
1350 fprintf (file, "\t}\n");
1351 }
1352
1353 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1354 ENTERING indicates whether we are entering or leaving non-uniform execution.
1355 PTR is the register pointing to allocated storage, it is assigned to on
1356 entering and used to restore state on leaving. SIZE and ALIGN are used only
1357 on entering. */
1358
1359 static void
1360 nvptx_output_softstack_switch (FILE *file, bool entering,
1361 rtx ptr, rtx size, rtx align)
1362 {
1363 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1364 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1365 return;
1366 int bits = POINTER_SIZE, regno = REGNO (ptr);
1367 fprintf (file, "\t{\n");
1368 if (entering)
1369 {
1370 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1371 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1372 cfun->machine->simt_stack_size);
1373 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1374 if (CONST_INT_P (size))
1375 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1376 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1377 else
1378 output_reg (file, REGNO (size), VOIDmode);
1379 fputs (";\n", file);
1380 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1381 fprintf (file,
1382 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1383 bits, regno, regno, UINTVAL (align));
1384 }
1385 if (cfun->machine->has_softstack)
1386 {
1387 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1388 if (entering)
1389 {
1390 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1391 bits, regno, bits / 8, reg_stack);
1392 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1393 bits, reg_stack, regno, bits / 8);
1394 }
1395 else
1396 {
1397 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1398 bits, reg_stack, regno, bits / 8);
1399 }
1400 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1401 }
1402 fprintf (file, "\t}\n");
1403 }
1404
1405 /* Output code to enter non-uniform execution region. DEST is a register
1406 to hold a per-lane allocation given by SIZE and ALIGN. */
1407
1408 const char *
1409 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1410 {
1411 nvptx_output_unisimt_switch (asm_out_file, true);
1412 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1413 return "";
1414 }
1415
1416 /* Output code to leave non-uniform execution region. SRC is the register
1417 holding per-lane storage previously allocated by omp_simt_enter insn. */
1418
1419 const char *
1420 nvptx_output_simt_exit (rtx src)
1421 {
1422 nvptx_output_unisimt_switch (asm_out_file, false);
1423 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1424 return "";
1425 }
1426
1427 /* Output instruction that sets soft stack pointer in shared memory to the
1428 value in register given by SRC_REGNO. */
1429
1430 const char *
1431 nvptx_output_set_softstack (unsigned src_regno)
1432 {
1433 if (cfun->machine->has_softstack && !crtl->is_leaf)
1434 {
1435 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1436 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1437 output_reg (asm_out_file, src_regno, VOIDmode);
1438 fprintf (asm_out_file, ";\n");
1439 }
1440 return "";
1441 }
1442 /* Output a return instruction. Also copy the return value to its outgoing
1443 location. */
1444
1445 const char *
1446 nvptx_output_return (void)
1447 {
1448 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1449
1450 if (mode != VOIDmode)
1451 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1452 nvptx_ptx_type_from_mode (mode, false),
1453 reg_names[NVPTX_RETURN_REGNUM],
1454 reg_names[NVPTX_RETURN_REGNUM]);
1455
1456 return "ret;";
1457 }
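/* For an SImode return value this prints
     st.param.u32 [%value_out], %value;
     ret;
   using the "%value" register naming from nvptx.h; a VOIDmode return
   emits just "ret;".  */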
1458
1459 /* Terminate a function by writing a closing brace to FILE. */
1460
1461 void
1462 nvptx_function_end (FILE *file)
1463 {
1464 fprintf (file, "}\n");
1465 }
1466 \f
1467 /* Decide whether we can make a sibling call to a function. For ptx, we
1468 can't. */
1469
1470 static bool
1471 nvptx_function_ok_for_sibcall (tree, tree)
1472 {
1473 return false;
1474 }
1475
1476 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1477
1478 static rtx
1479 nvptx_get_drap_rtx (void)
1480 {
1481 if (TARGET_SOFT_STACK && stack_realign_drap)
1482 return arg_pointer_rtx;
1483 return NULL_RTX;
1484 }
1485
1486 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1487 argument to the next call. */
1488
1489 static void
1490 nvptx_call_args (rtx arg, tree fntype)
1491 {
1492 if (!cfun->machine->doing_call)
1493 {
1494 cfun->machine->doing_call = true;
1495 cfun->machine->is_varadic = false;
1496 cfun->machine->num_args = 0;
1497
1498 if (fntype && stdarg_p (fntype))
1499 {
1500 cfun->machine->is_varadic = true;
1501 cfun->machine->has_varadic = true;
1502 cfun->machine->num_args++;
1503 }
1504 }
1505
1506 if (REG_P (arg) && arg != pc_rtx)
1507 {
1508 cfun->machine->num_args++;
1509 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1510 cfun->machine->call_args);
1511 }
1512 }
1513
1514 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1515 information we recorded. */
1516
1517 static void
1518 nvptx_end_call_args (void)
1519 {
1520 cfun->machine->doing_call = false;
1521 free_EXPR_LIST_list (&cfun->machine->call_args);
1522 }
1523
1524 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1525 track of whether calls involving static chains or varargs were seen
1526 in the current function.
1527 For libcalls, maintain a hash table of decls we have seen, and
1528 record a function decl for later when encountering a new one. */
1529
1530 void
1531 nvptx_expand_call (rtx retval, rtx address)
1532 {
1533 rtx callee = XEXP (address, 0);
1534 rtx varargs = NULL_RTX;
1535 unsigned parallel = 0;
1536
1537 if (!call_insn_operand (callee, Pmode))
1538 {
1539 callee = force_reg (Pmode, callee);
1540 address = change_address (address, QImode, callee);
1541 }
1542
1543 if (GET_CODE (callee) == SYMBOL_REF)
1544 {
1545 tree decl = SYMBOL_REF_DECL (callee);
1546 if (decl != NULL_TREE)
1547 {
1548 if (DECL_STATIC_CHAIN (decl))
1549 cfun->machine->has_chain = true;
1550
1551 tree attr = oacc_get_fn_attrib (decl);
1552 if (attr)
1553 {
1554 tree dims = TREE_VALUE (attr);
1555
1556 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1557 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1558 {
1559 if (TREE_PURPOSE (dims)
1560 && !integer_zerop (TREE_PURPOSE (dims)))
1561 break;
1562 /* Not on this axis. */
1563 parallel ^= GOMP_DIM_MASK (ix);
1564 dims = TREE_CHAIN (dims);
1565 }
1566 }
1567 }
1568 }
1569
1570 unsigned nargs = cfun->machine->num_args;
1571 if (cfun->machine->is_varadic)
1572 {
1573 varargs = gen_reg_rtx (Pmode);
1574 emit_move_insn (varargs, stack_pointer_rtx);
1575 }
1576
1577 rtvec vec = rtvec_alloc (nargs + 1);
1578 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1579 int vec_pos = 0;
1580
1581 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1582 rtx tmp_retval = retval;
1583 if (retval)
1584 {
1585 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1586 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1587 call = gen_rtx_SET (tmp_retval, call);
1588 }
1589 XVECEXP (pat, 0, vec_pos++) = call;
1590
1591 /* Construct the call insn, including a USE for each argument pseudo
1592 register. These will be used when printing the insn. */
1593 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1594 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1595
1596 if (varargs)
1597 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1598
1599 	  gcc_assert (vec_pos == XVECLEN (pat, 0));
1600
1601 nvptx_emit_forking (parallel, true);
1602 emit_call_insn (pat);
1603 nvptx_emit_joining (parallel, true);
1604
1605 if (tmp_retval != retval)
1606 emit_move_insn (retval, tmp_retval);
1607 }
1608
1609 /* Emit a comparison COMPARE, and return the new test to be used in the
1610 jump. */
1611
1612 rtx
1613 nvptx_expand_compare (rtx compare)
1614 {
1615 rtx pred = gen_reg_rtx (BImode);
1616 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1617 XEXP (compare, 0), XEXP (compare, 1));
1618 emit_insn (gen_rtx_SET (pred, cmp));
1619 return gen_rtx_NE (BImode, pred, const0_rtx);
1620 }
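/* In other words, a comparison such as (lt x y) is first materialized
   into a BImode (.pred) register, and the jump itself then tests that
   predicate with (ne pred 0).  */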
1621
1622 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1623
1624 void
1625 nvptx_expand_oacc_fork (unsigned mode)
1626 {
1627 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1628 }
1629
1630 void
1631 nvptx_expand_oacc_join (unsigned mode)
1632 {
1633 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1634 }
1635
1636 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1637    objects.  */
1638
1639 static rtx
1640 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1641 {
1642 rtx res;
1643
1644 switch (GET_MODE (src))
1645 {
1646 case E_DImode:
1647 res = gen_unpackdisi2 (dst0, dst1, src);
1648 break;
1649 case E_DFmode:
1650 res = gen_unpackdfsi2 (dst0, dst1, src);
1651 break;
1652 default: gcc_unreachable ();
1653 }
1654 return res;
1655 }
1656
1657 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1658    object.  */
1659
1660 static rtx
1661 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1662 {
1663 rtx res;
1664
1665 switch (GET_MODE (dst))
1666 {
1667 case E_DImode:
1668 res = gen_packsidi2 (dst, src0, src1);
1669 break;
1670 case E_DFmode:
1671 res = gen_packsidf2 (dst, src0, src1);
1672 break;
1673 default: gcc_unreachable ();
1674 }
1675 return res;
1676 }
1677
1678 /* Generate an instruction or sequence to shuffle register SRC into DST
1679    across the lanes of a single warp, according to IDX and KIND.  */
1680
1681 rtx
1682 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1683 {
1684 rtx res;
1685
1686 switch (GET_MODE (dst))
1687 {
1688 case E_SImode:
1689 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1690 break;
1691 case E_SFmode:
1692 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1693 break;
1694 case E_DImode:
1695 case E_DFmode:
1696 {
1697 rtx tmp0 = gen_reg_rtx (SImode);
1698 rtx tmp1 = gen_reg_rtx (SImode);
1699
1700 start_sequence ();
1701 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1702 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1703 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1704 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1705 res = get_insns ();
1706 end_sequence ();
1707 }
1708 break;
1709 case E_BImode:
1710 {
1711 rtx tmp = gen_reg_rtx (SImode);
1712
1713 start_sequence ();
1714 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1715 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1716 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1717 res = get_insns ();
1718 end_sequence ();
1719 }
1720 break;
1721 case E_QImode:
1722 case E_HImode:
1723 {
1724 rtx tmp = gen_reg_rtx (SImode);
1725
1726 start_sequence ();
1727 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1728 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1729 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1730 tmp)));
1731 res = get_insns ();
1732 end_sequence ();
1733 }
1734 break;
1735
1736 default:
1737 gcc_unreachable ();
1738 }
1739 return res;
1740 }
1741
1742 /* Generate an instruction or sequence to broadcast register REG
1743 across the vectors of a single warp. */
1744
1745 static rtx
1746 nvptx_gen_vcast (rtx reg)
1747 {
1748 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1749 }
1750
1751 /* Structure used when generating a worker-level spill or fill. */
1752
1753 struct wcast_data_t
1754 {
1755 rtx base; /* Register holding base addr of buffer. */
1756 rtx ptr; /* Iteration var, if needed. */
1757 unsigned offset; /* Offset into worker buffer. */
1758 };
1759
1760 /* Direction of the spill/fill and looping setup/teardown indicator. */
1761
1762 enum propagate_mask
1763 {
1764 PM_read = 1 << 0,
1765 PM_write = 1 << 1,
1766 PM_loop_begin = 1 << 2,
1767 PM_loop_end = 1 << 3,
1768
1769 PM_read_write = PM_read | PM_write
1770 };
1771
1772 /* Generate instruction(s) to spill or fill register REG to/from the
1773 worker broadcast array. PM indicates what is to be done, REP
1774 how many loop iterations will be executed (0 for not a loop). */
1775
1776 static rtx
1777 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1778 {
1779 rtx res;
1780 machine_mode mode = GET_MODE (reg);
1781
1782 switch (mode)
1783 {
1784 case E_BImode:
1785 {
1786 rtx tmp = gen_reg_rtx (SImode);
1787
1788 start_sequence ();
1789 if (pm & PM_read)
1790 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1791 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1792 if (pm & PM_write)
1793 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1794 res = get_insns ();
1795 end_sequence ();
1796 }
1797 break;
1798
1799 default:
1800 {
1801 rtx addr = data->ptr;
1802
1803 if (!addr)
1804 {
1805 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1806
1807 if (align > worker_bcast_align)
1808 worker_bcast_align = align;
1809 data->offset = (data->offset + align - 1) & ~(align - 1);
1810 addr = data->base;
1811 if (data->offset)
1812 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1813 }
1814
1815 addr = gen_rtx_MEM (mode, addr);
1816 if (pm == PM_read)
1817 res = gen_rtx_SET (addr, reg);
1818 else if (pm == PM_write)
1819 res = gen_rtx_SET (reg, addr);
1820 else
1821 gcc_unreachable ();
1822
1823 if (data->ptr)
1824 {
1825 /* We're using a ptr, increment it. */
1826 start_sequence ();
1827
1828 emit_insn (res);
1829 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1830 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1831 res = get_insns ();
1832 end_sequence ();
1833 }
1834 else
1835 rep = 1;
1836 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1837 }
1838 break;
1839 }
1840 return res;
1841 }
1842 \f
1843 /* Returns true if X is a valid address for use in a memory reference. */
1844
1845 static bool
1846 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1847 {
1848 enum rtx_code code = GET_CODE (x);
1849
1850 switch (code)
1851 {
1852 case REG:
1853 return true;
1854
1855 case PLUS:
1856 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1857 return true;
1858 return false;
1859
1860 case CONST:
1861 case SYMBOL_REF:
1862 case LABEL_REF:
1863 return true;
1864
1865 default:
1866 return false;
1867 }
1868 }
1869 \f
1870 /* Machinery to output constant initializers. When beginning an
1871 initializer, we decide on a fragment size (which is visible in ptx
1872 in the type used), and then all initializer data is buffered until
1873 a fragment is filled and ready to be written out. */
1874
1875 static struct
1876 {
1877 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1878 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1879 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1880 out. */
1881 unsigned size; /* Fragment size to accumulate. */
1882 unsigned offset; /* Offset within current fragment. */
1883 bool started; /* Whether we've output any initializer. */
1884 } init_frag;
1885
1886 /* The current fragment is full, write it out. SYM may provide a
1887 symbolic reference we should output, in which case the fragment
1888 value is the addend. */
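/* For instance, successive fragments accumulate into output such as
     = { 16, generic(sym) + 4, 0
   (SYM hypothetical); the closing " };" is emitted later by
   nvptx_assemble_decl_end. */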
1889
1890 static void
1891 output_init_frag (rtx sym)
1892 {
1893 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1894 unsigned HOST_WIDE_INT val = init_frag.val;
1895
1896 init_frag.started = true;
1897 init_frag.val = 0;
1898 init_frag.offset = 0;
1899 init_frag.remaining--;
1900
1901 if (sym)
1902 {
1903 bool function = (SYMBOL_REF_DECL (sym)
1904 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1905 if (!function)
1906 fprintf (asm_out_file, "generic(");
1907 output_address (VOIDmode, sym);
1908 if (!function)
1909 fprintf (asm_out_file, ")");
1910 if (val)
1911 fprintf (asm_out_file, " + ");
1912 }
1913
1914 if (!sym || val)
1915 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1916 }
1917
1918 /* Add value VAL of size SIZE to the data we're emitting, and keep
1919 writing out chunks as they fill up. */
1920
1921 static void
1922 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1923 {
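/* Mask VAL down to SIZE bytes. Shifting 2 by (bits - 1), rather than
   1 by bits, keeps the shift count in range even when SIZE covers the
   whole of HOST_WIDE_INT, where "1 << bits" would be undefined. */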
1924 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1925
1926 for (unsigned part = 0; size; size -= part)
1927 {
1928 val >>= part * BITS_PER_UNIT;
1929 part = init_frag.size - init_frag.offset;
1930 if (part > size)
1931 part = size;
1932
1933 unsigned HOST_WIDE_INT partial
1934 = val << (init_frag.offset * BITS_PER_UNIT);
1935 init_frag.val |= partial & init_frag.mask;
1936 init_frag.offset += part;
1937
1938 if (init_frag.offset == init_frag.size)
1939 output_init_frag (NULL);
1940 }
1941 }
1942
1943 /* Target hook for assembling integer object X of size SIZE. */
1944
1945 static bool
1946 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1947 {
1948 HOST_WIDE_INT val = 0;
1949
1950 switch (GET_CODE (x))
1951 {
1952 default:
1953 /* Let the generic machinery figure it out, usually for a
1954 CONST_WIDE_INT. */
1955 return false;
1956
1957 case CONST_INT:
1958 nvptx_assemble_value (INTVAL (x), size);
1959 break;
1960
1961 case CONST:
1962 x = XEXP (x, 0);
1963 gcc_assert (GET_CODE (x) == PLUS);
1964 val = INTVAL (XEXP (x, 1));
1965 x = XEXP (x, 0);
1966 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1967 /* FALLTHROUGH */
1968
1969 case SYMBOL_REF:
1970 gcc_assert (size == init_frag.size);
1971 if (init_frag.offset)
1972 sorry ("cannot emit unaligned pointers in ptx assembly");
1973
1974 nvptx_maybe_record_fnsym (x);
1975 init_frag.val = val;
1976 output_init_frag (x);
1977 break;
1978 }
1979
1980 return true;
1981 }
1982
1983 /* Output SIZE zero bytes. We ignore the FILE argument since the
1984 functions we're calling to perform the output just use
1985 asm_out_file. */
1986
1987 void
1988 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1989 {
1990 /* Finish the current fragment, if it's started. */
1991 if (init_frag.offset)
1992 {
1993 unsigned part = init_frag.size - init_frag.offset;
1994 if (part > size)
1995 part = (unsigned) size;
1996 size -= part;
1997 nvptx_assemble_value (0, part);
1998 }
1999
2000 /* If this skip doesn't terminate the initializer, write as many
2001 remaining pieces as possible directly. */
2002 if (size < init_frag.remaining * init_frag.size)
2003 {
2004 while (size >= init_frag.size)
2005 {
2006 size -= init_frag.size;
2007 output_init_frag (NULL_RTX);
2008 }
2009 if (size)
2010 nvptx_assemble_value (0, size);
2011 }
2012 }
2013
2014 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2015 ignore the FILE arg. */
2016
2017 void
2018 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2019 {
2020 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2021 nvptx_assemble_value (str[i], 1);
2022 }
2023
2024 /* Emit a PTX variable decl and prepare for emission of its
2025 initializer. NAME is the symbol name and SECTION the PTX data
2026 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2027 The caller has already emitted any indentation and linkage
2028 specifier. It is responsible for any initializer, terminating ;
2029 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2030 this is the opposite way round from what PTX wants them! */
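/* For example (a sketch, with a hypothetical name): a 20-byte global
   "int arr[5]" would come out as
     .global .align 4 .u32 arr[5]
   with the initializer, terminating ';' and newline left to the
   caller as described above. */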
2031
2032 static void
2033 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2034 const_tree type, HOST_WIDE_INT size, unsigned align)
2035 {
2036 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2037 && (TYPE_DOMAIN (type) == NULL_TREE);
2038
2039 while (TREE_CODE (type) == ARRAY_TYPE)
2040 type = TREE_TYPE (type);
2041
2042 if (TREE_CODE (type) == VECTOR_TYPE
2043 || TREE_CODE (type) == COMPLEX_TYPE)
2044 /* Neither vector nor complex types can contain the other. */
2045 type = TREE_TYPE (type);
2046
2047 unsigned elt_size = int_size_in_bytes (type);
2048
2049 /* Largest mode we're prepared to accept. For BLKmode types we
2050 don't know if it'll contain pointer constants, so have to choose
2051 pointer size, otherwise we can choose DImode. */
2052 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2053
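/* The fragment size is the largest power of two dividing the element
   size, capped at the size of ELT_MODE: OR-ing in the mode size and
   isolating the lowest set bit computes exactly that. */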
2054 elt_size |= GET_MODE_SIZE (elt_mode);
2055 elt_size &= -elt_size; /* Extract LSB set. */
2056
2057 init_frag.size = elt_size;
2058 /* Avoid undefined shift behavior by using '2'. */
2059 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2060 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2061 init_frag.val = 0;
2062 init_frag.offset = 0;
2063 init_frag.started = false;
2064 /* Size might not be a multiple of elt size, if there's an
2065 initialized trailing struct array with smaller type than
2066 elt_size. */
2067 init_frag.remaining = (size + elt_size - 1) / elt_size;
2068
2069 fprintf (file, "%s .align %d .u%d ",
2070 section, align / BITS_PER_UNIT,
2071 elt_size * BITS_PER_UNIT);
2072 assemble_name (file, name);
2073
2074 if (size)
2075 /* We make everything an array, to simplify any initialization
2076 emission. */
2077 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2078 else if (atype)
2079 fprintf (file, "[]");
2080 }
2081
2082 /* Called when the initializer for a decl has been completely output through
2083 combinations of the three functions above. */
2084
2085 static void
2086 nvptx_assemble_decl_end (void)
2087 {
2088 if (init_frag.offset)
2089 /* This can happen with a packed struct with trailing array member. */
2090 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2091 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2092 }
2093
2094 /* Output an uninitialized common or file-scope variable. */
2095
2096 void
2097 nvptx_output_aligned_decl (FILE *file, const char *name,
2098 const_tree decl, HOST_WIDE_INT size, unsigned align)
2099 {
2100 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2101
2102 /* If this is public, it is common. The nearest thing we have to
2103 common is weak. */
2104 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2105
2106 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2107 TREE_TYPE (decl), size, align);
2108 nvptx_assemble_decl_end ();
2109 }
2110
2111 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2112 writing a constant variable EXP with NAME and SIZE and its
2113 initializer to FILE. */
2114
2115 static void
2116 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2117 const_tree exp, HOST_WIDE_INT obj_size)
2118 {
2119 write_var_marker (file, true, false, name);
2120
2121 fprintf (file, "\t");
2122
2123 tree type = TREE_TYPE (exp);
2124 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2125 TYPE_ALIGN (type));
2126 }
2127
2128 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2129 a variable DECL with NAME to FILE. */
2130
2131 void
2132 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2133 {
2134 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2135
2136 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2137 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2138
2139 tree type = TREE_TYPE (decl);
2140 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2141 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2142 type, obj_size, DECL_ALIGN (decl));
2143 }
2144
2145 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2146
2147 static void
2148 nvptx_globalize_label (FILE *, const char *)
2149 {
2150 }
2151
2152 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2153 declaration only for variable DECL with NAME to FILE. */
2154
2155 static void
2156 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2157 {
2158 /* The middle end can place constant pool decls into the varpool as
2159 undefined. Until that is fixed, catch the problem here. */
2160 if (DECL_IN_CONSTANT_POOL (decl))
2161 return;
2162
2163 /* We support weak definitions, and hence have the right
2164 ASM_WEAKEN_DECL definition. Diagnose weak declarations here. */
2165 if (DECL_WEAK (decl))
2166 error_at (DECL_SOURCE_LOCATION (decl),
2167 "PTX does not support weak declarations"
2168 " (only weak definitions)");
2169 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2170
2171 fprintf (file, "\t.extern ");
2172 tree size = DECL_SIZE_UNIT (decl);
2173 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2174 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2175 DECL_ALIGN (decl));
2176 nvptx_assemble_decl_end ();
2177 }
2178
2179 /* Output a pattern for a move instruction. */
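/* A sketch of the cases below: same-mode register moves come out as
   "mov.u32 %r0, %r1;", a symbol in a non-generic data area as
   "cvta.global.u64 %r0, sym;", equal-size mode changes as a "mov.b"
   bitcast, and genuine size changes as a "cvt". The 128-bit
   TImode/V2DImode pair is split into two 64-bit moves, as PTX has no
   mov.b128. */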
2180
2181 const char *
2182 nvptx_output_mov_insn (rtx dst, rtx src)
2183 {
2184 machine_mode dst_mode = GET_MODE (dst);
2185 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2186 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2187 machine_mode src_inner = (GET_CODE (src) == SUBREG
2188 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2189
2190 rtx sym = src;
2191 if (GET_CODE (sym) == CONST)
2192 sym = XEXP (XEXP (sym, 0), 0);
2193 if (SYMBOL_REF_P (sym))
2194 {
2195 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2196 return "%.\tcvta%D1%t0\t%0, %1;";
2197 nvptx_maybe_record_fnsym (sym);
2198 }
2199
2200 if (src_inner == dst_inner)
2201 return "%.\tmov%t0\t%0, %1;";
2202
2203 if (CONSTANT_P (src))
2204 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2205 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2206 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2207
2208 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2209 {
2210 if (GET_MODE_BITSIZE (dst_mode) == 128
2211 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2212 {
2213 /* mov.b128 is not supported. */
2214 if (dst_inner == V2DImode && src_inner == TImode)
2215 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2216 else if (dst_inner == TImode && src_inner == V2DImode)
2217 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2218
2219 gcc_unreachable ();
2220 }
2221 return "%.\tmov.b%T0\t%0, %1;";
2222 }
2223
2224 return "%.\tcvt%t0%t1\t%0, %1;";
2225 }
2226
2227 static void nvptx_print_operand (FILE *, rtx, int);
2228
2229 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2230 involves writing .param declarations and in/out copies into them. For
2231 indirect calls, also write the .callprototype. */
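/* A sketch of the output for a direct call "int y = f (x);", with
   illustrative register and parameter names:
     {
       .param.u32 %value_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r23;
       call (%value_in), f, (%out_arg1);
       ld.param.u32 %r24, [%value_in];
     }
   Indirect calls additionally emit a .callprototype label and pass it
   as the final call operand. */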
2232
2233 const char *
2234 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2235 {
2236 char buf[16];
2237 static int labelno;
2238 bool needs_tgt = register_operand (callee, Pmode);
2239 rtx pat = PATTERN (insn);
2240 if (GET_CODE (pat) == COND_EXEC)
2241 pat = COND_EXEC_CODE (pat);
2242 int arg_end = XVECLEN (pat, 0);
2243 tree decl = NULL_TREE;
2244
2245 fprintf (asm_out_file, "\t{\n");
2246 if (result != NULL)
2247 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2248 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2249 reg_names[NVPTX_RETURN_REGNUM]);
2250
2251 /* Ensure we have a ptx declaration in the output if necessary. */
2252 if (GET_CODE (callee) == SYMBOL_REF)
2253 {
2254 decl = SYMBOL_REF_DECL (callee);
2255 if (!decl
2256 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2257 nvptx_record_libfunc (callee, result, pat);
2258 else if (DECL_EXTERNAL (decl))
2259 nvptx_record_fndecl (decl);
2260 }
2261
2262 if (needs_tgt)
2263 {
2264 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2265 labelno++;
2266 ASM_OUTPUT_LABEL (asm_out_file, buf);
2267 std::stringstream s;
2268 write_fn_proto_from_insn (s, NULL, result, pat);
2269 fputs (s.str().c_str(), asm_out_file);
2270 }
2271
2272 for (int argno = 1; argno < arg_end; argno++)
2273 {
2274 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2275 machine_mode mode = GET_MODE (t);
2276 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2277
2278 /* Mode splitting has already been done. */
2279 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2280 "\t\tst.param%s [%%out_arg%d], ",
2281 ptx_type, argno, ptx_type, argno);
2282 output_reg (asm_out_file, REGNO (t), VOIDmode);
2283 fprintf (asm_out_file, ";\n");
2284 }
2285
2286 /* The '.' stands for the call's predicate, if any. */
2287 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2288 fprintf (asm_out_file, "\t\tcall ");
2289 if (result != NULL_RTX)
2290 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2291
2292 if (decl)
2293 {
2294 const char *name = get_fnname_from_decl (decl);
2295 name = nvptx_name_replacement (name);
2296 assemble_name (asm_out_file, name);
2297 }
2298 else
2299 output_address (VOIDmode, callee);
2300
2301 const char *open = "(";
2302 for (int argno = 1; argno < arg_end; argno++)
2303 {
2304 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2305 open = "";
2306 }
2307 if (decl && DECL_STATIC_CHAIN (decl))
2308 {
2309 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2310 open = "";
2311 }
2312 if (!open[0])
2313 fprintf (asm_out_file, ")");
2314
2315 if (needs_tgt)
2316 {
2317 fprintf (asm_out_file, ", ");
2318 assemble_name (asm_out_file, buf);
2319 }
2320 fprintf (asm_out_file, ";\n");
2321
2322 if (find_reg_note (insn, REG_NORETURN, NULL))
2323 {
2324 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2325 the flow control barrier they imply. It can seg fault if it
2326 encounters what looks like an unexitable loop. Emit a trailing
2327 trap and exit, which it does grok. */
2328 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2329 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2330 }
2331
2332 if (result)
2333 {
2334 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2335
2336 if (!rval[0])
2337 /* We must escape the '%' that starts RETURN_REGNUM. */
2338 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2339 reg_names[NVPTX_RETURN_REGNUM]);
2340 return rval;
2341 }
2342
2343 return "}";
2344 }
2345
2346 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2347
2348 static bool
2349 nvptx_print_operand_punct_valid_p (unsigned char c)
2350 {
2351 return c == '.' || c == '#';
2352 }
2353
2354 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2355
2356 static void
2357 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2358 {
2359 rtx off;
2360 if (GET_CODE (x) == CONST)
2361 x = XEXP (x, 0);
2362 switch (GET_CODE (x))
2363 {
2364 case PLUS:
2365 off = XEXP (x, 1);
2366 output_address (VOIDmode, XEXP (x, 0));
2367 fprintf (file, "+");
2368 output_address (VOIDmode, off);
2369 break;
2370
2371 case SYMBOL_REF:
2372 case LABEL_REF:
2373 output_addr_const (file, x);
2374 break;
2375
2376 default:
2377 gcc_assert (GET_CODE (x) != MEM);
2378 nvptx_print_operand (file, x, 0);
2379 break;
2380 }
2381 }
2382
2383 /* Write assembly language output for the address ADDR to FILE. */
2384
2385 static void
2386 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2387 {
2388 nvptx_print_address_operand (file, addr, mode);
2389 }
2390
2391 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2392
2393 Meaning of CODE:
2394 . -- print the predicate for the instruction or an empty string for an
2395 unconditional one.
2396 # -- print a rounding mode for the instruction
2397
2398 A -- print a data area for a MEM
2399 c -- print an opcode suffix for a comparison operator, including a type code
2400 D -- print a data area for a MEM operand
2401 S -- print a shuffle kind specified by CONST_INT
2402 t -- print a type opcode suffix, promoting QImode to 32 bits
2403 T -- print a type size in bits
2404 u -- print a type opcode suffix without promotions. */
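/* Also handled, though not listed above: 'H' and 'L' print the high
   and low halves of a split register pair, and 'j' and 'J' print an
   '@' or '@!' predicate prefix before the operand. */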
2405
2406 static void
2407 nvptx_print_operand (FILE *file, rtx x, int code)
2408 {
2409 if (code == '.')
2410 {
2411 x = current_insn_predicate;
2412 if (x)
2413 {
2414 fputs ("@", file);
2415 if (GET_CODE (x) == EQ)
2416 fputs ("!", file);
2417 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2418 }
2419 return;
2420 }
2421 else if (code == '#')
2422 {
2423 fputs (".rn", file);
2424 return;
2425 }
2426
2427 enum rtx_code x_code = GET_CODE (x);
2428 machine_mode mode = GET_MODE (x);
2429
2430 switch (code)
2431 {
2432 case 'A':
2433 x = XEXP (x, 0);
2434 /* FALLTHROUGH. */
2435
2436 case 'D':
2437 if (GET_CODE (x) == CONST)
2438 x = XEXP (x, 0);
2439 if (GET_CODE (x) == PLUS)
2440 x = XEXP (x, 0);
2441
2442 if (GET_CODE (x) == SYMBOL_REF)
2443 fputs (section_for_sym (x), file);
2444 break;
2445
2446 case 't':
2447 case 'u':
2448 if (x_code == SUBREG)
2449 {
2450 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2451 if (VECTOR_MODE_P (inner_mode)
2452 && (GET_MODE_SIZE (mode)
2453 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2454 mode = GET_MODE_INNER (inner_mode);
2455 else if (split_mode_p (inner_mode))
2456 mode = maybe_split_mode (inner_mode);
2457 else
2458 mode = inner_mode;
2459 }
2460 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2461 break;
2462
2463 case 'H':
2464 case 'L':
2465 {
2466 rtx inner_x = SUBREG_REG (x);
2467 machine_mode inner_mode = GET_MODE (inner_x);
2468 machine_mode split = maybe_split_mode (inner_mode);
2469
2470 output_reg (file, REGNO (inner_x), split,
2471 (code == 'H'
2472 ? GET_MODE_SIZE (inner_mode) / 2
2473 : 0));
2474 }
2475 break;
2476
2477 case 'S':
2478 {
2479 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2480 /* Same order as nvptx_shuffle_kind. */
2481 static const char *const kinds[] =
2482 {".up", ".down", ".bfly", ".idx"};
2483 fputs (kinds[kind], file);
2484 }
2485 break;
2486
2487 case 'T':
2488 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2489 break;
2490
2491 case 'j':
2492 fprintf (file, "@");
2493 goto common;
2494
2495 case 'J':
2496 fprintf (file, "@!");
2497 goto common;
2498
2499 case 'c':
2500 mode = GET_MODE (XEXP (x, 0));
2501 switch (x_code)
2502 {
2503 case EQ:
2504 fputs (".eq", file);
2505 break;
2506 case NE:
2507 if (FLOAT_MODE_P (mode))
2508 fputs (".neu", file);
2509 else
2510 fputs (".ne", file);
2511 break;
2512 case LE:
2513 case LEU:
2514 fputs (".le", file);
2515 break;
2516 case GE:
2517 case GEU:
2518 fputs (".ge", file);
2519 break;
2520 case LT:
2521 case LTU:
2522 fputs (".lt", file);
2523 break;
2524 case GT:
2525 case GTU:
2526 fputs (".gt", file);
2527 break;
2528 case LTGT:
2529 fputs (".ne", file);
2530 break;
2531 case UNEQ:
2532 fputs (".equ", file);
2533 break;
2534 case UNLE:
2535 fputs (".leu", file);
2536 break;
2537 case UNGE:
2538 fputs (".geu", file);
2539 break;
2540 case UNLT:
2541 fputs (".ltu", file);
2542 break;
2543 case UNGT:
2544 fputs (".gtu", file);
2545 break;
2546 case UNORDERED:
2547 fputs (".nan", file);
2548 break;
2549 case ORDERED:
2550 fputs (".num", file);
2551 break;
2552 default:
2553 gcc_unreachable ();
2554 }
2555 if (FLOAT_MODE_P (mode)
2556 || x_code == EQ || x_code == NE
2557 || x_code == GEU || x_code == GTU
2558 || x_code == LEU || x_code == LTU)
2559 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2560 else
2561 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2562 break;
2563 default:
2564 common:
2565 switch (x_code)
2566 {
2567 case SUBREG:
2568 {
2569 rtx inner_x = SUBREG_REG (x);
2570 machine_mode inner_mode = GET_MODE (inner_x);
2571 machine_mode split = maybe_split_mode (inner_mode);
2572
2573 if (VECTOR_MODE_P (inner_mode)
2574 && (GET_MODE_SIZE (mode)
2575 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2576 {
2577 output_reg (file, REGNO (inner_x), VOIDmode);
2578 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2579 }
2580 else if (split_mode_p (inner_mode)
2581 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2582 output_reg (file, REGNO (inner_x), split);
2583 else
2584 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2585 }
2586 break;
2587
2588 case REG:
2589 output_reg (file, REGNO (x), maybe_split_mode (mode));
2590 break;
2591
2592 case MEM:
2593 fputc ('[', file);
2594 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2595 fputc (']', file);
2596 break;
2597
2598 case CONST_INT:
2599 output_addr_const (file, x);
2600 break;
2601
2602 case CONST:
2603 case SYMBOL_REF:
2604 case LABEL_REF:
2605 /* We could use output_addr_const, but that can print things like
2606 "x-8", which breaks ptxas. Need to ensure it is output as
2607 "x+-8". */
2608 nvptx_print_address_operand (file, x, VOIDmode);
2609 break;
2610
2611 case CONST_DOUBLE:
2612 long vals[2];
2613 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2614 vals[0] &= 0xffffffff;
2615 vals[1] &= 0xffffffff;
2616 if (mode == SFmode)
2617 fprintf (file, "0f%08lx", vals[0]);
2618 else
2619 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2620 break;
2621
2622 case CONST_VECTOR:
2623 {
2624 unsigned n = CONST_VECTOR_NUNITS (x);
2625 fprintf (file, "{ ");
2626 for (unsigned i = 0; i < n; ++i)
2627 {
2628 if (i != 0)
2629 fprintf (file, ", ");
2630
2631 rtx elem = CONST_VECTOR_ELT (x, i);
2632 output_addr_const (file, elem);
2633 }
2634 fprintf (file, " }");
2635 }
2636 break;
2637
2638 default:
2639 output_addr_const (file, x);
2640 }
2641 }
2642 }
2643 \f
2644 /* Record replacement regs used to deal with subreg operands. */
2645 struct reg_replace
2646 {
2647 rtx replacement[MAX_RECOG_OPERANDS];
2648 machine_mode mode;
2649 int n_allocated;
2650 int n_in_use;
2651 };
2652
2653 /* Allocate or reuse a replacement in R and return the rtx. */
2654
2655 static rtx
2656 get_replacement (struct reg_replace *r)
2657 {
2658 if (r->n_allocated == r->n_in_use)
2659 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2660 return r->replacement[r->n_in_use++];
2661 }
2662
2663 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2664 the presence of subregs would break the rules for most instructions.
2665 Replace them with a suitable new register of the right size, plus
2666 conversion copyin/copyout instructions. */
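/* For example (a sketch): an input operand (subreg:QI (reg:SI R) 0)
   is replaced by a fresh QImode register NEW, with
     (set NEW (truncate:QI R))
   emitted before the insn; for an output operand the inverse
   (set R (zero_extend:SI NEW)) is emitted after it. */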
2667
2668 static void
2669 nvptx_reorg_subreg (void)
2670 {
2671 struct reg_replace qiregs, hiregs, siregs, diregs;
2672 rtx_insn *insn, *next;
2673
2674 qiregs.n_allocated = 0;
2675 hiregs.n_allocated = 0;
2676 siregs.n_allocated = 0;
2677 diregs.n_allocated = 0;
2678 qiregs.mode = QImode;
2679 hiregs.mode = HImode;
2680 siregs.mode = SImode;
2681 diregs.mode = DImode;
2682
2683 for (insn = get_insns (); insn; insn = next)
2684 {
2685 next = NEXT_INSN (insn);
2686 if (!NONDEBUG_INSN_P (insn)
2687 || asm_noperands (PATTERN (insn)) >= 0
2688 || GET_CODE (PATTERN (insn)) == USE
2689 || GET_CODE (PATTERN (insn)) == CLOBBER)
2690 continue;
2691
2692 qiregs.n_in_use = 0;
2693 hiregs.n_in_use = 0;
2694 siregs.n_in_use = 0;
2695 diregs.n_in_use = 0;
2696 extract_insn (insn);
2697 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2698
2699 for (int i = 0; i < recog_data.n_operands; i++)
2700 {
2701 rtx op = recog_data.operand[i];
2702 if (GET_CODE (op) != SUBREG)
2703 continue;
2704
2705 rtx inner = SUBREG_REG (op);
2706
2707 machine_mode outer_mode = GET_MODE (op);
2708 machine_mode inner_mode = GET_MODE (inner);
2709 gcc_assert (s_ok);
2710 if (s_ok
2711 && (GET_MODE_PRECISION (inner_mode)
2712 >= GET_MODE_PRECISION (outer_mode)))
2713 continue;
2714 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2715 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2716 : outer_mode == HImode ? &hiregs
2717 : outer_mode == SImode ? &siregs
2718 : &diregs);
2719 rtx new_reg = get_replacement (r);
2720
2721 if (recog_data.operand_type[i] != OP_OUT)
2722 {
2723 enum rtx_code code;
2724 if (GET_MODE_PRECISION (inner_mode)
2725 < GET_MODE_PRECISION (outer_mode))
2726 code = ZERO_EXTEND;
2727 else
2728 code = TRUNCATE;
2729
2730 rtx pat = gen_rtx_SET (new_reg,
2731 gen_rtx_fmt_e (code, outer_mode, inner));
2732 emit_insn_before (pat, insn);
2733 }
2734
2735 if (recog_data.operand_type[i] != OP_IN)
2736 {
2737 enum rtx_code code;
2738 if (GET_MODE_PRECISION (inner_mode)
2739 < GET_MODE_PRECISION (outer_mode))
2740 code = TRUNCATE;
2741 else
2742 code = ZERO_EXTEND;
2743
2744 rtx pat = gen_rtx_SET (inner,
2745 gen_rtx_fmt_e (code, inner_mode, new_reg));
2746 emit_insn_after (pat, insn);
2747 }
2748 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2749 }
2750 }
2751 }
2752
2753 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2754 first use. */
2755
2756 static rtx
2757 nvptx_get_unisimt_master ()
2758 {
2759 rtx &master = cfun->machine->unisimt_master;
2760 return master ? master : master = gen_reg_rtx (SImode);
2761 }
2762
2763 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2764
2765 static rtx
2766 nvptx_get_unisimt_predicate ()
2767 {
2768 rtx &pred = cfun->machine->unisimt_predicate;
2769 return pred ? pred : pred = gen_reg_rtx (BImode);
2770 }
2771
2772 /* Return true if given call insn references one of the functions provided by
2773 the CUDA runtime: malloc, free, vprintf. */
2774
2775 static bool
2776 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2777 {
2778 rtx pat = PATTERN (insn);
2779 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2780 pat = XVECEXP (pat, 0, 0);
2781 if (GET_CODE (pat) == SET)
2782 pat = SET_SRC (pat);
2783 gcc_checking_assert (GET_CODE (pat) == CALL
2784 && GET_CODE (XEXP (pat, 0)) == MEM);
2785 rtx addr = XEXP (XEXP (pat, 0), 0);
2786 if (GET_CODE (addr) != SYMBOL_REF)
2787 return false;
2788 const char *name = XSTR (addr, 0);
2789 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2790 references with forced assembler name refer to PTX syscalls. For vprintf,
2791 accept both normal and forced-assembler-name references. */
2792 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2793 || !strcmp (name, "*malloc")
2794 || !strcmp (name, "*free"));
2795 }
2796
2797 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2798 propagate its value from lane MASTER to current lane. */
2799
2800 static void
2801 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2802 {
2803 rtx reg;
2804 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2805 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2806 }
2807
2808 /* Adjust code for uniform-simt code generation variant by making atomics and
2809 "syscalls" conditionally executed, and inserting shuffle-based propagation
2810 for registers being set. */
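/* For instance, under -muniform-simt an atomic update executes only in
   the master lane and the result is then shuffled to the others;
   roughly (a sketch):
     @%p0 atom.global.add.u32 %r1, [%r2], %r3;
     shfl.idx.b32 %r1, %r1, %master, 31;
   so every lane of the warp observes the same value. */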
2811
2812 static void
2813 nvptx_reorg_uniform_simt ()
2814 {
2815 rtx_insn *insn, *next;
2816
2817 for (insn = get_insns (); insn; insn = next)
2818 {
2819 next = NEXT_INSN (insn);
2820 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2821 && !(NONJUMP_INSN_P (insn)
2822 && GET_CODE (PATTERN (insn)) == PARALLEL
2823 && get_attr_atomic (insn)))
2824 continue;
2825 rtx pat = PATTERN (insn);
2826 rtx master = nvptx_get_unisimt_master ();
2827 for (int i = 0; i < XVECLEN (pat, 0); i++)
2828 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2829 rtx pred = nvptx_get_unisimt_predicate ();
2830 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2831 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2832 validate_change (insn, &PATTERN (insn), pat, false);
2833 }
2834 }
2835
2836 /* Loop structure of the function. The entire function is described as
2837 a NULL loop. */
2838
2839 struct parallel
2840 {
2841 /* Parent parallel. */
2842 parallel *parent;
2843
2844 /* Next sibling parallel. */
2845 parallel *next;
2846
2847 /* First child parallel. */
2848 parallel *inner;
2849
2850 /* Partitioning mask of the parallel. */
2851 unsigned mask;
2852
2853 /* Partitioning used within inner parallels. */
2854 unsigned inner_mask;
2855
2856 /* Location of parallel forked and join. The forked is the first
2857 block in the parallel and the join is the first block after
2858 the partition. */
2859 basic_block forked_block;
2860 basic_block join_block;
2861
2862 rtx_insn *forked_insn;
2863 rtx_insn *join_insn;
2864
2865 rtx_insn *fork_insn;
2866 rtx_insn *joining_insn;
2867
2868 /* Basic blocks in this parallel, but not in child parallels. The
2869 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2870 blocks are not. */
2871 auto_vec<basic_block> blocks;
2872
2873 public:
2874 parallel (parallel *parent, unsigned mask);
2875 ~parallel ();
2876 };
2877
2878 /* Constructor links the new parallel into its parent's chain of
2879 children. */
2880
2881 parallel::parallel (parallel *parent_, unsigned mask_)
2882 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2883 {
2884 forked_block = join_block = 0;
2885 forked_insn = join_insn = 0;
2886 fork_insn = joining_insn = 0;
2887
2888 if (parent)
2889 {
2890 next = parent->inner;
2891 parent->inner = this;
2892 }
2893 }
2894
2895 parallel::~parallel ()
2896 {
2897 delete inner;
2898 delete next;
2899 }
2900
2901 /* Map of basic blocks to insns. */
2902 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2903
2904 /* A tuple of an insn of interest and the BB in which it resides. */
2905 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2906 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2907
2908 /* Split basic blocks such that each forked and join unspec is at
2909 the start of its basic block. Thus afterwards each block will
2910 have a single partitioning mode. We also do the same for return
2911 insns, as they are executed by every thread. Return the
2912 partitioning mode of the function as a whole. Populate MAP with
2913 head and tail blocks. We also clear the BB visited flag, which is
2914 used when finding partitions. */
2915
2916 static void
2917 nvptx_split_blocks (bb_insn_map_t *map)
2918 {
2919 insn_bb_vec_t worklist;
2920 basic_block block;
2921 rtx_insn *insn;
2922
2923 /* Locate all the reorg instructions of interest. */
2924 FOR_ALL_BB_FN (block, cfun)
2925 {
2926 bool seen_insn = false;
2927
2928 /* Clear visited flag, for use by parallel locator. */
2929 block->flags &= ~BB_VISITED;
2930
2931 FOR_BB_INSNS (block, insn)
2932 {
2933 if (!INSN_P (insn))
2934 continue;
2935 switch (recog_memoized (insn))
2936 {
2937 default:
2938 seen_insn = true;
2939 continue;
2940 case CODE_FOR_nvptx_forked:
2941 case CODE_FOR_nvptx_join:
2942 break;
2943
2944 case CODE_FOR_return:
2945 /* We also need to split just before return insns, as
2946 that insn needs executing by all threads, but the
2947 block it is in probably does not. */
2948 break;
2949 }
2950
2951 if (seen_insn)
2952 /* We've found an instruction that must be at the start of
2953 a block, but isn't. Add it to the worklist. */
2954 worklist.safe_push (insn_bb_t (insn, block));
2955 else
2956 /* It was already the first instruction. Just add it to
2957 the map. */
2958 map->get_or_insert (block) = insn;
2959 seen_insn = true;
2960 }
2961 }
2962
2963 /* Split blocks on the worklist. */
2964 unsigned ix;
2965 insn_bb_t *elt;
2966 basic_block remap = 0;
2967 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2968 {
2969 if (remap != elt->second)
2970 {
2971 block = elt->second;
2972 remap = block;
2973 }
2974
2975 /* Split block before insn. The insn is in the new block. */
2976 edge e = split_block (block, PREV_INSN (elt->first));
2977
2978 block = e->dest;
2979 map->get_or_insert (block) = elt->first;
2980 }
2981 }
2982
2983 /* BLOCK is a basic block containing a head or tail instruction.
2984 Locate the associated prehead or pretail instruction, which must be
2985 in the single predecessor block. */
2986
2987 static rtx_insn *
2988 nvptx_discover_pre (basic_block block, int expected)
2989 {
2990 gcc_assert (block->preds->length () == 1);
2991 basic_block pre_block = (*block->preds)[0]->src;
2992 rtx_insn *pre_insn;
2993
2994 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2995 pre_insn = PREV_INSN (pre_insn))
2996 gcc_assert (pre_insn != BB_HEAD (pre_block));
2997
2998 gcc_assert (recog_memoized (pre_insn) == expected);
2999 return pre_insn;
3000 }
3001
3002 /* Dump this parallel and all its inner parallels. */
3003
3004 static void
3005 nvptx_dump_pars (parallel *par, unsigned depth)
3006 {
3007 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3008 depth, par->mask,
3009 par->forked_block ? par->forked_block->index : -1,
3010 par->join_block ? par->join_block->index : -1);
3011
3012 fprintf (dump_file, " blocks:");
3013
3014 basic_block block;
3015 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3016 fprintf (dump_file, " %d", block->index);
3017 fprintf (dump_file, "\n");
3018 if (par->inner)
3019 nvptx_dump_pars (par->inner, depth + 1);
3020
3021 if (par->next)
3022 nvptx_dump_pars (par->next, depth);
3023 }
3024
3025 /* If BLOCK contains a fork/join marker, process it to create or
3026 terminate a loop structure. Add this block to the current loop,
3027 and then walk successor blocks. */
3028
3029 static parallel *
3030 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3031 {
3032 if (block->flags & BB_VISITED)
3033 return par;
3034 block->flags |= BB_VISITED;
3035
3036 if (rtx_insn **endp = map->get (block))
3037 {
3038 rtx_insn *end = *endp;
3039
3040 /* This is a block head or tail, or return instruction. */
3041 switch (recog_memoized (end))
3042 {
3043 case CODE_FOR_return:
3044 /* Return instructions are in their own block, and we
3045 don't need to do anything more. */
3046 return par;
3047
3048 case CODE_FOR_nvptx_forked:
3049 /* Loop head, create a new inner loop and add it into
3050 our parent's child list. */
3051 {
3052 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3053
3054 gcc_assert (mask);
3055 par = new parallel (par, mask);
3056 par->forked_block = block;
3057 par->forked_insn = end;
3058 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3059 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3060 par->fork_insn
3061 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3062 }
3063 break;
3064
3065 case CODE_FOR_nvptx_join:
3066 /* A loop tail. Finish the current loop and return to
3067 parent. */
3068 {
3069 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3070
3071 gcc_assert (par->mask == mask);
3072 par->join_block = block;
3073 par->join_insn = end;
3074 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3075 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3076 par->joining_insn
3077 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3078 par = par->parent;
3079 }
3080 break;
3081
3082 default:
3083 gcc_unreachable ();
3084 }
3085 }
3086
3087 if (par)
3088 /* Add this block onto the current loop's list of blocks. */
3089 par->blocks.safe_push (block);
3090 else
3091 /* This must be the entry block. Create a NULL parallel. */
3092 par = new parallel (0, 0);
3093
3094 /* Walk successor blocks. */
3095 edge e;
3096 edge_iterator ei;
3097
3098 FOR_EACH_EDGE (e, ei, block->succs)
3099 nvptx_find_par (map, par, e->dest);
3100
3101 return par;
3102 }
3103
3104 /* DFS walk the CFG looking for fork & join markers. Construct
3105 loop structures as we go. MAP is a mapping of basic blocks
3106 to head & tail markers, discovered when splitting blocks. This
3107 speeds up the discovery. We rely on the BB visited flag having
3108 been cleared when splitting blocks. */
3109
3110 static parallel *
3111 nvptx_discover_pars (bb_insn_map_t *map)
3112 {
3113 basic_block block;
3114
3115 /* Mark exit blocks as visited. */
3116 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3117 block->flags |= BB_VISITED;
3118
3119 /* And entry block as not. */
3120 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3121 block->flags &= ~BB_VISITED;
3122
3123 parallel *par = nvptx_find_par (map, 0, block);
3124
3125 if (dump_file)
3126 {
3127 fprintf (dump_file, "\nLoops\n");
3128 nvptx_dump_pars (par, 0);
3129 fprintf (dump_file, "\n");
3130 }
3131
3132 return par;
3133 }
3134
3135 /* Analyse a group of BBs within a partitioned region and create N
3136 Single-Entry-Single-Exit regions. Some of those regions will be
3137 trivial ones consisting of a single BB. The blocks of a
3138 partitioned region might form a set of disjoint graphs -- because
3139 the region encloses a differently partitioned sub-region.
3140
3141 We use the linear time algorithm described in 'Finding Regions Fast:
3142 Single Entry Single Exit and control Regions in Linear Time'
3143 Johnson, Pearson & Pingali. That algorithm deals with complete
3144 CFGs, where a back edge is inserted from END to START, and thus the
3145 problem becomes one of finding equivalent loops.
3146
3147 In this case we have a partial CFG. We complete it by redirecting
3148 any incoming edge to the graph to be from an arbitrary external BB,
3149 and similarly redirecting any outgoing edge to be to that BB.
3150 Thus we end up with a closed graph.
3151
3152 The algorithm works by building a spanning tree of an undirected
3153 graph and keeping track of back edges from nodes further from the
3154 root in the tree to nodes nearer to the root in the tree. In the
3155 description below, the root is up and the tree grows downwards.
3156
3157 We avoid having to deal with degenerate back-edges to the same
3158 block, by splitting each BB into 3 -- one for input edges, one for
3159 the node itself and one for the output edges. Such back edges are
3160 referred to as 'Brackets'. Cycle equivalent nodes will have the
3161 same set of brackets.
3162
3163 Determining bracket equivalency is done by maintaining a list of
3164 brackets in such a manner that the list length and final bracket
3165 uniquely identify the set.
3166
3167 We use coloring to mark all BBs with cycle equivalency with the
3168 same color. This is the output of the 'Finding Regions Fast'
3169 algorithm. Notice it doesn't actually find the set of nodes within
3170 a particular region, just unordered sets of nodes that are the
3171 entries and exits of SESE regions.
3172
3173 After determining cycle equivalency, we need to find the minimal
3174 set of SESE regions. Do this with a DFS coloring walk of the
3175 complete graph. We're either 'looking' or 'coloring'. When
3176 looking, and we're in the subgraph, we start coloring the color of
3177 the current node, and remember that node as the start of the
3178 current color's SESE region. Every time we go to a new node, we
3179 decrement the count of nodes with that color. If it reaches zero,
3180 we remember that node as the end of the current color's SESE region
3181 and return to 'looking'. Otherwise we color the node the current
3182 color.
3183
3184 This way we end up with coloring the inside of non-trivial SESE
3185 regions with the color of that region. */
3186
3187 /* A pair of BBs. We use this to represent SESE regions. */
3188 typedef std::pair<basic_block, basic_block> bb_pair_t;
3189 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3190
3191 /* A node in the undirected CFG. The discriminator SECOND indicates just
3192 above or just below the BB indicated by FIRST. */
3193 typedef std::pair<basic_block, int> pseudo_node_t;
3194
3195 /* A bracket indicates an edge towards the root of the spanning tree of the
3196 undirected graph. Each bracket has a color, determined
3197 from the current set of brackets. */
3198 struct bracket
3199 {
3200 pseudo_node_t back; /* Back target */
3201
3202 /* Current color and size of set. */
3203 unsigned color;
3204 unsigned size;
3205
3206 bracket (pseudo_node_t back_)
3207 : back (back_), color (~0u), size (~0u)
3208 {
3209 }
3210
3211 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3212 {
3213 if (length != size)
3214 {
3215 size = length;
3216 color = color_counts.length ();
3217 color_counts.quick_push (0);
3218 }
3219 color_counts[color]++;
3220 return color;
3221 }
3222 };
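/* Two nodes are cycle equivalent iff their bracket lists have the same
   length and the same topmost bracket; get_color above memoizes this
   by allocating a fresh color the first time a given bracket is seen
   with a given list length, and counting the nodes per color. */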
3223
3224 typedef auto_vec<bracket> bracket_vec_t;
3225
3226 /* Basic block info for finding SESE regions. */
3227
3228 struct bb_sese
3229 {
3230 int node; /* Node number in spanning tree. */
3231 int parent; /* Parent node number. */
3232
3233 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3234 edges arrive at pseudo-node Ai and the outgoing edges leave at
3235 pseudo-node Ao. We have to remember which way we arrived at a
3236 particular node when generating the spanning tree. dir > 0 means
3237 we arrived at Ai, dir < 0 means we arrived at Ao. */
3238 int dir;
3239
3240 /* Lowest numbered pseudo-node reached via a backedge from this
3241 node, or any descendant. */
3242 pseudo_node_t high;
3243
3244 int color; /* Cycle-equivalence color */
3245
3246 /* Stack of brackets for this node. */
3247 bracket_vec_t brackets;
3248
3249 bb_sese (unsigned node_, unsigned p, int dir_)
3250 :node (node_), parent (p), dir (dir_)
3251 {
3252 }
3253 ~bb_sese ();
3254
3255 /* Push a bracket ending at BACK. */
3256 void push (const pseudo_node_t &back)
3257 {
3258 if (dump_file)
3259 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3260 back.first ? back.first->index : 0, back.second);
3261 brackets.safe_push (bracket (back));
3262 }
3263
3264 void append (bb_sese *child);
3265 void remove (const pseudo_node_t &);
3266
3267 /* Set node's color. */
3268 void set_color (auto_vec<unsigned> &color_counts)
3269 {
3270 color = brackets.last ().get_color (color_counts, brackets.length ());
3271 }
3272 };
3273
3274 bb_sese::~bb_sese ()
3275 {
3276 }
3277
3278 /* Destructively append CHILD's brackets. */
3279
3280 void
3281 bb_sese::append (bb_sese *child)
3282 {
3283 if (int len = child->brackets.length ())
3284 {
3285 int ix;
3286
3287 if (dump_file)
3288 {
3289 for (ix = 0; ix < len; ix++)
3290 {
3291 const pseudo_node_t &pseudo = child->brackets[ix].back;
3292 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3293 child->node, pseudo.first ? pseudo.first->index : 0,
3294 pseudo.second);
3295 }
3296 }
3297 if (!brackets.length ())
3298 std::swap (brackets, child->brackets);
3299 else
3300 {
3301 brackets.reserve (len);
3302 for (ix = 0; ix < len; ix++)
3303 brackets.quick_push (child->brackets[ix]);
3304 }
3305 }
3306 }
3307
3308 /* Remove brackets that terminate at PSEUDO. */
3309
3310 void
3311 bb_sese::remove (const pseudo_node_t &pseudo)
3312 {
3313 unsigned removed = 0;
3314 int len = brackets.length ();
3315
3316 for (int ix = 0; ix < len; ix++)
3317 {
3318 if (brackets[ix].back == pseudo)
3319 {
3320 if (dump_file)
3321 fprintf (dump_file, "Removing backedge %d:%+d\n",
3322 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3323 removed++;
3324 }
3325 else if (removed)
3326 brackets[ix-removed] = brackets[ix];
3327 }
3328 while (removed--)
3329 brackets.pop ();
3330 }
3331
3332 /* Accessors for BB's aux pointer. */
3333 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3334 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3335
3336 /* DFS walk creating SESE data structures. Only cover nodes with
3337 BB_VISITED set. Append discovered blocks to LIST. We number in
3338 increments of 3 so that the above and below pseudo nodes can be
3339 implicitly numbered too. */
3340
3341 static int
3342 nvptx_sese_number (int n, int p, int dir, basic_block b,
3343 auto_vec<basic_block> *list)
3344 {
3345 if (BB_GET_SESE (b))
3346 return n;
3347
3348 if (dump_file)
3349 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3350 b->index, n, p, dir);
3351
3352 BB_SET_SESE (b, new bb_sese (n, p, dir));
3353 p = n;
3354
3355 n += 3;
3356 list->quick_push (b);
3357
3358 /* First walk the nodes on the 'other side' of this node, then walk
3359 the nodes on the same side. */
3360 for (unsigned ix = 2; ix; ix--)
3361 {
3362 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3363 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3364 : offsetof (edge_def, src));
3365 edge e;
3366 edge_iterator ei;
3367
3368 FOR_EACH_EDGE (e, ei, edges)
3369 {
3370 basic_block target = *(basic_block *)((char *)e + offset);
3371
3372 if (target->flags & BB_VISITED)
3373 n = nvptx_sese_number (n, p, dir, target, list);
3374 }
3375 dir = -dir;
3376 }
3377 return n;
3378 }
3379
3380 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3381 EDGES are the outgoing edges and OFFSET is the offset to the src
3382 or dst block on the edges. */
3383
3384 static void
3385 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3386 vec<edge, va_gc> *edges, size_t offset)
3387 {
3388 edge e;
3389 edge_iterator ei;
3390 int hi_back = depth;
3391 pseudo_node_t node_back (0, depth);
3392 int hi_child = depth;
3393 pseudo_node_t node_child (0, depth);
3394 basic_block child = NULL;
3395 unsigned num_children = 0;
3396 int usd = -dir * sese->dir;
3397
3398 if (dump_file)
3399 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3400 me->index, sese->node, dir);
3401
3402 if (dir < 0)
3403 {
3404 /* This is the above pseudo-child. It has the BB itself as an
3405 additional child node. */
3406 node_child = sese->high;
3407 hi_child = node_child.second;
3408 if (node_child.first)
3409 hi_child += BB_GET_SESE (node_child.first)->node;
3410 num_children++;
3411 }
3412
3413 /* Examine each edge.
3414 - if it is a child (a) append its bracket list and (b) record
3415 whether it is the child with the highest reaching bracket.
3416 - if it is an edge to ancestor, record whether it's the highest
3417 reaching backlink. */
3418 FOR_EACH_EDGE (e, ei, edges)
3419 {
3420 basic_block target = *(basic_block *)((char *)e + offset);
3421
3422 if (bb_sese *t_sese = BB_GET_SESE (target))
3423 {
3424 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3425 {
3426 /* Child node. Append its bracket list. */
3427 num_children++;
3428 sese->append (t_sese);
3429
3430 /* Compare its hi value. */
3431 int t_hi = t_sese->high.second;
3432
3433 if (basic_block child_hi_block = t_sese->high.first)
3434 t_hi += BB_GET_SESE (child_hi_block)->node;
3435
3436 if (hi_child > t_hi)
3437 {
3438 hi_child = t_hi;
3439 node_child = t_sese->high;
3440 child = target;
3441 }
3442 }
3443 else if (t_sese->node < sese->node + dir
3444 && !(dir < 0 && sese->parent == t_sese->node))
3445 {
3446 /* Non-parental ancestor node -- a backlink. */
3447 int d = usd * t_sese->dir;
3448 int back = t_sese->node + d;
3449
3450 if (hi_back > back)
3451 {
3452 hi_back = back;
3453 node_back = pseudo_node_t (target, d);
3454 }
3455 }
3456 }
3457 else
3458 { /* Fallen off graph, backlink to entry node. */
3459 hi_back = 0;
3460 node_back = pseudo_node_t (0, 0);
3461 }
3462 }
3463
3464 /* Remove any brackets that terminate at this pseudo node. */
3465 sese->remove (pseudo_node_t (me, dir));
3466
3467 /* Now push any backlinks from this pseudo node. */
3468 FOR_EACH_EDGE (e, ei, edges)
3469 {
3470 basic_block target = *(basic_block *)((char *)e + offset);
3471 if (bb_sese *t_sese = BB_GET_SESE (target))
3472 {
3473 if (t_sese->node < sese->node + dir
3474 && !(dir < 0 && sese->parent == t_sese->node))
3475 /* Non-parental ancestor node - backedge from me. */
3476 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3477 }
3478 else
3479 {
3480 /* back edge to entry node */
3481 sese->push (pseudo_node_t (0, 0));
3482 }
3483 }
3484
3485 /* If this node leads directly or indirectly to a no-return region of
3486 the graph, then fake a backedge to entry node. */
3487 if (!sese->brackets.length () || !edges || !edges->length ())
3488 {
3489 hi_back = 0;
3490 node_back = pseudo_node_t (0, 0);
3491 sese->push (node_back);
3492 }
3493
3494 /* Record the highest reaching backedge from us or a descendant. */
3495 sese->high = hi_back < hi_child ? node_back : node_child;
3496
3497 if (num_children > 1)
3498 {
3499 /* There is more than one child -- this is a Y shaped piece of
3500 spanning tree. We have to insert a fake backedge from this
3501 node to the highest ancestor reached by not-the-highest
3502 reaching child. Note that there may be multiple children
3503 with backedges to the same highest node. That's ok and we
3504 insert the edge to that highest node. */
3505 hi_child = depth;
3506 if (dir < 0 && child)
3507 {
3508 node_child = sese->high;
3509 hi_child = node_child.second;
3510 if (node_child.first)
3511 hi_child += BB_GET_SESE (node_child.first)->node;
3512 }
3513
3514 FOR_EACH_EDGE (e, ei, edges)
3515 {
3516 basic_block target = *(basic_block *)((char *)e + offset);
3517
3518 if (target == child)
3519 /* Ignore the highest child. */
3520 continue;
3521
3522 bb_sese *t_sese = BB_GET_SESE (target);
3523 if (!t_sese)
3524 continue;
3525 if (t_sese->parent != sese->node)
3526 /* Not a child. */
3527 continue;
3528
3529 /* Compare its hi value. */
3530 int t_hi = t_sese->high.second;
3531
3532 if (basic_block child_hi_block = t_sese->high.first)
3533 t_hi += BB_GET_SESE (child_hi_block)->node;
3534
3535 if (hi_child > t_hi)
3536 {
3537 hi_child = t_hi;
3538 node_child = t_sese->high;
3539 }
3540 }
3541
3542 sese->push (node_child);
3543 }
3544 }
3545
3546
3547 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3548 proceed to successors. Set SESE entry and exit nodes of
3549 REGIONS. */
3550
3551 static void
3552 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3553 basic_block block, int coloring)
3554 {
3555 bb_sese *sese = BB_GET_SESE (block);
3556
3557 if (block->flags & BB_VISITED)
3558 {
3559 /* If we've already encountered this block, either we must not
3560 be coloring, or it must have been colored the current color. */
3561 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3562 return;
3563 }
3564
3565 block->flags |= BB_VISITED;
3566
3567 if (sese)
3568 {
3569 if (coloring < 0)
3570 {
3571 /* Start coloring a region. */
3572 regions[sese->color].first = block;
3573 coloring = sese->color;
3574 }
3575
3576 if (!--color_counts[sese->color] && sese->color == coloring)
3577 {
3578 /* Found final block of SESE region. */
3579 regions[sese->color].second = block;
3580 coloring = -1;
3581 }
3582 else
3583 /* Color the node, so we can assert on revisiting the node
3584 that the graph is indeed SESE. */
3585 sese->color = coloring;
3586 }
3587 else
3588 /* Fallen off the subgraph, we cannot be coloring. */
3589 gcc_assert (coloring < 0);
3590
3591 /* Walk each successor block. */
3592 if (block->succs && block->succs->length ())
3593 {
3594 edge e;
3595 edge_iterator ei;
3596
3597 FOR_EACH_EDGE (e, ei, block->succs)
3598 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3599 }
3600 else
3601 gcc_assert (coloring < 0);
3602 }
3603
3604 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3605 end up with NULL entries in it. */
3606
3607 static void
3608 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3609 {
3610 basic_block block;
3611 int ix;
3612
3613 /* First clear each BB of the whole function. */
3614 FOR_ALL_BB_FN (block, cfun)
3615 {
3616 block->flags &= ~BB_VISITED;
3617 BB_SET_SESE (block, 0);
3618 }
3619
3620 /* Mark blocks in the function that are in this graph. */
3621 for (ix = 0; blocks.iterate (ix, &block); ix++)
3622 block->flags |= BB_VISITED;
3623
3624 /* Counts of nodes assigned to each color. There cannot be more
3625 colors than blocks (and hopefully there will be fewer). */
3626 auto_vec<unsigned> color_counts;
3627 color_counts.reserve (blocks.length ());
3628
3629 /* Worklist of nodes in the spanning tree. Again, there cannot be
3630 more nodes in the tree than blocks (there will be fewer if the
3631 CFG of blocks is disjoint). */
3632 auto_vec<basic_block> spanlist;
3633 spanlist.reserve (blocks.length ());
3634
3635 /* Make sure every block has its cycle class determined. */
3636 for (ix = 0; blocks.iterate (ix, &block); ix++)
3637 {
3638 if (BB_GET_SESE (block))
3639 /* We already met this block in an earlier graph solve. */
3640 continue;
3641
3642 if (dump_file)
3643 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3644
3645 /* Number the nodes reachable from BLOCK in initial DFS order. */
3646 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3647
3648 /* Now walk in reverse DFS order to find cycle equivalents. */
3649 while (spanlist.length ())
3650 {
3651 block = spanlist.pop ();
3652 bb_sese *sese = BB_GET_SESE (block);
3653
3654 /* Do the pseudo node below. */
3655 nvptx_sese_pseudo (block, sese, depth, +1,
3656 sese->dir > 0 ? block->succs : block->preds,
3657 (sese->dir > 0 ? offsetof (edge_def, dest)
3658 : offsetof (edge_def, src)));
3659 sese->set_color (color_counts);
3660 /* Do the pseudo node above. */
3661 nvptx_sese_pseudo (block, sese, depth, -1,
3662 sese->dir < 0 ? block->succs : block->preds,
3663 (sese->dir < 0 ? offsetof (edge_def, dest)
3664 : offsetof (edge_def, src)));
3665 }
3666 if (dump_file)
3667 fprintf (dump_file, "\n");
3668 }
3669
3670 if (dump_file)
3671 {
3672 unsigned count;
3673 const char *comma = "";
3674
3675 fprintf (dump_file, "Found %d cycle equivalents\n",
3676 color_counts.length ());
3677 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3678 {
3679 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3680
3681 comma = "";
3682 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3683 if (BB_GET_SESE (block)->color == ix)
3684 {
3685 block->flags |= BB_VISITED;
3686 fprintf (dump_file, "%s%d", comma, block->index);
3687 comma=",";
3688 }
3689 fprintf (dump_file, "}");
3690 comma = ", ";
3691 }
3692 fprintf (dump_file, "\n");
3693 }
3694
3695 /* Now we've colored every block in the subgraph. We now need to
3696 determine the minimal set of SESE regions that cover that
3697 subgraph. Do this with a DFS walk of the complete function.
3698 During the walk we're either 'looking' or 'coloring'. When we
3699 reach the last node of a particular color, we stop coloring and
3700 return to looking. */
3701
3702 /* There cannot be more SESE regions than colors. */
3703 regions.reserve (color_counts.length ());
3704 for (ix = color_counts.length (); ix--;)
3705 regions.quick_push (bb_pair_t (0, 0));
3706
3707 for (ix = 0; blocks.iterate (ix, &block); ix++)
3708 block->flags &= ~BB_VISITED;
3709
3710 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3711
3712 if (dump_file)
3713 {
3714 const char *comma = "";
3715 int len = regions.length ();
3716
3717 fprintf (dump_file, "SESE regions:");
3718 for (ix = 0; ix != len; ix++)
3719 {
3720 basic_block from = regions[ix].first;
3721 basic_block to = regions[ix].second;
3722
3723 if (from)
3724 {
3725 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3726 if (to != from)
3727 fprintf (dump_file, "->%d", to->index);
3728
3729 int color = BB_GET_SESE (from)->color;
3730
3731 /* Print the blocks within the region (excluding ends). */
3732 FOR_EACH_BB_FN (block, cfun)
3733 {
3734 bb_sese *sese = BB_GET_SESE (block);
3735
3736 if (sese && sese->color == color
3737 && block != from && block != to)
3738 fprintf (dump_file, ".%d", block->index);
3739 }
3740 fprintf (dump_file, "}");
3741 }
3742 comma = ",";
3743 }
3744 fprintf (dump_file, "\n\n");
3745 }
3746
3747 for (ix = 0; blocks.iterate (ix, &block); ix++)
3748 delete BB_GET_SESE (block);
3749 }
3750
3751 #undef BB_SET_SESE
3752 #undef BB_GET_SESE
3753
3754 /* Propagate live state at the start of a partitioned region. BLOCK
3755 provides the live register information, and might not contain
3756 INSN. Propagation is inserted just after INSN. RW indicates whether
3757 we are reading and/or writing state. This separation is
3758 needed for worker-level propagation, where we essentially
3759 do a spill & fill. FN is the underlying worker function
3760 that generates the propagation instructions for a single
3761 register. DATA is user data.
3762
3763 We propagate the live register set and the entire frame. We could
3764 do better by (a) propagating just the live set that is used within
3765 the partitioned regions and (b) only propagating stack entries that
3766 are used. The latter might be quite hard to determine. */
3767
3768 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3769
3770 static void
3771 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3772 propagator_fn fn, void *data)
3773 {
3774 bitmap live = DF_LIVE_IN (block);
3775 bitmap_iterator iterator;
3776 unsigned ix;
3777
3778 /* Copy the frame array. */
3779 HOST_WIDE_INT fs = get_frame_size ();
3780 if (fs)
3781 {
3782 rtx tmp = gen_reg_rtx (DImode);
3783 rtx idx = NULL_RTX;
3784 rtx ptr = gen_reg_rtx (Pmode);
3785 rtx pred = NULL_RTX;
3786 rtx_code_label *label = NULL;
3787
3788 /* The frame size might not be DImode compatible, but the frame
3789 array's declaration will be. So it's ok to round up here. */
3790 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3791 /* Detect single iteration loop. */
3792 if (fs == 1)
3793 fs = 0;
3794
3795 start_sequence ();
3796 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3797 if (fs)
3798 {
3799 idx = gen_reg_rtx (SImode);
3800 pred = gen_reg_rtx (BImode);
3801 label = gen_label_rtx ();
3802
3803 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3804 /* Allow worker function to initialize anything needed. */
3805 rtx init = fn (tmp, PM_loop_begin, fs, data);
3806 if (init)
3807 emit_insn (init);
3808 emit_label (label);
3809 LABEL_NUSES (label)++;
3810 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3811 }
3812 if (rw & PM_read)
3813 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3814 emit_insn (fn (tmp, rw, fs, data));
3815 if (rw & PM_write)
3816 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3817 if (fs)
3818 {
3819 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3820 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3821 emit_insn (gen_br_true_uni (pred, label));
3822 rtx fini = fn (tmp, PM_loop_end, fs, data);
3823 if (fini)
3824 emit_insn (fini);
3825 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3826 }
3827 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3828 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3829 rtx cpy = get_insns ();
3830 end_sequence ();
3831 insn = emit_insn_after (cpy, insn);
3832 }
3833
3834 /* Copy live registers. */
3835 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3836 {
3837 rtx reg = regno_reg_rtx[ix];
3838
3839 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3840 {
3841 rtx bcast = fn (reg, rw, 0, data);
3842
3843 insn = emit_insn_after (bcast, insn);
3844 }
3845 }
3846 }
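
/* As an illustration (not emitted verbatim), for a function with a
   non-trivial frame the sequence built above corresponds to:

       ptr = frame_pointer;
       idx = <frame size in DImode words>;  // loop omitted if size is 1
     loop:
       idx -= 1;
       tmp = *ptr;            // only if RW includes PM_read
       <FN (tmp, RW, ...)>    // per-word propagation
       *ptr = tmp;            // only if RW includes PM_write
       pred = idx != 0;
       ptr += GET_MODE_SIZE (DImode);
       if (pred) goto loop;   // uniform branch (br_true_uni)

   followed by one FN application per live pseudo register.  */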
3847
3848 /* Worker for nvptx_vpropagate. */
3849
3850 static rtx
3851 vprop_gen (rtx reg, propagate_mask pm,
3852 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3853 {
3854 if (!(pm & PM_read_write))
3855 return 0;
3856
3857 return nvptx_gen_vcast (reg);
3858 }
3859
3860 /* Propagate state that is live at start of BLOCK across the vectors
3861 of a single warp. Propagation is inserted just after INSN. */
3862
3863 static void
3864 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3865 {
3866 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3867 }
3868
3869 /* Worker for nvptx_wpropagate. */
3870
3871 static rtx
3872 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3873 {
3874 wcast_data_t *data = (wcast_data_t *)data_;
3875
3876 if (pm & PM_loop_begin)
3877 {
3878 /* Starting a loop, initialize pointer. */
3879 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3880
3881 if (align > worker_bcast_align)
3882 worker_bcast_align = align;
3883 data->offset = (data->offset + align - 1) & ~(align - 1);
3884
3885 data->ptr = gen_reg_rtx (Pmode);
3886
3887 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3888 }
3889 else if (pm & PM_loop_end)
3890 {
3891 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3892 data->ptr = NULL_RTX;
3893 return clobber;
3894 }
3895 else
3896 return nvptx_gen_wcast (reg, pm, rep, data);
3897 }
3898
3899 /* Spill or fill the state that is live at the start of BLOCK. PRE_P
3900 indicates if this is just before partitioned mode (do spill), or
3901 just after it starts (do fill). Sequence is inserted just after
3902 INSN. */
3903
3904 static void
3905 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3906 {
3907 wcast_data_t data;
3908
3909 data.base = gen_reg_rtx (Pmode);
3910 data.offset = 0;
3911 data.ptr = NULL_RTX;
3912
3913 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3914 if (data.offset)
3915 {
3916 /* Stuff was emitted, initialize the base pointer now. */
3917 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3918 emit_insn_after (init, insn);
3919
3920 if (worker_bcast_size < data.offset)
3921 worker_bcast_size = data.offset;
3922 }
3923 }
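
/* In other words: with PRE_P set (just before partitioned mode) the
   emitted code reads the live state and stores it into the shared
   broadcast buffer; with PRE_P clear (just after partitioned mode
   starts) it loads that state back.  worker_bcast_align and
   worker_bcast_size accumulate the buffer's layout requirements, on
   the assumption that nvptx_gen_wcast advances DATA->offset for each
   propagated register, as the DATA.offset check above implies.  */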
3924
3925 /* Emit a worker-level synchronization barrier. We use different
3926 markers for before and after synchronizations. */
3927
3928 static rtx
3929 nvptx_wsync (bool after)
3930 {
3931 return gen_nvptx_barsync (GEN_INT (after));
3932 }
3933
3934 #if WORKAROUND_PTXJIT_BUG
3935 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3936 real insns. */
3937
3938 static rtx_insn *
3939 bb_first_real_insn (basic_block bb)
3940 {
3941 rtx_insn *insn;
3942
3943 /* Find the first real insn in BB. */
3944 FOR_BB_INSNS (bb, insn)
3945 if (INSN_P (insn))
3946 return insn;
3947
3948 return 0;
3949 }
3950 #endif
3951
3952 /* Single neutering according to MASK. FROM is the incoming block and
3953 TO is the outgoing block. These may be the same block. Insert at
3954 start of FROM:
3955
3956 if (tid.<axis>) goto end.
3957
3958 and insert before ending branch of TO (if there is such an insn):
3959
3960 end:
3961 <possibly-broadcast-cond>
3962 <branch>
3963
3964 We currently only use different FROM and TO when skipping an entire
3965 loop. We could do more if we detected superblocks. */
3966
3967 static void
3968 nvptx_single (unsigned mask, basic_block from, basic_block to)
3969 {
3970 rtx_insn *head = BB_HEAD (from);
3971 rtx_insn *tail = BB_END (to);
3972 unsigned skip_mask = mask;
3973
3974 while (true)
3975 {
3976 /* Find the first insn of the FROM block. */
3977 while (head != BB_END (from)
3978 && (!INSN_P (head)
3979 || recog_memoized (head) == CODE_FOR_nvptx_barsync))
3980 head = NEXT_INSN (head);
3981
3982 if (from == to)
3983 break;
3984
3985 if (!(JUMP_P (head) && single_succ_p (from)))
3986 break;
3987
3988 basic_block jump_target = single_succ (from);
3989 if (!single_pred_p (jump_target))
3990 break;
3991
3992 from = jump_target;
3993 head = BB_HEAD (from);
3994 }
3995
3996 /* Find the last insn of the TO block. */
3997 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3998 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3999 tail = PREV_INSN (tail);
4000
4001 /* Detect if tail is a branch. */
4002 rtx tail_branch = NULL_RTX;
4003 rtx cond_branch = NULL_RTX;
4004 if (tail && INSN_P (tail))
4005 {
4006 tail_branch = PATTERN (tail);
4007 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4008 tail_branch = NULL_RTX;
4009 else
4010 {
4011 cond_branch = SET_SRC (tail_branch);
4012 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4013 cond_branch = NULL_RTX;
4014 }
4015 }
4016
4017 if (tail == head)
4018 {
4019 /* If this is empty, do nothing. */
4020 if (!head || !INSN_P (head))
4021 return;
4022
4023 /* If this is a dummy insn, do nothing. */
4024 switch (recog_memoized (head))
4025 {
4026 default:
4027 break;
4028 case CODE_FOR_nvptx_barsync:
4029 case CODE_FOR_nvptx_fork:
4030 case CODE_FOR_nvptx_forked:
4031 case CODE_FOR_nvptx_joining:
4032 case CODE_FOR_nvptx_join:
4033 return;
4034 }
4035
4036 if (cond_branch)
4037 {
4038 /* If we're only doing vector single, there's no need to
4039 emit skip code because we won't insert anything. */
4040 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4041 skip_mask = 0;
4042 }
4043 else if (tail_branch)
4044 /* Block with only an unconditional branch. Nothing to do. */
4045 return;
4046 }
4047
4048 /* Insert the vector test inside the worker test. */
4049 unsigned mode;
4050 rtx_insn *before = tail;
4051 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4052 if (GOMP_DIM_MASK (mode) & skip_mask)
4053 {
4054 rtx_code_label *label = gen_label_rtx ();
4055 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4056
4057 if (!pred)
4058 {
4059 pred = gen_reg_rtx (BImode);
4060 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4061 }
4062
4063 rtx br;
4064 if (mode == GOMP_DIM_VECTOR)
4065 br = gen_br_true (pred, label);
4066 else
4067 br = gen_br_true_uni (pred, label);
4068 emit_insn_before (br, head);
4069
4070 LABEL_NUSES (label)++;
4071 if (tail_branch)
4072 before = emit_label_before (label, before);
4073 else
4074 {
4075 rtx_insn *label_insn = emit_label_after (label, tail);
4076 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4077 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4078 emit_insn_after (gen_exit (), label_insn);
4079 }
4080 }
4081
4082 /* Now deal with propagating the branch condition. */
4083 if (cond_branch)
4084 {
4085 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4086
4087 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4088 {
4089 /* Vector mode only, do a shuffle. */
4090 #if WORKAROUND_PTXJIT_BUG
4091 /* The branch condition %rcond is propagated like this:
4092
4093 {
4094 .reg .u32 %x;
4095 mov.u32 %x,%tid.x;
4096 setp.ne.u32 %rnotvzero,%x,0;
4097 }
4098
4099 @%rnotvzero bra Lskip;
4100 setp.<op>.<type> %rcond,op1,op2;
4101 Lskip:
4102 selp.u32 %rcondu32,1,0,%rcond;
4103 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4104 setp.ne.u32 %rcond,%rcondu32,0;
4105
4106 There seems to be a bug in the ptx JIT compiler (observed at driver
4107 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4108 unless %rcond is initialized to something before 'bra Lskip'. The
4109 bug is not observed with ptxas from cuda 8.0.61.
4110
4111 It is true that the code is non-trivial: at Lskip, %rcond is
4112 uninitialized in threads 1-31, and after the selp the same holds
4113 for %rcondu32. But shfl propagates the defined value in thread 0
4114 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4115 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4116
4117 There is nothing in the PTX spec to suggest that this is wrong, or
4118 to explain why the extra initialization is needed. So, we classify
4119 it as a JIT bug, and the extra initialization as workaround:
4120
4121 {
4122 .reg .u32 %x;
4123 mov.u32 %x,%tid.x;
4124 setp.ne.u32 %rnotvzero,%x,0;
4125 }
4126
4127 +.reg .pred %rcond2;
4128 +setp.eq.u32 %rcond2, 1, 0;
4129
4130 @%rnotvzero bra Lskip;
4131 setp.<op>.<type> %rcond,op1,op2;
4132 +mov.pred %rcond2, %rcond;
4133 Lskip:
4134 +mov.pred %rcond, %rcond2;
4135 selp.u32 %rcondu32,1,0,%rcond;
4136 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4137 setp.ne.u32 %rcond,%rcondu32,0;
4138 */
4139 rtx_insn *label = PREV_INSN (tail);
4140 gcc_assert (label && LABEL_P (label));
4141 rtx tmp = gen_reg_rtx (BImode);
4142 emit_insn_before (gen_movbi (tmp, const0_rtx),
4143 bb_first_real_insn (from));
4144 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4145 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4146 #endif
4147 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4148 }
4149 else
4150 {
4151 /* Includes worker mode, do spill & fill. By construction
4152 we should never have worker mode only. */
4153 wcast_data_t data;
4154
4155 data.base = worker_bcast_sym;
4156 data.ptr = 0;
4157
4158 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4159 worker_bcast_size = GET_MODE_SIZE (SImode);
4160
4161 data.offset = 0;
4162 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4163 before);
4164 /* Barrier so other workers can see the write. */
4165 emit_insn_before (nvptx_wsync (false), tail);
4166 data.offset = 0;
4167 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4168 /* This barrier is needed to avoid worker zero clobbering
4169 the broadcast buffer before all the other workers have
4170 had a chance to read this instance of it. */
4171 emit_insn_before (nvptx_wsync (true), tail);
4172 }
4173
4174 extract_insn (tail);
4175 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4176 UNSPEC_BR_UNIFIED);
4177 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4178 }
4179 }
4180
4181 /* PAR is a parallel that is being skipped in its entirety according to
4182 MASK. Treat this as skipping a superblock starting at forked
4183 and ending at joining. */
4184
4185 static void
4186 nvptx_skip_par (unsigned mask, parallel *par)
4187 {
4188 basic_block tail = par->join_block;
4189 gcc_assert (tail->preds->length () == 1);
4190
4191 basic_block pre_tail = (*tail->preds)[0]->src;
4192 gcc_assert (pre_tail->succs->length () == 1);
4193
4194 nvptx_single (mask, par->forked_block, pre_tail);
4195 }
4196
4197 /* If PAR has a single inner parallel and PAR itself only contains
4198 empty entry and exit blocks, swallow the inner PAR. */
4199
4200 static void
4201 nvptx_optimize_inner (parallel *par)
4202 {
4203 parallel *inner = par->inner;
4204
4205 /* We mustn't be the outer dummy par. */
4206 if (!par->mask)
4207 return;
4208
4209 /* We must have a single inner par. */
4210 if (!inner || inner->next)
4211 return;
4212
4213 /* We must only contain 2 blocks ourselves -- the head and tail of
4214 the inner par. */
4215 if (par->blocks.length () != 2)
4216 return;
4217
4218 /* The two partitionings must be disjoint. As we only have vector and
4219 worker partitioning, this is sufficient to guarantee the pars
4220 have adjacent partitioning. */
4221 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4222 /* This indicates malformed code generation. */
4223 return;
4224
4225 /* The outer forked insn should be immediately followed by the inner
4226 fork insn. */
4227 rtx_insn *forked = par->forked_insn;
4228 rtx_insn *fork = BB_END (par->forked_block);
4229
4230 if (NEXT_INSN (forked) != fork)
4231 return;
4232 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4233
4234 /* The outer joining insn must immediately follow the inner join
4235 insn. */
4236 rtx_insn *joining = par->joining_insn;
4237 rtx_insn *join = inner->join_insn;
4238 if (NEXT_INSN (join) != joining)
4239 return;
4240
4241 /* Preconditions met. Swallow the inner par. */
4242 if (dump_file)
4243 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4244 inner->mask, inner->forked_block->index,
4245 inner->join_block->index,
4246 par->mask, par->forked_block->index, par->join_block->index);
4247
4248 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4249
4250 par->blocks.reserve (inner->blocks.length ());
4251 while (inner->blocks.length ())
4252 par->blocks.quick_push (inner->blocks.pop ());
4253
4254 par->inner = inner->inner;
4255 inner->inner = NULL;
4256
4257 delete inner;
4258 }
4259
4260 /* Process the parallel PAR and all its contained
4261 parallels. We do everything but the neutering. Return mask of
4262 partitioned modes used within this parallel. */
4263
4264 static unsigned
4265 nvptx_process_pars (parallel *par)
4266 {
4267 if (nvptx_optimize)
4268 nvptx_optimize_inner (par);
4269
4270 unsigned inner_mask = par->mask;
4271
4272 /* Do the inner parallels first. */
4273 if (par->inner)
4274 {
4275 par->inner_mask = nvptx_process_pars (par->inner);
4276 inner_mask |= par->inner_mask;
4277 }
4278
4279 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
4280 /* No propagation needed for a call. */;
4281 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4282 {
4283 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
4284 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
4285 /* Insert begin and end synchronizations. */
4286 emit_insn_before (nvptx_wsync (false), par->forked_insn);
4287 emit_insn_before (nvptx_wsync (true), par->join_insn);
4288 }
4289 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4290 nvptx_vpropagate (par->forked_block, par->forked_insn);
4291
4292 /* Now do siblings. */
4293 if (par->next)
4294 inner_mask |= nvptx_process_pars (par->next);
4295 return inner_mask;
4296 }
4297
4298 /* Neuter the parallel described by PAR. We recurse in depth-first
4299 order. MODES are the partitioning of the execution and OUTER is
4300 the partitioning of the parallels we are contained in. */
4301
4302 static void
4303 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4304 {
4305 unsigned me = (par->mask
4306 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4307 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4308 unsigned skip_mask = 0, neuter_mask = 0;
4309
4310 if (par->inner)
4311 nvptx_neuter_pars (par->inner, modes, outer | me);
4312
4313 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4314 {
4315 if ((outer | me) & GOMP_DIM_MASK (mode))
4316 {} /* Mode is partitioned: no neutering. */
4317 else if (!(modes & GOMP_DIM_MASK (mode)))
4318 {} /* Mode is not used: nothing to do. */
4319 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4320 || !par->forked_insn)
4321 /* Partitioned in inner parallels, or we're not partitioned
4322 at all: neuter individual blocks. */
4323 neuter_mask |= GOMP_DIM_MASK (mode);
4324 else if (!par->parent || !par->parent->forked_insn
4325 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4326 /* Parent isn't a parallel, or already contains this
4327 partitioning: skip the parallel at this level. */
4328 skip_mask |= GOMP_DIM_MASK (mode);
4329 else
4330 {} /* Parent will skip this parallel itself. */
4331 }
4332
4333 if (neuter_mask)
4334 {
4335 int ix, len;
4336
4337 if (nvptx_optimize)
4338 {
4339 /* Neuter whole SESE regions. */
4340 bb_pair_vec_t regions;
4341
4342 nvptx_find_sese (par->blocks, regions);
4343 len = regions.length ();
4344 for (ix = 0; ix != len; ix++)
4345 {
4346 basic_block from = regions[ix].first;
4347 basic_block to = regions[ix].second;
4348
4349 if (from)
4350 nvptx_single (neuter_mask, from, to);
4351 else
4352 gcc_assert (!to);
4353 }
4354 }
4355 else
4356 {
4357 /* Neuter each BB individually. */
4358 len = par->blocks.length ();
4359 for (ix = 0; ix != len; ix++)
4360 {
4361 basic_block block = par->blocks[ix];
4362
4363 nvptx_single (neuter_mask, block, block);
4364 }
4365 }
4366 }
4367
4368 if (skip_mask)
4369 nvptx_skip_par (skip_mask, par);
4370
4371 if (par->next)
4372 nvptx_neuter_pars (par->next, modes, outer);
4373 }
4374
4375 #if WORKAROUND_PTXJIT_BUG_2
4376 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4377 is needed in the nvptx target because the branches generated for
4378 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4379
4380 static rtx
4381 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4382 {
4383 rtx pat;
4384 if ((strict && !JUMP_P (insn))
4385 || (!strict && !INSN_P (insn)))
4386 return NULL_RTX;
4387 pat = PATTERN (insn);
4388
4389 /* The set is allowed to appear either as the insn pattern or
4390 the first set in a PARALLEL. */
4391 if (GET_CODE (pat) == PARALLEL)
4392 pat = XVECEXP (pat, 0, 0);
4393 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4394 return pat;
4395
4396 return NULL_RTX;
4397 }
4398
4399 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4400
4401 static rtx
4402 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4403 {
4404 rtx x = nvptx_pc_set (insn, strict);
4405
4406 if (!x)
4407 return NULL_RTX;
4408 x = SET_SRC (x);
4409 if (GET_CODE (x) == LABEL_REF)
4410 return x;
4411 if (GET_CODE (x) != IF_THEN_ELSE)
4412 return NULL_RTX;
4413 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4414 return XEXP (x, 1);
4415 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4416 return XEXP (x, 2);
4417 return NULL_RTX;
4418 }
4419
4420 /* Insert a dummy PTX insn when encountering a branch to a label with no PTX
4421 insn in between the branch and the label. This works around a JIT bug
4422 observed at driver version 384.111, at -O0 for sm_50. */
4423
4424 static void
4425 prevent_branch_around_nothing (void)
4426 {
4427 rtx_insn *seen_label = NULL;
4428 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4429 {
4430 if (INSN_P (insn) && condjump_p (insn))
4431 {
4432 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4433 continue;
4434 }
4435
4436 if (seen_label == NULL)
4437 continue;
4438
4439 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4440 continue;
4441
4442 if (INSN_P (insn))
4443 switch (recog_memoized (insn))
4444 {
4445 case CODE_FOR_nvptx_fork:
4446 case CODE_FOR_nvptx_forked:
4447 case CODE_FOR_nvptx_joining:
4448 case CODE_FOR_nvptx_join:
4449 continue;
4450 default:
4451 seen_label = NULL;
4452 continue;
4453 }
4454
4455 if (LABEL_P (insn) && insn == seen_label)
4456 emit_insn_before (gen_fake_nop (), insn);
4457
4458 seen_label = NULL;
4459 }
4460 }
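
/* The problematic shape looks like (labels illustrative):

       @%p1 bra $L5;
     $L5:
       ...

   i.e. a conditional branch with no real PTX insn before its target
   label; the dummy insn emitted above separates the two.  */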
4461 #endif
4462
4463 /* PTX-specific reorganization
4464 - Split blocks at fork and join instructions
4465 - Compute live registers
4466 - Mark now-unused registers, so function begin doesn't declare
4467 unused registers.
4468 - Insert state propagation when entering partitioned mode
4469 - Insert neutering instructions when in single mode
4470 - Replace subregs with suitable sequences.
4471 */
4472
4473 static void
4474 nvptx_reorg (void)
4475 {
4476 /* We are freeing block_for_insn in the toplev to keep compatibility
4477 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4478 compute_bb_for_insn ();
4479
4480 thread_prologue_and_epilogue_insns ();
4481
4482 /* Split blocks and record interesting unspecs. */
4483 bb_insn_map_t bb_insn_map;
4484
4485 nvptx_split_blocks (&bb_insn_map);
4486
4487 /* Compute live regs. */
4488 df_clear_flags (DF_LR_RUN_DCE);
4489 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4490 df_live_add_problem ();
4491 df_live_set_all_dirty ();
4492 df_analyze ();
4493 regstat_init_n_sets_and_refs ();
4494
4495 if (dump_file)
4496 df_dump (dump_file);
4497
4498 /* Mark unused regs as unused. */
4499 int max_regs = max_reg_num ();
4500 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4501 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4502 regno_reg_rtx[i] = const0_rtx;
4503
4504 /* Determine launch dimensions of the function. If it is not an
4505 offloaded function (i.e. this is a regular compilation), the
4506 function has no neutering. */
4507 tree attr = oacc_get_fn_attrib (current_function_decl);
4508 if (attr)
4509 {
4510 /* If we determined this mask before RTL expansion, we could
4511 elide emission of some levels of forks and joins. */
4512 unsigned mask = 0;
4513 tree dims = TREE_VALUE (attr);
4514 unsigned ix;
4515
4516 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4517 {
4518 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4519 tree allowed = TREE_PURPOSE (dims);
4520
4521 if (size != 1 && !(allowed && integer_zerop (allowed)))
4522 mask |= GOMP_DIM_MASK (ix);
4523 }
4524 /* If there is worker neutering, there must be vector
4525 neutering. Otherwise the hardware will fail. */
4526 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4527 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4528
4529 /* Discover & process partitioned regions. */
4530 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4531 nvptx_process_pars (pars);
4532 nvptx_neuter_pars (pars, mask, 0);
4533 delete pars;
4534 }
4535
4536 /* Replace subregs. */
4537 nvptx_reorg_subreg ();
4538
4539 if (TARGET_UNIFORM_SIMT)
4540 nvptx_reorg_uniform_simt ();
4541
4542 #if WORKAROUND_PTXJIT_BUG_2
4543 prevent_branch_around_nothing ();
4544 #endif
4545
4546 regstat_free_n_sets_and_refs ();
4547
4548 df_finish_pass (true);
4549 }
4550 \f
4551 /* Handle a "kernel" attribute; arguments as in
4552 struct attribute_spec.handler. */
4553
4554 static tree
4555 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4556 int ARG_UNUSED (flags), bool *no_add_attrs)
4557 {
4558 tree decl = *node;
4559
4560 if (TREE_CODE (decl) != FUNCTION_DECL)
4561 {
4562 error ("%qE attribute only applies to functions", name);
4563 *no_add_attrs = true;
4564 }
4565 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4566 {
4567 error ("%qE attribute requires a void return type", name);
4568 *no_add_attrs = true;
4569 }
4570
4571 return NULL_TREE;
4572 }
4573
4574 /* Handle a "shared" attribute; arguments as in
4575 struct attribute_spec.handler. */
4576
4577 static tree
4578 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4579 int ARG_UNUSED (flags), bool *no_add_attrs)
4580 {
4581 tree decl = *node;
4582
4583 if (TREE_CODE (decl) != VAR_DECL)
4584 {
4585 error ("%qE attribute only applies to variables", name);
4586 *no_add_attrs = true;
4587 }
4588 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4589 {
4590 error ("%qE attribute not allowed with auto storage class", name);
4591 *no_add_attrs = true;
4592 }
4593
4594 return NULL_TREE;
4595 }
4596
4597 /* Table of valid machine attributes. */
4598 static const struct attribute_spec nvptx_attribute_table[] =
4599 {
4600 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4601 affects_type_identity, handler, exclude } */
4602 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4603 NULL },
4604 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4605 NULL },
4606 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4607 };
4608 \f
4609 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4610
4611 static HOST_WIDE_INT
4612 nvptx_vector_alignment (const_tree type)
4613 {
4614 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4615
4616 return MIN (align, BIGGEST_ALIGNMENT);
4617 }
4618
4619 /* Indicate that INSN cannot be duplicated. */
4620
4621 static bool
4622 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4623 {
4624 switch (recog_memoized (insn))
4625 {
4626 case CODE_FOR_nvptx_shufflesi:
4627 case CODE_FOR_nvptx_shufflesf:
4628 case CODE_FOR_nvptx_barsync:
4629 case CODE_FOR_nvptx_fork:
4630 case CODE_FOR_nvptx_forked:
4631 case CODE_FOR_nvptx_joining:
4632 case CODE_FOR_nvptx_join:
4633 return true;
4634 default:
4635 return false;
4636 }
4637 }
4638
4639 /* Section anchors do not work. Initialization for flag_section_anchor
4640 probes the existence of the anchoring target hooks and prevents
4641 anchoring if they don't exist. However, we may be used with
4642 a host-side compiler that does support anchoring, and hence see
4643 the anchor flag set (as it's not recalculated). So provide an
4644 implementation denying anchoring. */
4645
4646 static bool
4647 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4648 {
4649 return false;
4650 }
4651 \f
4652 /* Record a symbol for mkoffload to enter into the mapping table. */
4653
4654 static void
4655 nvptx_record_offload_symbol (tree decl)
4656 {
4657 switch (TREE_CODE (decl))
4658 {
4659 case VAR_DECL:
4660 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4661 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4662 break;
4663
4664 case FUNCTION_DECL:
4665 {
4666 tree attr = oacc_get_fn_attrib (decl);
4667 /* OpenMP offloading does not set this attribute. */
4668 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4669
4670 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4671 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4672
4673 for (; dims; dims = TREE_CHAIN (dims))
4674 {
4675 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4676
4677 gcc_assert (!TREE_PURPOSE (dims));
4678 fprintf (asm_out_file, ", %#x", size);
4679 }
4680
4681 fprintf (asm_out_file, "\n");
4682 }
4683 break;
4684
4685 default:
4686 gcc_unreachable ();
4687 }
4688 }
4689
4690 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4691 at the start of a file. */
4692
4693 static void
4694 nvptx_file_start (void)
4695 {
4696 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4697 fputs ("\t.version\t3.1\n", asm_out_file);
4698 fputs ("\t.target\tsm_30\n", asm_out_file);
4699 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4700 fputs ("// END PREAMBLE\n", asm_out_file);
4701 }
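
/* For the 64-bit ABI the preamble printed above therefore reads:

     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
     // END PREAMBLE  */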
4702
4703 /* Emit a declaration for a worker-level buffer in .shared memory. */
4704
4705 static void
4706 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4707 {
4708 const char *name = XSTR (sym, 0);
4709
4710 write_var_marker (file, true, false, name);
4711 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4712 align, name, size);
4713 }
4714
4715 /* Write out the function declarations we've collected and declare storage
4716 for the broadcast buffer. */
4717
4718 static void
4719 nvptx_file_end (void)
4720 {
4721 hash_table<tree_hasher>::iterator iter;
4722 tree decl;
4723 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4724 nvptx_record_fndecl (decl);
4725 fputs (func_decls.str().c_str(), asm_out_file);
4726
4727 if (worker_bcast_size)
4728 write_worker_buffer (asm_out_file, worker_bcast_sym,
4729 worker_bcast_align, worker_bcast_size);
4730
4731 if (worker_red_size)
4732 write_worker_buffer (asm_out_file, worker_red_sym,
4733 worker_red_align, worker_red_size);
4734
4735 if (need_softstack_decl)
4736 {
4737 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4738 /* 32 is the maximum number of warps in a block. Even though it's an
4739 external declaration, emit the array size explicitly; otherwise, it
4740 may fail at PTX JIT time if the definition is later in link order. */
4741 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4742 POINTER_SIZE);
4743 }
4744 if (need_unisimt_decl)
4745 {
4746 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4747 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4748 }
4749 }
4750
4751 /* Expander for the shuffle builtins. */
4752
4753 static rtx
4754 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4755 {
4756 if (ignore)
4757 return target;
4758
4759 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4760 NULL_RTX, mode, EXPAND_NORMAL);
4761 if (!REG_P (src))
4762 src = copy_to_mode_reg (mode, src);
4763
4764 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4765 NULL_RTX, SImode, EXPAND_NORMAL);
4766 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4767 NULL_RTX, SImode, EXPAND_NORMAL);
4768
4769 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4770 idx = copy_to_mode_reg (SImode, idx);
4771
4772 rtx pat = nvptx_gen_shuffle (target, src, idx,
4773 (nvptx_shuffle_kind) INTVAL (op));
4774 if (pat)
4775 emit_insn (pat);
4776
4777 return target;
4778 }
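
/* For reference, the OpenACC reduction lowering below builds calls of
   the form

     t = __builtin_nvptx_shuffle (v, shift, SHUFFLE_DOWN);

   The kind operand must fold to a constant, since INTVAL is taken
   above; the lane index may be a register or a constant.  */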
4779
4780 /* Worker reduction address expander. */
4781
4782 static rtx
4783 nvptx_expand_worker_addr (tree exp, rtx target,
4784 machine_mode ARG_UNUSED (mode), int ignore)
4785 {
4786 if (ignore)
4787 return target;
4788
4789 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4790 if (align > worker_red_align)
4791 worker_red_align = align;
4792
4793 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4794 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4795 if (size + offset > worker_red_size)
4796 worker_red_size = size + offset;
4797
4798 rtx addr = worker_red_sym;
4799 if (offset)
4800 {
4801 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4802 addr = gen_rtx_CONST (Pmode, addr);
4803 }
4804
4805 emit_move_insn (target, addr);
4806
4807 return target;
4808 }
4809
4810 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4811 not require taking the address of any object, other than the memory
4812 cell being operated on. */
4813
4814 static rtx
4815 nvptx_expand_cmp_swap (tree exp, rtx target,
4816 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4817 {
4818 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4819
4820 if (!target)
4821 target = gen_reg_rtx (mode);
4822
4823 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4824 NULL_RTX, Pmode, EXPAND_NORMAL);
4825 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4826 NULL_RTX, mode, EXPAND_NORMAL);
4827 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4828 NULL_RTX, mode, EXPAND_NORMAL);
4829 rtx pat;
4830
4831 mem = gen_rtx_MEM (mode, mem);
4832 if (!REG_P (cmp))
4833 cmp = copy_to_mode_reg (mode, cmp);
4834 if (!REG_P (src))
4835 src = copy_to_mode_reg (mode, src);
4836
4837 if (mode == SImode)
4838 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4839 else
4840 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4841
4842 emit_insn (pat);
4843
4844 return target;
4845 }
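
/* Thus a use such as

     old = __builtin_nvptx_cmp_swap (&cell, expected, newval);

   yields the prior contents of CELL, storing NEWVAL only if they
   equalled EXPECTED; __builtin_nvptx_cmp_swapll is the 64-bit
   variant.  */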
4846
4847
4848 /* Codes for all the NVPTX builtins. */
4849 enum nvptx_builtins
4850 {
4851 NVPTX_BUILTIN_SHUFFLE,
4852 NVPTX_BUILTIN_SHUFFLELL,
4853 NVPTX_BUILTIN_WORKER_ADDR,
4854 NVPTX_BUILTIN_CMP_SWAP,
4855 NVPTX_BUILTIN_CMP_SWAPLL,
4856 NVPTX_BUILTIN_MAX
4857 };
4858
4859 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4860
4861 /* Return the NVPTX builtin for CODE. */
4862
4863 static tree
4864 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4865 {
4866 if (code >= NVPTX_BUILTIN_MAX)
4867 return error_mark_node;
4868
4869 return nvptx_builtin_decls[code];
4870 }
4871
4872 /* Set up all builtin functions for this target. */
4873
4874 static void
4875 nvptx_init_builtins (void)
4876 {
4877 #define DEF(ID, NAME, T) \
4878 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4879 = add_builtin_function ("__builtin_nvptx_" NAME, \
4880 build_function_type_list T, \
4881 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4882 #define ST sizetype
4883 #define UINT unsigned_type_node
4884 #define LLUINT long_long_unsigned_type_node
4885 #define PTRVOID ptr_type_node
4886
4887 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4888 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4889 DEF (WORKER_ADDR, "worker_addr",
4890 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4891 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4892 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4893
4894 #undef DEF
4895 #undef ST
4896 #undef UINT
4897 #undef LLUINT
4898 #undef PTRVOID
4899 }
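
/* As an example of the expansion, the first DEF above registers

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);

   under function code NVPTX_BUILTIN_SHUFFLE in the BUILT_IN_MD class,
   and likewise for the other four entries.  */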
4900
4901 /* Expand an expression EXP that calls a built-in function,
4902 with result going to TARGET if that's convenient
4903 (and in mode MODE if that's convenient).
4904 SUBTARGET may be used as the target for computing one of EXP's operands.
4905 IGNORE is nonzero if the value is to be ignored. */
4906
4907 static rtx
4908 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4909 machine_mode mode, int ignore)
4910 {
4911 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4912 switch (DECL_FUNCTION_CODE (fndecl))
4913 {
4914 case NVPTX_BUILTIN_SHUFFLE:
4915 case NVPTX_BUILTIN_SHUFFLELL:
4916 return nvptx_expand_shuffle (exp, target, mode, ignore);
4917
4918 case NVPTX_BUILTIN_WORKER_ADDR:
4919 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4920
4921 case NVPTX_BUILTIN_CMP_SWAP:
4922 case NVPTX_BUILTIN_CMP_SWAPLL:
4923 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4924
4925 default: gcc_unreachable ();
4926 }
4927 }
4928 \f
4929 /* Define dimension sizes for known hardware. */
4930 #define PTX_VECTOR_LENGTH 32
4931 #define PTX_WORKER_LENGTH 32
4932 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4933
4934 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4935
4936 static int
4937 nvptx_simt_vf ()
4938 {
4939 return PTX_VECTOR_LENGTH;
4940 }
4941
4942 /* Validate compute dimensions of an OpenACC offload or routine, fill
4943 in non-unity defaults. FN_LEVEL indicates the level at which a
4944 routine might spawn a loop. It is negative for non-routines. If
4945 DECL is null, we are validating the default dimensions. */
4946
4947 static bool
4948 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4949 {
4950 bool changed = false;
4951
4952 /* The vector size must be 32, unless this is a SEQ routine. */
4953 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4954 && dims[GOMP_DIM_VECTOR] >= 0
4955 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4956 {
4957 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4958 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4959 dims[GOMP_DIM_VECTOR]
4960 ? G_("using vector_length (%d), ignoring %d")
4961 : G_("using vector_length (%d), ignoring runtime setting"),
4962 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4963 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4964 changed = true;
4965 }
4966
4967 /* Check the num workers is not too large. */
4968 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4969 {
4970 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4971 "using num_workers (%d), ignoring %d",
4972 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4973 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4974 changed = true;
4975 }
4976
4977 if (!decl)
4978 {
4979 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4980 if (dims[GOMP_DIM_WORKER] < 0)
4981 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4982 if (dims[GOMP_DIM_GANG] < 0)
4983 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4984 changed = true;
4985 }
4986
4987 return changed;
4988 }
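
/* For instance (illustrative values), an offload region requesting
   vector_length (64) gets "using vector_length (32), ignoring 64" and
   is clamped; num_workers (64) is likewise clamped to 32; and when
   validating the defaults the dims become vector 32, worker 32, and
   gang 0 (deferred to the runtime).  */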
4989
4990 /* Return maximum dimension size, or zero for unbounded. */
4991
4992 static int
4993 nvptx_dim_limit (int axis)
4994 {
4995 switch (axis)
4996 {
4997 case GOMP_DIM_WORKER:
4998 return PTX_WORKER_LENGTH;
4999
5000 case GOMP_DIM_VECTOR:
5001 return PTX_VECTOR_LENGTH;
5002
5003 default:
5004 break;
5005 }
5006 return 0;
5007 }
5008
5009 /* Determine whether forks & joins are needed. */
5010
5011 static bool
5012 nvptx_goacc_fork_join (gcall *call, const int dims[],
5013 bool ARG_UNUSED (is_fork))
5014 {
5015 tree arg = gimple_call_arg (call, 2);
5016 unsigned axis = TREE_INT_CST_LOW (arg);
5017
5018 /* We only care about worker and vector partitioning. */
5019 if (axis < GOMP_DIM_WORKER)
5020 return false;
5021
5022 /* If the size is 1, there's no partitioning. */
5023 if (dims[axis] == 1)
5024 return false;
5025
5026 return true;
5027 }
5028
5029 /* Generate a PTX builtin function call that returns the address in
5030 the worker reduction buffer at OFFSET. TYPE is the type of the
5031 data at that location. */
5032
5033 static tree
5034 nvptx_get_worker_red_addr (tree type, tree offset)
5035 {
5036 machine_mode mode = TYPE_MODE (type);
5037 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
5038 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5039 tree align = build_int_cst (unsigned_type_node,
5040 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5041 tree call = build_call_expr (fndecl, 3, offset, size, align);
5042
5043 return fold_convert (build_pointer_type (type), call);
5044 }
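
/* E.g. for TYPE double this builds, in effect,

     (double *) __builtin_nvptx_worker_addr (offset, 8, 8);

   the size and alignment being those of DFmode.  */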
5045
5046 /* Emit a SHFL.DOWN by SHIFT lanes of VAR into DEST_VAR. This function
5047 will cast the variable if necessary. */
5048
5049 static void
5050 nvptx_generate_vector_shuffle (location_t loc,
5051 tree dest_var, tree var, unsigned shift,
5052 gimple_seq *seq)
5053 {
5054 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5055 tree_code code = NOP_EXPR;
5056 tree arg_type = unsigned_type_node;
5057 tree var_type = TREE_TYPE (var);
5058 tree dest_type = var_type;
5059
5060 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5061 var_type = TREE_TYPE (var_type);
5062
5063 if (TREE_CODE (var_type) == REAL_TYPE)
5064 code = VIEW_CONVERT_EXPR;
5065
5066 if (TYPE_SIZE (var_type)
5067 == TYPE_SIZE (long_long_unsigned_type_node))
5068 {
5069 fn = NVPTX_BUILTIN_SHUFFLELL;
5070 arg_type = long_long_unsigned_type_node;
5071 }
5072
5073 tree call = nvptx_builtin_decl (fn, true);
5074 tree bits = build_int_cst (unsigned_type_node, shift);
5075 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5076 tree expr;
5077
5078 if (var_type != dest_type)
5079 {
5080 /* Do real and imaginary parts separately. */
5081 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5082 real = fold_build1 (code, arg_type, real);
5083 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5084 real = fold_build1 (code, var_type, real);
5085
5086 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5087 imag = fold_build1 (code, arg_type, imag);
5088 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5089 imag = fold_build1 (code, var_type, imag);
5090
5091 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5092 }
5093 else
5094 {
5095 expr = fold_build1 (code, arg_type, var);
5096 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5097 expr = fold_build1 (code, dest_type, expr);
5098 }
5099
5100 gimplify_assign (dest_var, expr, seq);
5101 }
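
/* E.g. for a double VAR this amounts to

     dest = VC (__builtin_nvptx_shufflell (VC (var), shift, SHUFFLE_DOWN));

   where VC stands for a VIEW_CONVERT_EXPR to/from long long unsigned;
   a _Complex value is handled as two such shuffles recombined with
   COMPLEX_EXPR.  */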
5102
5103 /* Lazily generate the global lock var decl and return its address. */
5104
5105 static tree
5106 nvptx_global_lock_addr ()
5107 {
5108 tree v = global_lock_var;
5109
5110 if (!v)
5111 {
5112 tree name = get_identifier ("__reduction_lock");
5113 tree type = build_qualified_type (unsigned_type_node,
5114 TYPE_QUAL_VOLATILE);
5115 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5116 global_lock_var = v;
5117 DECL_ARTIFICIAL (v) = 1;
5118 DECL_EXTERNAL (v) = 1;
5119 TREE_STATIC (v) = 1;
5120 TREE_PUBLIC (v) = 1;
5121 TREE_USED (v) = 1;
5122 mark_addressable (v);
5123 mark_decl_referenced (v);
5124 }
5125
5126 return build_fold_addr_expr (v);
5127 }
5128
5129 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5130 GSI. We use a lockless scheme for nearly all cases, which looks
5131 like:
5132 actual = initval(OP);
5133 do {
5134 guess = actual;
5135 write = guess OP myval;
5136 actual = cmp&swap (ptr, guess, write)
5137 } while (actual bit-different-to guess);
5138 return write;
5139
5140 This relies on a cmp&swap instruction, which is available for 32-
5141 and 64-bit types. Larger types must use a locking scheme. */
5142
5143 static tree
5144 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5145 tree ptr, tree var, tree_code op)
5146 {
5147 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5148 tree_code code = NOP_EXPR;
5149 tree arg_type = unsigned_type_node;
5150 tree var_type = TREE_TYPE (var);
5151
5152 if (TREE_CODE (var_type) == COMPLEX_TYPE
5153 || TREE_CODE (var_type) == REAL_TYPE)
5154 code = VIEW_CONVERT_EXPR;
5155
5156 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5157 {
5158 arg_type = long_long_unsigned_type_node;
5159 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5160 }
5161
5162 tree swap_fn = nvptx_builtin_decl (fn, true);
5163
5164 gimple_seq init_seq = NULL;
5165 tree init_var = make_ssa_name (arg_type);
5166 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5167 init_expr = fold_build1 (code, arg_type, init_expr);
5168 gimplify_assign (init_var, init_expr, &init_seq);
5169 gimple *init_end = gimple_seq_last (init_seq);
5170
5171 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5172
5173 /* Split the block just after the init stmts. */
5174 basic_block pre_bb = gsi_bb (*gsi);
5175 edge pre_edge = split_block (pre_bb, init_end);
5176 basic_block loop_bb = pre_edge->dest;
5177 pre_bb = pre_edge->src;
5178 /* Reset the iterator. */
5179 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5180
5181 tree expect_var = make_ssa_name (arg_type);
5182 tree actual_var = make_ssa_name (arg_type);
5183 tree write_var = make_ssa_name (arg_type);
5184
5185 /* Build and insert the reduction calculation. */
5186 gimple_seq red_seq = NULL;
5187 tree write_expr = fold_build1 (code, var_type, expect_var);
5188 write_expr = fold_build2 (op, var_type, write_expr, var);
5189 write_expr = fold_build1 (code, arg_type, write_expr);
5190 gimplify_assign (write_var, write_expr, &red_seq);
5191
5192 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5193
5194 /* Build & insert the cmp&swap sequence. */
5195 gimple_seq latch_seq = NULL;
5196 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5197 ptr, expect_var, write_var);
5198 gimplify_assign (actual_var, swap_expr, &latch_seq);
5199
5200 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5201 NULL_TREE, NULL_TREE);
5202 gimple_seq_add_stmt (&latch_seq, cond);
5203
5204 gimple *latch_end = gimple_seq_last (latch_seq);
5205 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5206
5207 /* Split the block just after the latch stmts. */
5208 edge post_edge = split_block (loop_bb, latch_end);
5209 basic_block post_bb = post_edge->dest;
5210 loop_bb = post_edge->src;
5211 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5212
5213 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5214 post_edge->probability = profile_probability::even ();
5215 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5216 loop_edge->probability = profile_probability::even ();
5217 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5218 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5219
5220 gphi *phi = create_phi_node (expect_var, loop_bb);
5221 add_phi_arg (phi, init_var, pre_edge, loc);
5222 add_phi_arg (phi, actual_var, loop_edge, loc);
5223
5224 loop *loop = alloc_loop ();
5225 loop->header = loop_bb;
5226 loop->latch = loop_bb;
5227 add_loop (loop, loop_bb->loop_father);
5228
5229 return fold_build1 (code, var_type, write_var);
5230 }
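
/* The control flow constructed above is:

     pre_bb:   init = initval (OP);
     loop_bb:  expect = PHI <init (pre_bb), actual (loop_bb)>;
	       write = expect OP var;
	       actual = cmp&swap (ptr, expect, write);
	       if (actual == expect) goto post_bb; else goto loop_bb;
     post_bb:  ...

   with loop_bb entered into the loop tree as a single-block loop.  */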
5231
5232 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5233 GSI. This is necessary for types larger than 64 bits, where there
5234 is no cmp&swap instruction to implement a lockless scheme. We use
5235 a lock variable in global memory.
5236
5237 while (cmp&swap (&lock_var, 0, 1))
5238 continue;
5239 T accum = *ptr;
5240 accum = accum OP var;
5241 *ptr = accum;
5242 cmp&swap (&lock_var, 1, 0);
5243 return accum;
5244
5245 A lock in global memory is necessary to force execution engine
5246 descheduling and avoid resource starvation that can occur if the
5247 lock is in .shared memory. */
5248
5249 static tree
5250 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5251 tree ptr, tree var, tree_code op)
5252 {
5253 tree var_type = TREE_TYPE (var);
5254 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5255 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5256 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5257
5258 /* Split the block just before the gsi. Insert a gimple nop to make
5259 this easier. */
5260 gimple *nop = gimple_build_nop ();
5261 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5262 basic_block entry_bb = gsi_bb (*gsi);
5263 edge entry_edge = split_block (entry_bb, nop);
5264 basic_block lock_bb = entry_edge->dest;
5265 /* Reset the iterator. */
5266 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5267
5268 /* Build and insert the locking sequence. */
5269 gimple_seq lock_seq = NULL;
5270 tree lock_var = make_ssa_name (unsigned_type_node);
5271 tree lock_expr = nvptx_global_lock_addr ();
5272 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5273 uns_unlocked, uns_locked);
5274 gimplify_assign (lock_var, lock_expr, &lock_seq);
5275 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5276 NULL_TREE, NULL_TREE);
5277 gimple_seq_add_stmt (&lock_seq, cond);
5278 gimple *lock_end = gimple_seq_last (lock_seq);
5279 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5280
5281 /* Split the block just after the lock sequence. */
5282 edge locked_edge = split_block (lock_bb, lock_end);
5283 basic_block update_bb = locked_edge->dest;
5284 lock_bb = locked_edge->src;
5285 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5286
5287 /* Create the lock loop ... */
5288 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5289 locked_edge->probability = profile_probability::even ();
5290 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5291 loop_edge->probability = profile_probability::even ();
5292 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5293 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5294
5295 /* ... and the loop structure. */
5296 loop *lock_loop = alloc_loop ();
5297 lock_loop->header = lock_bb;
5298 lock_loop->latch = lock_bb;
5299 lock_loop->nb_iterations_estimate = 1;
5300 lock_loop->any_estimate = true;
5301 add_loop (lock_loop, entry_bb->loop_father);
5302
5303 /* Build and insert the reduction calculation. */
5304 gimple_seq red_seq = NULL;
5305 tree acc_in = make_ssa_name (var_type);
5306 tree ref_in = build_simple_mem_ref (ptr);
5307 TREE_THIS_VOLATILE (ref_in) = 1;
5308 gimplify_assign (acc_in, ref_in, &red_seq);
5309
5310 tree acc_out = make_ssa_name (var_type);
5311 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5312 gimplify_assign (acc_out, update_expr, &red_seq);
5313
5314 tree ref_out = build_simple_mem_ref (ptr);
5315 TREE_THIS_VOLATILE (ref_out) = 1;
5316 gimplify_assign (ref_out, acc_out, &red_seq);
5317
5318 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5319
5320 /* Build & insert the unlock sequence. */
5321 gimple_seq unlock_seq = NULL;
5322 tree unlock_expr = nvptx_global_lock_addr ();
5323 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5324 uns_locked, uns_unlocked);
5325 gimplify_and_add (unlock_expr, &unlock_seq);
5326 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5327
5328 return acc_out;
5329 }
5330
5331 /* Emit a sequence to update a reduction accumulator at *PTR with the
5332 value held in VAR using operator OP. Return the updated value.
5333
5334 TODO: optimize for atomic ops and independent complex ops. */
5335
5336 static tree
5337 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5338 tree ptr, tree var, tree_code op)
5339 {
5340 tree type = TREE_TYPE (var);
5341 tree size = TYPE_SIZE (type);
5342
5343 if (size == TYPE_SIZE (unsigned_type_node)
5344 || size == TYPE_SIZE (long_long_unsigned_type_node))
5345 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5346 else
5347 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5348 }
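
/* So, for example, float and double reductions take the lockless
   path, while a wider type such as _Complex double falls back to the
   global-lock path.  */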
5349
5350 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5351
5352 static void
5353 nvptx_goacc_reduction_setup (gcall *call)
5354 {
5355 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5356 tree lhs = gimple_call_lhs (call);
5357 tree var = gimple_call_arg (call, 2);
5358 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5359 gimple_seq seq = NULL;
5360
5361 push_gimplify_context (true);
5362
5363 if (level != GOMP_DIM_GANG)
5364 {
5365 /* Copy the receiver object. */
5366 tree ref_to_res = gimple_call_arg (call, 1);
5367
5368 if (!integer_zerop (ref_to_res))
5369 var = build_simple_mem_ref (ref_to_res);
5370 }
5371
5372 if (level == GOMP_DIM_WORKER)
5373 {
5374 /* Store incoming value to worker reduction buffer. */
5375 tree offset = gimple_call_arg (call, 5);
5376 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5377 tree ptr = make_ssa_name (TREE_TYPE (call));
5378
5379 gimplify_assign (ptr, call, &seq);
5380 tree ref = build_simple_mem_ref (ptr);
5381 TREE_THIS_VOLATILE (ref) = 1;
5382 gimplify_assign (ref, var, &seq);
5383 }
5384
5385 if (lhs)
5386 gimplify_assign (lhs, var, &seq);
5387
5388 pop_gimplify_context (NULL);
5389 gsi_replace_with_seq (&gsi, seq, true);
5390 }
5391
5392 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5393
5394 static void
5395 nvptx_goacc_reduction_init (gcall *call)
5396 {
5397 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5398 tree lhs = gimple_call_lhs (call);
5399 tree var = gimple_call_arg (call, 2);
5400 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5401 enum tree_code rcode
5402 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5403 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5404 TREE_TYPE (var));
5405 gimple_seq seq = NULL;
5406
5407 push_gimplify_context (true);
5408
5409 if (level == GOMP_DIM_VECTOR)
5410 {
5411 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
5412 tree tid = make_ssa_name (integer_type_node);
5413 tree dim_vector = gimple_call_arg (call, 3);
5414 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5415 dim_vector);
5416 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5417 NULL_TREE, NULL_TREE);
5418
5419 gimple_call_set_lhs (tid_call, tid);
5420 gimple_seq_add_stmt (&seq, tid_call);
5421 gimple_seq_add_stmt (&seq, cond_stmt);
5422
5423 /* Split the block just after the call. */
5424 edge init_edge = split_block (gsi_bb (gsi), call);
5425 basic_block init_bb = init_edge->dest;
5426 basic_block call_bb = init_edge->src;
5427
5428 /* Fixup flags from call_bb to init_bb. */
5429 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5430 init_edge->probability = profile_probability::even ();
5431
5432 /* Set the initialization stmts. */
5433 gimple_seq init_seq = NULL;
5434 tree init_var = make_ssa_name (TREE_TYPE (var));
5435 gimplify_assign (init_var, init, &init_seq);
5436 gsi = gsi_start_bb (init_bb);
5437 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5438
5439 /* Split block just after the init stmt. */
5440 gsi_prev (&gsi);
5441 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5442 basic_block dst_bb = inited_edge->dest;
5443
5444 /* Create false edge from call_bb to dst_bb. */
5445 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5446 nop_edge->probability = profile_probability::even ();
5447
5448 /* Create phi node in dst block. */
5449 gphi *phi = create_phi_node (lhs, dst_bb);
5450 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5451 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5452
5453 /* Reset dominator of dst bb. */
5454 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5455
5456 /* Reset the gsi. */
5457 gsi = gsi_for_stmt (call);
5458 }
5459 else
5460 {
5461 if (level == GOMP_DIM_GANG)
5462 {
5463 /* If there's no receiver object, propagate the incoming VAR. */
5464 tree ref_to_res = gimple_call_arg (call, 1);
5465 if (integer_zerop (ref_to_res))
5466 init = var;
5467 }
5468
5469 gimplify_assign (lhs, init, &seq);
5470 }
5471
5472 pop_gimplify_context (NULL);
5473 gsi_replace_with_seq (&gsi, seq, true);
5474 }
5475
5476 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5477
5478 static void
5479 nvptx_goacc_reduction_fini (gcall *call)
5480 {
5481 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5482 tree lhs = gimple_call_lhs (call);
5483 tree ref_to_res = gimple_call_arg (call, 1);
5484 tree var = gimple_call_arg (call, 2);
5485 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5486 enum tree_code op
5487 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5488 gimple_seq seq = NULL;
5489 tree r = NULL_TREE;
5490
5491 push_gimplify_context (true);
5492
5493 if (level == GOMP_DIM_VECTOR)
5494 {
5495 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
5496 but that requires a method of emitting a unified jump at the
5497 gimple level. */
5498 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5499 {
5500 tree other_var = make_ssa_name (TREE_TYPE (var));
5501 nvptx_generate_vector_shuffle (gimple_location (call),
5502 other_var, var, shfl, &seq);
5503
5504 r = make_ssa_name (TREE_TYPE (var));
5505 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5506 var, other_var), &seq);
5507 var = r;
5508 }
5509 }
5510 else
5511 {
5512 tree accum = NULL_TREE;
5513
5514 if (level == GOMP_DIM_WORKER)
5515 {
5516 /* Get reduction buffer address. */
5517 tree offset = gimple_call_arg (call, 5);
5518 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5519 tree ptr = make_ssa_name (TREE_TYPE (call));
5520
5521 gimplify_assign (ptr, call, &seq);
5522 accum = ptr;
5523 }
5524 else if (integer_zerop (ref_to_res))
5525 r = var;
5526 else
5527 accum = ref_to_res;
5528
5529 if (accum)
5530 {
5531 /* UPDATE the accumulator. */
5532 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5533 seq = NULL;
5534 r = nvptx_reduction_update (gimple_location (call), &gsi,
5535 accum, var, op);
5536 }
5537 }
5538
5539 if (lhs)
5540 gimplify_assign (lhs, r, &seq);
5541 pop_gimplify_context (NULL);
5542
5543 gsi_replace_with_seq (&gsi, seq, true);
5544 }
5545
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
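
/* Overview: the four GOACC_REDUCTION_* variants dispatched below arrive
   in pairs around an OpenACC reduction -- SETUP/TEARDOWN bracket the
   construct, INIT/FINI bracket the loop -- once per partitioned level.
   They reach this expander through the TARGET_GOACC_REDUCTION hook
   (registered near the end of this file) during OpenACC device
   lowering.  */
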
/* NVPTX reduction expander.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}
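
/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Answering true for every
   (MODE, X) pair means constants are never forced into the constant
   pool; this port keeps them as immediate operands instead.  */
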
static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
                              rtx x ATTRIBUTE_UNUSED)
{
  return true;
}
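
/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SImode and V2DImode
   are supported, matching the preferred SIMD modes returned below.  */
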
static bool
nvptx_vector_mode_supported (machine_mode mode)
{
  return (mode == V2SImode
          || mode == V2DImode);
}

/* Return the preferred mode for vectorizing scalar MODE.  */

static machine_mode
nvptx_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_DImode:
      return V2DImode;
    case E_SImode:
      return V2SImode;

    default:
      return default_preferred_simd_mode (mode);
    }
}
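
/* Return the alignment, in bits, for an object of integral TYPE whose
   natural alignment is BASIC_ALIGN.  This port splits TImode values
   into two DImode registers, so 128-bit integers only need the
   alignment of the split mode.  */
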
unsigned int
nvptx_data_alignment (const_tree type, unsigned int basic_align)
{
  if (TREE_CODE (type) == INTEGER_TYPE)
    {
      unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
      if (size == GET_MODE_SIZE (TImode))
        return GET_MODE_BITSIZE (maybe_split_mode (TImode));
    }

  return basic_align;
}

/* Implement TARGET_MODES_TIEABLE_P.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}

/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}
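
/* Target hook table.  Each #undef/#define pair below overrides the
   default entry that target-def.h would otherwise supply;
   TARGET_INITIALIZER at the end of the file gathers them all into
   targetm.  */
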
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
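
/* Build the hook vector from the TARGET_* overrides above.  targetm is
   what the rest of the compiler consults for target-specific
   behavior.  */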
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"