/* Target code for NVPTX.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

#define SHUFFLE_UP 0
#define SHUFFLE_DOWN 1
#define SHUFFLE_BFLY 2
#define SHUFFLE_IDX 3
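/* These values select the PTX shfl modes .up, .down, .bfly and .idx
   respectively; see the 'S' operand modifier in nvptx_print_operand.  */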

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
#define worker_bcast_name "__worker_bcast"
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
#define worker_red_name "__worker_red"
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->ret_reg_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return the mode to be used when declaring a ptx object for OBJ.
   For objects with subparts such as complex modes this is the mode
   of the subpart.  */

machine_mode
nvptx_underlying_object_mode (rtx obj)
{
  if (GET_CODE (obj) == SUBREG)
    obj = SUBREG_REG (obj);
  machine_mode mode = GET_MODE (obj);
  if (mode == TImode)
    return DImode;
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);
  return mode;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
        return ".u32";
      else
        return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}

/* Determine the address space to use for SYMBOL_REF SYM.  */

static addr_space_t
nvptx_addr_space_from_sym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);
  if (decl == NULL_TREE || TREE_CODE (decl) == FUNCTION_DECL)
    return ADDR_SPACE_GENERIC;

  bool is_const = (CONSTANT_CLASS_P (decl)
                   || TREE_CODE (decl) == CONST_DECL
                   || TREE_READONLY (decl));
  if (is_const)
    return ADDR_SPACE_CONST;

  return ADDR_SPACE_GLOBAL;
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

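/* For example, TImode is handled as two DImode registers, DCmode as two
   DFmode registers, and SCmode as two SFmode registers.  */
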
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
         it creates a block with a single successor before entering a
         partitioned region.  That is a good candidate for the end of
         an SESE region.  */
      if (!is_call)
        emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
         predecessor for the block the join insn ends up in.  This is
         needed for skipping entire loops.  */
      if (!is_call)
        emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

#define PASS_IN_REG_P(MODE, TYPE)				\
  ((GET_MODE_CLASS (MODE) == MODE_INT				\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
    || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT		\
	 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT)	\
	&& !AGGREGATE_TYPE_P (TYPE)))				\
   && (MODE) != TImode)

#define RETURN_IN_REG_P(MODE)			\
  ((GET_MODE_CLASS (MODE) == MODE_INT		\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT)	\
   && GET_MODE_SIZE (MODE) <= 8)
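/* So scalar integer and floating-point values pass in registers,
   non-aggregate complex values pass as a pair of registers, while TImode
   and aggregate values take the memory path; returns additionally require
   the value to fit in 8 bytes.  */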
\f
/* Perform a mode promotion for a function argument with MODE.  Return
   the promoted mode.  */

static machine_mode
arg_promotion (machine_mode mode)
{
  if (mode == QImode || mode == HImode)
    return SImode;
  return mode;
}

/* Write the declaration of a function arg of TYPE to S.  I is the index
   of the argument, MODE its mode.  NO_ARG_TYPES is true if this is for
   a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl.  */

static int
write_one_arg (std::stringstream &s, const char *sep, int i,
               tree type, machine_mode mode, bool no_arg_types)
{
  if (!PASS_IN_REG_P (mode, type))
    mode = Pmode;

  machine_mode split = maybe_split_mode (mode);
  if (split != VOIDmode)
    {
      i = write_one_arg (s, sep, i, TREE_TYPE (type), split, false);
      sep = ", ";
      mode = split;
    }

  if (no_arg_types && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
        mode = DFmode;
      mode = arg_promotion (mode);
    }

  s << sep;
  s << ".param" << nvptx_ptx_type_from_mode (mode, false) << " %in_ar"
    << i << (mode == QImode || mode == HImode ? "[1]" : "");
  if (mode == BLKmode)
    s << "[" << int_size_in_bytes (type) << "]";
  return i + 1;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
          || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
                const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
        name++;
    }

  /* Emit the linker marker.  */
  s << "\n// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF" : "DECL") << ": " << name << "\n";

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* Declare the result.  */
  bool return_in_mem = false;
  if (TYPE_MODE (result_type) != VOIDmode)
    {
      machine_mode mode = TYPE_MODE (result_type);
      if (!RETURN_IN_REG_P (mode))
        return_in_mem = true;
      else
        {
          mode = arg_promotion (mode);
          s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
            << " %out_retval) ";
        }
    }

  s << name;

  const char *sep = " (";
  int i = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    {
      s << sep << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_ar0";
      sep = ", ";
      i++;
    }

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool null_type_args = !args;
  if (null_type_args)
    args = DECL_ARGUMENTS (decl);

  for (; args; args = TREE_CHAIN (args))
    {
      tree type = null_type_args ? TREE_TYPE (args) : TREE_VALUE (args);
      machine_mode mode = TYPE_MODE (type);

      if (mode == VOIDmode)
        break;
      i = write_one_arg (s, sep, i, type, mode, null_type_args);
      sep = ", ";
    }

  if (stdarg_p (fntype))
    {
      s << sep << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_argp";
      i++;
      sep = ", ";
    }

  if (DECL_STATIC_CHAIN (decl))
    {
      s << sep << ".reg.u" << GET_MODE_BITSIZE (Pmode)
        << reg_names [STATIC_CHAIN_REGNUM];
      i++;
      sep = ", ";
    }

  if (!i && strcmp (name, "main") == 0)
    {
      s << sep
        << ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode)
        << " %argv";
      i++;
      sep = ", ";
    }

  if (i)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

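/* As an illustration (exact output depends on target flags), a definition
   of "int f (int x)" comes out roughly as:

     // BEGIN GLOBAL FUNCTION DECL: f
     .visible .func (.param.u32 %out_retval) f (.param.u32 %in_ar0);
     // BEGIN GLOBAL FUNCTION DEF: f
     .visible .func (.param.u32 %out_retval) f (.param.u32 %in_ar0)  */
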
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
                          rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      s << "\n// BEGIN GLOBAL FUNCTION DECL: " << name << "\n";
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    s << "(.param"
      << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)), false)
      << " %rval) ";

  s << name;

  const char *sep = " (";
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting here, as that was
         already done when generating the call sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      s << sep
        << ".param"
        << nvptx_ptx_type_from_mode (mode, false)
        << " %arg"
        << i;
      if (mode == QImode || mode == HImode)
        s << "[1]";
      sep = ", ";
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
        *slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}

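/* For instance, with REGNO 2 and NAME "y" the above emits:

	{
		.reg.u32	%y;
		mov.u32	%y, %tid.y;
		setp.ne.u32	%r2, %y, 0;
	}  */
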
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  fprintf (file, "%s", s.str().c_str());
  fprintf (file, "{\n");

  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
                        && !RETURN_IN_REG_P (TYPE_MODE (result_type)));
  if (return_in_mem)
    {
      fprintf (file, "\t.reg.u%d %%ar%d;\n", GET_MODE_BITSIZE (Pmode), argno);
      fprintf (file, "\tld.param.u%d %%ar%d, [%%in_ar%d];\n",
               GET_MODE_BITSIZE (Pmode), argno, argno);
      argno++;
    }

  /* Declare and initialize incoming arguments.  */
  tree args = DECL_ARGUMENTS (decl);
  bool prototyped = false;
  if (TYPE_ARG_TYPES (fntype))
    {
      args = TYPE_ARG_TYPES (fntype);
      prototyped = true;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
      machine_mode mode = TYPE_MODE (type);
      int count = 1;

      if (mode == VOIDmode)
        break;

      if (!PASS_IN_REG_P (mode, type))
        mode = Pmode;

      machine_mode split = maybe_split_mode (mode);
      if (split != VOIDmode)
        {
          count = 2;
          mode = split;
        }
      else if (!prototyped && !AGGREGATE_TYPE_P (type) && mode == SFmode)
        mode = DFmode;

      mode = arg_promotion (mode);
      while (count--)
        {
          fprintf (file, "\t.reg%s %%ar%d;\n",
                   nvptx_ptx_type_from_mode (mode, false), argno);
          fprintf (file, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
                   nvptx_ptx_type_from_mode (mode, false), argno, argno);
          argno++;
        }
    }

  /* C++11 ABI causes us to return a reference to the passed in
     pointer for return_in_mem.  */
  if (cfun->machine->ret_reg_mode != VOIDmode)
    {
      machine_mode mode = arg_promotion
        ((machine_mode)cfun->machine->ret_reg_mode);
      fprintf (file, "\t.reg%s %%retval;\n",
               nvptx_ptx_type_from_mode (mode, false));
    }

  if (stdarg_p (fntype))
    {
      fprintf (file, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode));
      fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
               GET_MODE_BITSIZE (Pmode));
    }

  fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
           reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
        {
          machine_mode mode = PSEUDO_REGNO_MODE (i);
          machine_mode split = maybe_split_mode (mode);
          if (split != VOIDmode)
            {
              fprintf (file, "\t.reg%s %%r%d$%d;\n",
                       nvptx_ptx_type_from_mode (split, true), i, 0);
              fprintf (file, "\t.reg%s %%r%d$%d;\n",
                       nvptx_ptx_type_from_mode (split, true), i, 1);
            }
          else
            fprintf (file, "\t.reg%s %%r%d;\n",
                     nvptx_ptx_type_from_mode (mode, true), i);
        }
    }

  /* The only reason we might be using outgoing args is if we call a stdargs
     function.  Allocate the space for this.  If we called varargs functions
     without passing any variadic arguments, we'll see a reference to outargs
     even with a zero outgoing_args_size.  */
  HOST_WIDE_INT sz = crtl->outgoing_args_size;
  if (sz == 0)
    sz = 1;
  if (cfun->machine->has_call_with_varargs)
    {
      fprintf (file, "\t.reg.u%d %%outargs;\n"
               "\t.local.align 8 .b8 %%outargs_ar["
               HOST_WIDE_INT_PRINT_DEC"];\n",
               BITS_PER_WORD, sz);
      fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
               BITS_PER_WORD);
    }

  if (cfun->machine->punning_buffer_size > 0)
    {
      fprintf (file, "\t.reg.u%d %%punbuffer;\n"
               "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
               BITS_PER_WORD, cfun->machine->punning_buffer_size);
      fprintf (file, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
               BITS_PER_WORD);
    }

  /* Declare a local variable for the frame.  */
  sz = get_frame_size ();
  if (sz > 0 || cfun->machine->has_call_with_sc)
    {
      int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;

      fprintf (file, "\t.reg.u%d %%frame;\n"
               "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
               BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
               BITS_PER_WORD);
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
                               REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;

  if (mode != VOIDmode)
    {
      mode = arg_promotion (mode);
      fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
               nvptx_ptx_type_from_mode (mode, false));
    }

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}
\f
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree funtype)
{
  if (cfun->machine->start_call == NULL_RTX)
    {
      cfun->machine->call_args = NULL;
      cfun->machine->funtype = funtype;
      cfun->machine->start_call = const0_rtx;
    }
  if (arg == pc_rtx)
    return;

  rtx_expr_list *args_so_far = cfun->machine->call_args;
  if (REG_P (arg))
    cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->start_call = NULL_RTX;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  int nargs = 0;
  rtx callee = XEXP (address, 0);
  rtx pat, t;
  rtvec vec;
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
    nargs++;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
        {
          if (DECL_STATIC_CHAIN (decl))
            cfun->machine->has_call_with_sc = true;

          tree attr = get_oacc_fn_attrib (decl);
          if (attr)
            {
              tree dims = TREE_VALUE (attr);

              parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
              for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
                {
                  if (TREE_PURPOSE (dims)
                      && !integer_zerop (TREE_PURPOSE (dims)))
                    break;
                  /* Not on this axis.  */
                  parallel ^= GOMP_DIM_MASK (ix);
                  dims = TREE_CHAIN (dims);
                }
            }
        }
    }

  if (cfun->machine->funtype
      /* It's possible to construct testcases where we call a variable.
         See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
         in such a case.  */
      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
          || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
      && stdarg_p (cfun->machine->funtype))
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
      cfun->machine->has_call_with_varargs = true;
    }
  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
  pat = gen_rtx_PARALLEL (VOIDmode, vec);

  int vec_pos = 0;

  rtx tmp_retval = retval;
  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  if (retval != NULL_RTX)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
        tmp_retval = gen_reg_rtx (GET_MODE (retval));
      t = gen_rtx_SET (tmp_retval, t);
    }
  XVECEXP (pat, 0, vec_pos++) = t;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    {
      rtx this_arg = XEXP (arg, 0);
      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
    }

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

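/* The constructed pattern has the shape
     (parallel [(set RETVAL (call ADDRESS 0)) (use ARG1) ... (use VARARGS)])
   with the SET omitted for calls that produce no value; the USEs carry the
   argument pseudos through to nvptx_output_call_insn.  */
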
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, machine_mode mode,
                    const_tree, bool named)
{
  if (mode == VOIDmode)
    return NULL_RTX;

  if (named)
    return gen_reg_rtx (mode);
  return NULL_RTX;
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
                             const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == VOIDmode)
    return NULL_RTX;

  if (!named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
                         gen_rtvec (1, GEN_INT (cum->count)),
                         UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  Each argument occupies its own
   PTX .param slot, so only the argument count advances; the mode, type
   and namedness of the argument are irrelevant here.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
                            machine_mode ARG_UNUSED (mode),
                            const_tree ARG_UNUSED (type),
                            bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  */

static unsigned int
nvptx_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);

  if (boundary > BITS_PER_WORD)
    return 2 * BITS_PER_WORD;

  if (mode == BLKmode)
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);
      if (size > 4)
        return 2 * BITS_PER_WORD;
      if (boundary < BITS_PER_WORD)
        {
          if (size >= 3)
            return BITS_PER_WORD;
          if (size >= 2)
            return 2 * BITS_PER_UNIT;
        }
    }
  return boundary;
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
                      bool outgoing)
{
  int unsignedp = TYPE_UNSIGNED (type);
  machine_mode orig_mode = TYPE_MODE (type);
  machine_mode mode = promote_function_mode (type, orig_mode,
                                             &unsignedp, NULL_TREE, 1);
  if (outgoing)
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
                         const_tree type, bool)
{
  return !PASS_IN_REG_P (mode, type);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  machine_mode mode = TYPE_MODE (type);
  if (!RETURN_IN_REG_P (mode))
    return true;
  return false;
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
                             int *punsignedp,
                             const_tree funtype, int for_return)
{
  if (type == NULL_TREE)
    return mode;
  if (for_return)
    return promote_mode (type, mode, punsignedp);
  /* For K&R-style functions, try to match the language promotion rules to
     minimize type mismatches at assembly time.  */
  if (TYPE_ARG_TYPES (funtype) == NULL_TREE
      && type != NULL_TREE
      && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
        mode = DFmode;
      mode = arg_promotion (mode);
    }

  return mode;
}

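/* This mirrors the C default argument promotions for unprototyped callees:
   float widens to double and char/short widen to int, so the emitted
   .param types match on both sides of such calls.  */
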
/* Implement TARGET_STATIC_CHAIN.  */

static rtx
nvptx_static_chain (const_tree fndecl, bool incoming_p)
{
  if (!DECL_STATIC_CHAIN (fndecl))
    return NULL;

  if (incoming_p)
    return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
  else
    return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
}
\f
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
                            XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to shuffle register SRC into DST
   across the lanes of a single warp, using shuffle kind KIND and lane
   operand IDX.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
        rtx tmp0 = gen_reg_rtx (SImode);
        rtx tmp1 = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
        emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
        emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
        emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
        res = get_insns ();
        end_sequence ();
      }
      break;
    case BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
        emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset;  /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
        rtx tmp = gen_reg_rtx (SImode);

        start_sequence ();
        if (pm & PM_read)
          emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
        emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
        if (pm & PM_write)
          emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
        res = get_insns ();
        end_sequence ();
      }
      break;

    default:
      {
        rtx addr = data->ptr;

        if (!addr)
          {
            unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

            if (align > worker_bcast_align)
              worker_bcast_align = align;
            data->offset = (data->offset + align - 1) & ~(align - 1);
            addr = data->base;
            if (data->offset)
              addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
          }

        addr = gen_rtx_MEM (mode, addr);
        addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
        if (pm == PM_read)
          res = gen_rtx_SET (addr, reg);
        else if (pm == PM_write)
          res = gen_rtx_SET (reg, addr);
        else
          gcc_unreachable ();

        if (data->ptr)
          {
            /* We're using a ptr, increment it.  */
            start_sequence ();

            emit_insn (res);
            emit_insn (gen_adddi3 (data->ptr, data->ptr,
                                   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
            res = get_insns ();
            end_sequence ();
          }
        else
          rep = 1;
        data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}

/* When loading an operand OP, verify whether an address space
   conversion to generic is required, and if so, perform it.  Check
   for SYMBOL_REFs and record them if needed.  Return either the
   original operand, or the converted one.  */

rtx
nvptx_maybe_convert_symbolic_operand (rtx op)
{
  if (GET_MODE (op) != Pmode)
    return op;

  rtx sym = op;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (sym, 0);
  if (GET_CODE (sym) == PLUS)
    sym = XEXP (sym, 0);

  if (GET_CODE (sym) != SYMBOL_REF)
    return op;

  nvptx_maybe_record_fnsym (sym);

  addr_space_t as = nvptx_addr_space_from_sym (sym);
  if (as == ADDR_SPACE_GENERIC)
    return op;

  enum unspec code;
  code = (as == ADDR_SPACE_GLOBAL ? UNSPEC_FROM_GLOBAL
          : as == ADDR_SPACE_LOCAL ? UNSPEC_FROM_LOCAL
          : as == ADDR_SPACE_SHARED ? UNSPEC_FROM_SHARED
          : as == ADDR_SPACE_CONST ? UNSPEC_FROM_CONST
          : UNSPEC_FROM_PARAM);

  rtx dest = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (dest,
                          gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op), code)));
  return dest;
}
\f
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
        return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}

/* Implement HARD_REGNO_MODE_OK.  We barely use hard regs, but we want
   to ensure that the return register's mode isn't changed.  */

bool
nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
{
  if (regno != NVPTX_RETURN_REGNUM
      || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
    return true;
  return mode == cfun->machine->ret_reg_mode;
}
\f
/* Convert an address space AS to the corresponding ptx string.  */

const char *
nvptx_section_from_addr_space (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_CONST:
      return ".const";

    case ADDR_SPACE_GLOBAL:
      return ".global";

    case ADDR_SPACE_SHARED:
      return ".shared";

    case ADDR_SPACE_GENERIC:
      return "";

    default:
      gcc_unreachable ();
    }
}

/* Determine whether DECL goes into .const or .global.  */

const char *
nvptx_section_for_decl (const_tree decl)
{
  bool is_const = (CONSTANT_CLASS_P (decl)
                   || TREE_CODE (decl) == CONST_DECL
                   || TREE_READONLY (decl));
  if (is_const)
    return ".const";

  return ".global";
}

\f
/* Machinery to output constant initializers.  When beginning an initializer,
   we decide on a chunk size (which is visible in ptx in the type used), and
   then all initializer data is buffered until a chunk is filled and ready to
   be written out.  */

/* Used when assembling integers to ensure data is emitted in
   pieces whose size matches the declaration we printed.  */
static unsigned int decl_chunk_size;
static machine_mode decl_chunk_mode;
/* Used in the same situation, to keep track of the byte offset
   into the initializer.  */
static unsigned HOST_WIDE_INT decl_offset;
/* The initializer part we are currently processing.  */
static HOST_WIDE_INT init_part;
/* The total size of the object.  */
static unsigned HOST_WIDE_INT object_size;
/* True if we found a skip extending to the end of the object.  Used to
   assert that no data follows.  */
static bool object_finished;

/* Write the necessary separator string to begin a new initializer value.  */

static void
begin_decl_field (void)
{
  /* We never see decl_offset at zero by the time we get here.  */
  if (decl_offset == decl_chunk_size)
    fprintf (asm_out_file, " = { ");
  else
    fprintf (asm_out_file, ", ");
}

/* Output the currently stored chunk as an initializer value.  */

static void
output_decl_chunk (void)
{
  begin_decl_field ();
  output_address (VOIDmode, gen_int_mode (init_part, decl_chunk_mode));
  init_part = 0;
}

/* Add value VAL sized SIZE to the data we're emitting, and keep writing
   out chunks as they fill up.  */

static void
nvptx_assemble_value (HOST_WIDE_INT val, unsigned int size)
{
  unsigned HOST_WIDE_INT chunk_offset = decl_offset % decl_chunk_size;
  gcc_assert (!object_finished);
  while (size > 0)
    {
      int this_part = size;
      if (chunk_offset + this_part > decl_chunk_size)
        this_part = decl_chunk_size - chunk_offset;
      HOST_WIDE_INT val_part;
      HOST_WIDE_INT mask = 2;
      mask <<= this_part * BITS_PER_UNIT - 1;
      val_part = val & (mask - 1);
      init_part |= val_part << (BITS_PER_UNIT * chunk_offset);
      val >>= BITS_PER_UNIT * this_part;
      size -= this_part;
      decl_offset += this_part;
      if (decl_offset % decl_chunk_size == 0)
        output_decl_chunk ();

      chunk_offset = 0;
    }
}

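/* For example, with a chunk size of 4, assembling the single bytes 0x11,
   0x22, 0x33, 0x44 in that order accumulates init_part as 0x44332211
   (each byte lands at an increasing bit offset) and the chunk is written
   out once the fourth byte completes it.  */
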
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      gcc_unreachable ();

    case CONST_INT:
      val = INTVAL (x);
      nvptx_assemble_value (val, size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == decl_chunk_size);
      if (decl_offset % decl_chunk_size != 0)
        sorry ("cannot emit unaligned pointers in ptx assembly");
      decl_offset += size;
      begin_decl_field ();

      nvptx_maybe_record_fnsym (x);
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, x);
      fprintf (asm_out_file, ")");

      if (val)
        fprintf (asm_out_file, " + " HOST_WIDE_INT_PRINT_DEC, val);
      break;
    }

  return true;
}

/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  if (decl_offset + size >= object_size)
    {
      if (decl_offset % decl_chunk_size != 0)
        nvptx_assemble_value (0, decl_chunk_size);
      object_finished = true;
      return;
    }

  while (size > decl_chunk_size)
    {
      nvptx_assemble_value (0, decl_chunk_size);
      size -= decl_chunk_size;
    }
  while (size-- > 0)
    nvptx_assemble_value (0, 1);
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (decl_offset != 0)
    {
      if (!object_finished && decl_offset % decl_chunk_size != 0)
        nvptx_assemble_value (0, decl_chunk_size);

      fprintf (asm_out_file, " }");
    }
  fprintf (asm_out_file, ";\n");
}

/* Start a declaration of a variable of TYPE with NAME to
   FILE.  IS_PUBLIC says whether this will be externally visible.
   Here we just write the linker hint and decide on the chunk size
   to use.  */

static void
init_output_initializer (FILE *file, const char *name, const_tree type,
                         bool is_public)
{
  fprintf (file, "\n// BEGIN%s VAR DEF: ", is_public ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputc ('\n', file);

  if (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);
  int sz = int_size_in_bytes (type);
  if ((TREE_CODE (type) != INTEGER_TYPE
       && TREE_CODE (type) != ENUMERAL_TYPE
       && TREE_CODE (type) != REAL_TYPE)
      || sz < 0
      || sz > HOST_BITS_PER_WIDE_INT)
    type = ptr_type_node;
  decl_chunk_size = int_size_in_bytes (type);
  decl_chunk_mode = int_mode_for_mode (TYPE_MODE (type));
  decl_offset = 0;
  init_part = 0;
  object_finished = false;
}

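/* The effect is that an array of ints is emitted as .u32 chunks and an
   array of doubles as .u64 chunks, while anything else (structs, or
   elements we cannot handle directly) falls back to pointer-sized
   chunks.  */
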
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
                                 const_tree exp, HOST_WIDE_INT size)
{
  tree type = TREE_TYPE (exp);
  init_output_initializer (file, name, type, false);
  fprintf (file, "\t.const .align %d .u%d ",
           TYPE_ALIGN (TREE_TYPE (exp)) / BITS_PER_UNIT,
           decl_chunk_size * BITS_PER_UNIT);
  assemble_name (file, name);
  fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
           (size + decl_chunk_size - 1) / decl_chunk_size);
  object_size = size;
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  if (decl && DECL_SIZE (decl))
    {
      tree type = TREE_TYPE (decl);
      unsigned HOST_WIDE_INT size;

      init_output_initializer (file, name, type, TREE_PUBLIC (decl));
      size = tree_to_uhwi (DECL_SIZE_UNIT (decl));
      const char *section = nvptx_section_for_decl (decl);
      fprintf (file, "\t%s%s .align %d .u%d ",
               !TREE_PUBLIC (decl) ? ""
               : DECL_WEAK (decl) ? ".weak" : ".visible",
               section, DECL_ALIGN (decl) / BITS_PER_UNIT,
               decl_chunk_size * BITS_PER_UNIT);
      assemble_name (file, name);
      if (size > 0)
        fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
                 (size + decl_chunk_size - 1) / decl_chunk_size);
      else
        object_finished = true;
      object_size = size;
    }
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */
static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return;
  const char *section = nvptx_section_for_decl (decl);
  fprintf (file, "\n// BEGIN%s VAR DECL: ",
           TREE_PUBLIC (decl) ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputs ("\n", file);
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (decl));
  fprintf (file, ".extern %s .b8 ", section);
  assemble_name_raw (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n\n");
}

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
             nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
                                       false));

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
          || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
        nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
        nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
               nvptx_ptx_type_from_mode (mode, false), argno,
               mode == QImode || mode == HImode ? "[1]" : "");
      fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
               nvptx_ptx_type_from_mode (mode, false), argno,
               REGNO (t));
    }

  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%%retval_in), ");

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open,
               reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  return result != NULL_RTX ? "\tld.param%t0\t%0, [%%retval_in];\n\t}" : "}";
}

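/* As an illustration, a direct call to "int f (int)" with the argument in
   %r22 and the result landing in %r23 comes out roughly as (register
   numbers are examples):

	{
		.param.u32 %retval_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r22;
		call (%retval_in), f, (%out_arg1);
		ld.param.u32	%r23, [%retval_in];
	}  */
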
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

static void nvptx_print_operand (FILE *, rtx, int);

/* Subroutine of nvptx_print_operand; used to print a memory reference X
   to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}

/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
        unconditional one.
   # -- print a rounding mode for the instruction

   A -- print an address space identifier for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   f -- print a full reg even for something that must always be split
   j -- print the "@" predicate prefix before the operand
   J -- print the "@!" negated-predicate prefix before the operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  rtx orig_x = x;
  machine_mode op_mode;

  if (code == '.')
    {
      x = current_insn_predicate;
      if (x)
        {
          unsigned int regno = REGNO (XEXP (x, 0));
          fputs ("[", file);
          if (GET_CODE (x) == EQ)
            fputs ("!", file);
          fputs (reg_names [regno], file);
          fputs ("]", file);
        }
      return;
    }
  else if (code == '#')
    {
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);

  switch (code)
    {
    case 'A':
      {
        addr_space_t as = ADDR_SPACE_GENERIC;
        rtx sym = XEXP (x, 0);

        if (GET_CODE (sym) == CONST)
          sym = XEXP (sym, 0);
        if (GET_CODE (sym) == PLUS)
          sym = XEXP (sym, 0);

        if (GET_CODE (sym) == SYMBOL_REF)
          as = nvptx_addr_space_from_sym (sym);

        fputs (nvptx_section_from_addr_space (as), file);
      }
      break;

    case 't':
      op_mode = nvptx_underlying_object_mode (x);
      fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
      break;

    case 'u':
      op_mode = nvptx_underlying_object_mode (x);
      fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
      break;

    case 'S':
      {
        unsigned kind = UINTVAL (x);
        static const char *const kinds[] =
          {"up", "down", "bfly", "idx"};
        fprintf (file, ".%s", kinds[kind]);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      op_mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
        {
        case EQ:
          fputs (".eq", file);
          break;
        case NE:
          if (FLOAT_MODE_P (op_mode))
            fputs (".neu", file);
          else
            fputs (".ne", file);
          break;
        case LE:
          fputs (".le", file);
          break;
        case GE:
          fputs (".ge", file);
          break;
        case LT:
          fputs (".lt", file);
          break;
        case GT:
          fputs (".gt", file);
          break;
        case LEU:
          fputs (".ls", file);
          break;
        case GEU:
          fputs (".hs", file);
          break;
        case LTU:
          fputs (".lo", file);
          break;
        case GTU:
          fputs (".hi", file);
          break;
        case LTGT:
          fputs (".ne", file);
          break;
        case UNEQ:
          fputs (".equ", file);
          break;
        case UNLE:
          fputs (".leu", file);
          break;
        case UNGE:
          fputs (".geu", file);
          break;
        case UNLT:
          fputs (".ltu", file);
          break;
        case UNGT:
          fputs (".gtu", file);
          break;
        case UNORDERED:
          fputs (".nan", file);
          break;
        case ORDERED:
          fputs (".num", file);
          break;
        default:
          gcc_unreachable ();
        }
      if (FLOAT_MODE_P (op_mode)
          || x_code == EQ || x_code == NE
          || x_code == GEU || x_code == GTU
          || x_code == LEU || x_code == LTU)
        fputs (nvptx_ptx_type_from_mode (op_mode, true), file);
      else
        fprintf (file, ".s%d", GET_MODE_BITSIZE (op_mode));
      break;
    default:
    common:
      switch (x_code)
        {
        case SUBREG:
          x = SUBREG_REG (x);
          /* fall through */

        case REG:
          if (HARD_REGISTER_P (x))
            fprintf (file, "%s", reg_names[REGNO (x)]);
          else
            fprintf (file, "%%r%d", REGNO (x));
          if (code != 'f' && maybe_split_mode (GET_MODE (x)) != VOIDmode)
            {
              gcc_assert (GET_CODE (orig_x) == SUBREG
                          && maybe_split_mode (GET_MODE (orig_x)) == VOIDmode);
              fprintf (file, "$%d", SUBREG_BYTE (orig_x) / UNITS_PER_WORD);
            }
          break;

        case MEM:
          fputc ('[', file);
          nvptx_print_address_operand (file, XEXP (x, 0), GET_MODE (x));
          fputc (']', file);
          break;

        case CONST_INT:
          output_addr_const (file, x);
          break;

        case CONST:
        case SYMBOL_REF:
        case LABEL_REF:
          /* We could use output_addr_const, but that can print things like
             "x-8", which breaks ptxas.  Need to ensure it is output as
             "x+-8".  */
          nvptx_print_address_operand (file, x, VOIDmode);
          break;

        case CONST_DOUBLE:
          long vals[2];
          real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), GET_MODE (x));
          vals[0] &= 0xffffffff;
          vals[1] &= 0xffffffff;
          if (GET_MODE (x) == SFmode)
            fprintf (file, "0f%08lx", vals[0]);
          else
            fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
          break;

        default:
          output_addr_const (file, x);
        }
    }
}
\f
2098 /* Record replacement regs used to deal with subreg operands. */
2099 struct reg_replace
2100 {
2101 rtx replacement[MAX_RECOG_OPERANDS];
2102 machine_mode mode;
2103 int n_allocated;
2104 int n_in_use;
2105 };
2106
2107 /* Allocate or reuse a replacement in R and return the rtx. */
2108
2109 static rtx
2110 get_replacement (struct reg_replace *r)
2111 {
2112 if (r->n_allocated == r->n_in_use)
2113 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2114 return r->replacement[r->n_in_use++];
2115 }
2116
2117 /* Clean up subreg operands. In PTX assembly, everything is typed, and
2118 the presence of subregs would break the rules for most instructions.
2119 Replace them with a suitable new register of the right size, plus
2120 conversion copyin/copyout instructions. */
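
/* For instance (an illustrative sketch, not literal dump output), an
   insn reading (subreg:DI (reg:SI 30) 0) has that operand replaced by
   a fresh (reg:DI 40), preceded by
     (set (reg:DI 40) (zero_extend:DI (reg:SI 30)))
   whereas an insn writing such a subreg is instead followed by
     (set (reg:SI 30) (truncate:SI (reg:DI 40)))  */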
2121
2122 static void
2123 nvptx_reorg_subreg (void)
2124 {
2125 struct reg_replace qiregs, hiregs, siregs, diregs;
2126 rtx_insn *insn, *next;
2127
2128 qiregs.n_allocated = 0;
2129 hiregs.n_allocated = 0;
2130 siregs.n_allocated = 0;
2131 diregs.n_allocated = 0;
2132 qiregs.mode = QImode;
2133 hiregs.mode = HImode;
2134 siregs.mode = SImode;
2135 diregs.mode = DImode;
2136
2137 for (insn = get_insns (); insn; insn = next)
2138 {
2139 next = NEXT_INSN (insn);
2140 if (!NONDEBUG_INSN_P (insn)
2141 || asm_noperands (PATTERN (insn)) >= 0
2142 || GET_CODE (PATTERN (insn)) == USE
2143 || GET_CODE (PATTERN (insn)) == CLOBBER)
2144 continue;
2145
2146 qiregs.n_in_use = 0;
2147 hiregs.n_in_use = 0;
2148 siregs.n_in_use = 0;
2149 diregs.n_in_use = 0;
2150 extract_insn (insn);
2151 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2152
2153 for (int i = 0; i < recog_data.n_operands; i++)
2154 {
2155 rtx op = recog_data.operand[i];
2156 if (GET_CODE (op) != SUBREG)
2157 continue;
2158
2159 rtx inner = SUBREG_REG (op);
2160
2161 machine_mode outer_mode = GET_MODE (op);
2162 machine_mode inner_mode = GET_MODE (inner);
2163 gcc_assert (s_ok);
2164 if (s_ok
2165 && (GET_MODE_PRECISION (inner_mode)
2166 >= GET_MODE_PRECISION (outer_mode)))
2167 continue;
2168 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2169 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2170 : outer_mode == HImode ? &hiregs
2171 : outer_mode == SImode ? &siregs
2172 : &diregs);
2173 rtx new_reg = get_replacement (r);
2174
2175 if (recog_data.operand_type[i] != OP_OUT)
2176 {
2177 enum rtx_code code;
2178 if (GET_MODE_PRECISION (inner_mode)
2179 < GET_MODE_PRECISION (outer_mode))
2180 code = ZERO_EXTEND;
2181 else
2182 code = TRUNCATE;
2183
2184 rtx pat = gen_rtx_SET (new_reg,
2185 gen_rtx_fmt_e (code, outer_mode, inner));
2186 emit_insn_before (pat, insn);
2187 }
2188
2189 if (recog_data.operand_type[i] != OP_IN)
2190 {
2191 enum rtx_code code;
2192 if (GET_MODE_PRECISION (inner_mode)
2193 < GET_MODE_PRECISION (outer_mode))
2194 code = TRUNCATE;
2195 else
2196 code = ZERO_EXTEND;
2197
2198 rtx pat = gen_rtx_SET (inner,
2199 gen_rtx_fmt_e (code, inner_mode, new_reg));
2200 emit_insn_after (pat, insn);
2201 }
2202 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2203 }
2204 }
2205 }
2206
2207 /* Loop structure of the function. The entire function is described as
2208 a NULL loop. */
2209
2210 struct parallel
2211 {
2212 /* Parent parallel. */
2213 parallel *parent;
2214
2215 /* Next sibling parallel. */
2216 parallel *next;
2217
2218 /* First child parallel. */
2219 parallel *inner;
2220
2221 /* Partitioning mask of the parallel. */
2222 unsigned mask;
2223
2224 /* Partitioning used within inner parallels. */
2225 unsigned inner_mask;
2226
2227 /* Location of parallel forked and join. The forked is the first
2228 block in the parallel and the join is the first block after
2229 the partition. */
2230 basic_block forked_block;
2231 basic_block join_block;
2232
2233 rtx_insn *forked_insn;
2234 rtx_insn *join_insn;
2235
2236 rtx_insn *fork_insn;
2237 rtx_insn *joining_insn;
2238
2239 /* Basic blocks in this parallel, but not in child parallels. The
2240 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2241 blocks are not. */
2242 auto_vec<basic_block> blocks;
2243
2244 public:
2245 parallel (parallel *parent, unsigned mode);
2246 ~parallel ();
2247 };
2248
2249 /* Constructor links the new parallel into its parent's chain of
2250 children. */
2251
2252 parallel::parallel (parallel *parent_, unsigned mask_)
2253 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2254 {
2255 forked_block = join_block = 0;
2256 forked_insn = join_insn = 0;
2257 fork_insn = joining_insn = 0;
2258
2259 if (parent)
2260 {
2261 next = parent->inner;
2262 parent->inner = this;
2263 }
2264 }
2265
2266 parallel::~parallel ()
2267 {
2268 delete inner;
2269 delete next;
2270 }
2271
2272 /* Map of basic blocks to insns. */
2273 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2274
2275 /* A tuple of an insn of interest and the BB in which it resides. */
2276 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2277 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2278
2279 /* Split basic blocks so that each forked and join unspec is at
2280 the start of its basic block. Thus afterwards each block will
2281 have a single partitioning mode. We also do the same for return
2282 insns, as they are executed by every thread. Populate MAP with
2283 head and tail blocks. We also clear the BB visited flag, which is
2284 head and tail blocks. We also clear the BB visited flag, which is
2285 used when finding partitions. */
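
/* E.g. (illustrative) a block of the form
     insn_1; ...; nvptx_forked; insn_2; ...
   is split immediately before the forked marker, so that the marker
   starts its own block and each resulting block has a single
   partitioning mode.  */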
2286
2287 static void
2288 nvptx_split_blocks (bb_insn_map_t *map)
2289 {
2290 insn_bb_vec_t worklist;
2291 basic_block block;
2292 rtx_insn *insn;
2293
2294 /* Locate all the reorg instructions of interest. */
2295 FOR_ALL_BB_FN (block, cfun)
2296 {
2297 bool seen_insn = false;
2298
2299 /* Clear visited flag, for use by parallel locator. */
2300 block->flags &= ~BB_VISITED;
2301
2302 FOR_BB_INSNS (block, insn)
2303 {
2304 if (!INSN_P (insn))
2305 continue;
2306 switch (recog_memoized (insn))
2307 {
2308 default:
2309 seen_insn = true;
2310 continue;
2311 case CODE_FOR_nvptx_forked:
2312 case CODE_FOR_nvptx_join:
2313 break;
2314
2315 case CODE_FOR_return:
2316 /* We also need to split just before return insns, as
2317 that insn needs executing by all threads, but the
2318 block it is in probably does not. */
2319 break;
2320 }
2321
2322 if (seen_insn)
2323 /* We've found an instruction that must be at the start of
2324 a block, but isn't. Add it to the worklist. */
2325 worklist.safe_push (insn_bb_t (insn, block));
2326 else
2327 /* It was already the first instruction. Just add it to
2328 the map. */
2329 map->get_or_insert (block) = insn;
2330 seen_insn = true;
2331 }
2332 }
2333
2334 /* Split blocks on the worklist. */
2335 unsigned ix;
2336 insn_bb_t *elt;
2337 basic_block remap = 0;
2338 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2339 {
2340 if (remap != elt->second)
2341 {
2342 block = elt->second;
2343 remap = block;
2344 }
2345
2346 /* Split block before insn. The insn is in the new block. */
2347 edge e = split_block (block, PREV_INSN (elt->first));
2348
2349 block = e->dest;
2350 map->get_or_insert (block) = elt->first;
2351 }
2352 }
2353
2354 /* BLOCK is a basic block containing a head or tail instruction.
2355 Locate the associated prehead or pretail instruction, which must be
2356 in the single predecessor block. */
2357
2358 static rtx_insn *
2359 nvptx_discover_pre (basic_block block, int expected)
2360 {
2361 gcc_assert (block->preds->length () == 1);
2362 basic_block pre_block = (*block->preds)[0]->src;
2363 rtx_insn *pre_insn;
2364
2365 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2366 pre_insn = PREV_INSN (pre_insn))
2367 gcc_assert (pre_insn != BB_HEAD (pre_block));
2368
2369 gcc_assert (recog_memoized (pre_insn) == expected);
2370 return pre_insn;
2371 }
2372
2373 /* Dump this parallel and all its inner parallels. */
2374
2375 static void
2376 nvptx_dump_pars (parallel *par, unsigned depth)
2377 {
2378 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2379 depth, par->mask,
2380 par->forked_block ? par->forked_block->index : -1,
2381 par->join_block ? par->join_block->index : -1);
2382
2383 fprintf (dump_file, " blocks:");
2384
2385 basic_block block;
2386 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2387 fprintf (dump_file, " %d", block->index);
2388 fprintf (dump_file, "\n");
2389 if (par->inner)
2390 nvptx_dump_pars (par->inner, depth + 1);
2391
2392 if (par->next)
2393 nvptx_dump_pars (par->next, depth);
2394 }
2395
2396 /* If BLOCK contains a fork/join marker, process it to create or
2397 terminate a loop structure. Add this block to the current loop,
2398 and then walk successor blocks. */
2399
2400 static parallel *
2401 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2402 {
2403 if (block->flags & BB_VISITED)
2404 return par;
2405 block->flags |= BB_VISITED;
2406
2407 if (rtx_insn **endp = map->get (block))
2408 {
2409 rtx_insn *end = *endp;
2410
2411 /* This is a block head or tail, or return instruction. */
2412 switch (recog_memoized (end))
2413 {
2414 case CODE_FOR_return:
2415 /* Return instructions are in their own block, and we
2416 don't need to do anything more. */
2417 return par;
2418
2419 case CODE_FOR_nvptx_forked:
2420 /* Loop head, create a new inner loop and add it into
2421 our parent's child list. */
2422 {
2423 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2424
2425 gcc_assert (mask);
2426 par = new parallel (par, mask);
2427 par->forked_block = block;
2428 par->forked_insn = end;
2429 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2430 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2431 par->fork_insn
2432 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2433 }
2434 break;
2435
2436 case CODE_FOR_nvptx_join:
2437 /* A loop tail. Finish the current loop and return to
2438 parent. */
2439 {
2440 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2441
2442 gcc_assert (par->mask == mask);
2443 par->join_block = block;
2444 par->join_insn = end;
2445 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2446 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2447 par->joining_insn
2448 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2449 par = par->parent;
2450 }
2451 break;
2452
2453 default:
2454 gcc_unreachable ();
2455 }
2456 }
2457
2458 if (par)
2459 /* Add this block onto the current loop's list of blocks. */
2460 par->blocks.safe_push (block);
2461 else
2462 /* This must be the entry block. Create a NULL parallel. */
2463 par = new parallel (0, 0);
2464
2465 /* Walk successor blocks. */
2466 edge e;
2467 edge_iterator ei;
2468
2469 FOR_EACH_EDGE (e, ei, block->succs)
2470 nvptx_find_par (map, par, e->dest);
2471
2472 return par;
2473 }
2474
2475 /* DFS walk the CFG looking for fork & join markers. Construct
2476 loop structures as we go. MAP is a mapping of basic blocks
2477 to head & tail markers, discovered when splitting blocks. This
2478 speeds up the discovery. We rely on the BB visited flag having
2479 been cleared when splitting blocks. */
2480
2481 static parallel *
2482 nvptx_discover_pars (bb_insn_map_t *map)
2483 {
2484 basic_block block;
2485
2486 /* Mark exit blocks as visited. */
2487 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2488 block->flags |= BB_VISITED;
2489
2490 /* And entry block as not. */
2491 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2492 block->flags &= ~BB_VISITED;
2493
2494 parallel *par = nvptx_find_par (map, 0, block);
2495
2496 if (dump_file)
2497 {
2498 fprintf (dump_file, "\nLoops\n");
2499 nvptx_dump_pars (par, 0);
2500 fprintf (dump_file, "\n");
2501 }
2502
2503 return par;
2504 }
2505
2506 /* Analyse a group of BBs within a partitioned region and create N
2507 Single-Entry-Single-Exit regions. Some of those regions will be
2508 trivial ones consisting of a single BB. The blocks of a
2509 partitioned region might form a set of disjoint graphs -- because
2510 the region encloses a differently partitioned sub-region.
2511
2512 We use the linear time algorithm described in 'Finding Regions Fast:
2513 Single Entry Single Exit and control Regions in Linear Time'
2514 Johnson, Pearson & Pingali. That algorithm deals with complete
2515 CFGs, where a back edge is inserted from END to START, and thus the
2516 problem becomes one of finding equivalent loops.
2517
2518 In this case we have a partial CFG. We complete it by redirecting
2519 any incoming edge to the graph to be from an arbitrary external BB,
2520 and similarly redirecting any outgoing edge to be to that BB.
2521 Thus we end up with a closed graph.
2522
2523 The algorithm works by building a spanning tree of an undirected
2524 graph and keeping track of back edges from nodes further from the
2525 root in the tree to nodes nearer to the root in the tree. In the
2526 description below, the root is up and the tree grows downwards.
2527
2528 We avoid having to deal with degenerate back-edges to the same
2529 block, by splitting each BB into 3 -- one for input edges, one for
2530 the node itself and one for the output edges. Such back edges are
2531 referred to as 'Brackets'. Cycle equivalent nodes will have the
2532 same set of brackets.
2533
2534 Determining bracket equivalency is done by maintaining a list of
2535 brackets in such a manner that the list length and final bracket
2536 uniquely identify the set.
2537
2538 We use coloring to mark all BBs with cycle equivalency with the
2539 same color. This is the output of the 'Finding Regions Fast'
2540 algorithm. Notice it doesn't actually find the set of nodes within
2541 a particular region, just unordered sets of nodes that are the
2542 entries and exits of SESE regions.
2543
2544 After determining cycle equivalency, we need to find the minimal
2545 set of SESE regions. Do this with a DFS coloring walk of the
2546 complete graph. We're either 'looking' or 'coloring'. When
2547 looking, and we're in the subgraph, we start coloring the color of
2548 the current node, and remember that node as the start of the
2549 current color's SESE region. Every time we go to a new node, we
2550 decrement the count of nodes with that color. If it reaches zero,
2551 we remember that node as the end of the current color's SESE region
2552 and return to 'looking'. Otherwise we color the node the current
2553 color.
2554
2555 This way we end up with coloring the inside of non-trivial SESE
2556 regions with the color of that region. */
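
/* A small worked example (our own, not from the paper): in a diamond
   A->B, A->C, B->D, C->D closed by the inserted backedge D->A, nodes
   A and D end up with identical bracket sets and hence one color --
   they are the entry and exit of a single non-trivial SESE region --
   while B and C each form a trivial region of their own.  */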
2557
2558 /* A pair of BBs. We use this to represent SESE regions. */
2559 typedef std::pair<basic_block, basic_block> bb_pair_t;
2560 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2561
2562 /* A node in the undirected CFG. The discriminator SECOND indicates just
2563 above or just below the BB indicated by FIRST. */
2564 typedef std::pair<basic_block, int> pseudo_node_t;
2565
2566 /* A bracket indicates an edge towards the root of the spanning tree of the
2567 undirected graph. Each bracket has a color, determined
2568 from the current set of brackets. */
2569 struct bracket
2570 {
2571 pseudo_node_t back; /* Back target */
2572
2573 /* Current color and size of set. */
2574 unsigned color;
2575 unsigned size;
2576
2577 bracket (pseudo_node_t back_)
2578 : back (back_), color (~0u), size (~0u)
2579 {
2580 }
2581
2582 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2583 {
2584 if (length != size)
2585 {
2586 size = length;
2587 color = color_counts.length ();
2588 color_counts.quick_push (0);
2589 }
2590 color_counts[color]++;
2591 return color;
2592 }
2593 };
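
/* Thus (an assumed example) two nodes whose bracket stacks have the
   same length and the same topmost bracket receive the same color and
   are cycle equivalent; once either the length or the final bracket
   differs, get_color allocates a fresh color.  */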
2594
2595 typedef auto_vec<bracket> bracket_vec_t;
2596
2597 /* Basic block info for finding SESE regions. */
2598
2599 struct bb_sese
2600 {
2601 int node; /* Node number in spanning tree. */
2602 int parent; /* Parent node number. */
2603
2604 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2605 edges arrive at pseudo-node Ai and the outgoing edges leave at
2606 pseudo-node Ao. We have to remember which way we arrived at a
2607 particular node when generating the spanning tree. dir > 0 means
2608 we arrived at Ai, dir < 0 means we arrived at Ao. */
2609 int dir;
2610
2611 /* Lowest numbered pseudo-node reached via a backedge from this
2612 node, or any descendant. */
2613 pseudo_node_t high;
2614
2615 int color; /* Cycle-equivalence color */
2616
2617 /* Stack of brackets for this node. */
2618 bracket_vec_t brackets;
2619
2620 bb_sese (unsigned node_, unsigned p, int dir_)
2621 :node (node_), parent (p), dir (dir_)
2622 {
2623 }
2624 ~bb_sese ();
2625
2626 /* Push a bracket ending at BACK. */
2627 void push (const pseudo_node_t &back)
2628 {
2629 if (dump_file)
2630 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2631 back.first ? back.first->index : 0, back.second);
2632 brackets.safe_push (bracket (back));
2633 }
2634
2635 void append (bb_sese *child);
2636 void remove (const pseudo_node_t &);
2637
2638 /* Set node's color. */
2639 void set_color (auto_vec<unsigned> &color_counts)
2640 {
2641 color = brackets.last ().get_color (color_counts, brackets.length ());
2642 }
2643 };
2644
2645 bb_sese::~bb_sese ()
2646 {
2647 }
2648
2649 /* Destructively append CHILD's brackets. */
2650
2651 void
2652 bb_sese::append (bb_sese *child)
2653 {
2654 if (int len = child->brackets.length ())
2655 {
2656 int ix;
2657
2658 if (dump_file)
2659 {
2660 for (ix = 0; ix < len; ix++)
2661 {
2662 const pseudo_node_t &pseudo = child->brackets[ix].back;
2663 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2664 child->node, pseudo.first ? pseudo.first->index : 0,
2665 pseudo.second);
2666 }
2667 }
2668 if (!brackets.length ())
2669 std::swap (brackets, child->brackets);
2670 else
2671 {
2672 brackets.reserve (len);
2673 for (ix = 0; ix < len; ix++)
2674 brackets.quick_push (child->brackets[ix]);
2675 }
2676 }
2677 }
2678
2679 /* Remove brackets that terminate at PSEUDO. */
2680
2681 void
2682 bb_sese::remove (const pseudo_node_t &pseudo)
2683 {
2684 unsigned removed = 0;
2685 int len = brackets.length ();
2686
2687 for (int ix = 0; ix < len; ix++)
2688 {
2689 if (brackets[ix].back == pseudo)
2690 {
2691 if (dump_file)
2692 fprintf (dump_file, "Removing backedge %d:%+d\n",
2693 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2694 removed++;
2695 }
2696 else if (removed)
2697 brackets[ix-removed] = brackets[ix];
2698 }
2699 while (removed--)
2700 brackets.pop ();
2701 }
2702
2703 /* Accessors for BB's aux pointer. */
2704 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2705 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2706
2707 /* DFS walk creating SESE data structures. Only cover nodes with
2708 BB_VISITED set. Append discovered blocks to LIST. We number in
2709 increments of 3 so that the above and below pseudo nodes can be
2710 implicitly numbered too. */
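
/* E.g. the first block visited is numbered 2, making its incoming
   pseudo node implicitly 1 and its outgoing pseudo node 3; the next
   block is numbered 5, and so on (see the node +/- 1 arithmetic in
   nvptx_sese_pseudo).  */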
2711
2712 static int
2713 nvptx_sese_number (int n, int p, int dir, basic_block b,
2714 auto_vec<basic_block> *list)
2715 {
2716 if (BB_GET_SESE (b))
2717 return n;
2718
2719 if (dump_file)
2720 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2721 b->index, n, p, dir);
2722
2723 BB_SET_SESE (b, new bb_sese (n, p, dir));
2724 p = n;
2725
2726 n += 3;
2727 list->quick_push (b);
2728
2729 /* First walk the nodes on the 'other side' of this node, then walk
2730 the nodes on the same side. */
2731 for (unsigned ix = 2; ix; ix--)
2732 {
2733 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2734 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2735 : offsetof (edge_def, src));
2736 edge e;
2737 edge_iterator ei;
2738
2739 FOR_EACH_EDGE (e, ei, edges)
2740 {
2741 basic_block target = *(basic_block *)((char *)e + offset);
2742
2743 if (target->flags & BB_VISITED)
2744 n = nvptx_sese_number (n, p, dir, target, list);
2745 }
2746 dir = -dir;
2747 }
2748 return n;
2749 }
2750
2751 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2752 EDGES are the outgoing edges and OFFSET is the offset to the src
2753 or dst block on the edges. */
2754
2755 static void
2756 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2757 vec<edge, va_gc> *edges, size_t offset)
2758 {
2759 edge e;
2760 edge_iterator ei;
2761 int hi_back = depth;
2762 pseudo_node_t node_back (0, depth);
2763 int hi_child = depth;
2764 pseudo_node_t node_child (0, depth);
2765 basic_block child = NULL;
2766 unsigned num_children = 0;
2767 int usd = -dir * sese->dir;
2768
2769 if (dump_file)
2770 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2771 me->index, sese->node, dir);
2772
2773 if (dir < 0)
2774 {
2775 /* This is the above pseudo-child. It has the BB itself as an
2776 additional child node. */
2777 node_child = sese->high;
2778 hi_child = node_child.second;
2779 if (node_child.first)
2780 hi_child += BB_GET_SESE (node_child.first)->node;
2781 num_children++;
2782 }
2783
2784 /* Examine each edge.
2785 - if it is a child (a) append its bracket list and (b) record
2786 whether it is the child with the highest reaching bracket.
2787 - if it is an edge to an ancestor, record whether it's the highest
2788 reaching backlink. */
2789 FOR_EACH_EDGE (e, ei, edges)
2790 {
2791 basic_block target = *(basic_block *)((char *)e + offset);
2792
2793 if (bb_sese *t_sese = BB_GET_SESE (target))
2794 {
2795 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2796 {
2797 /* Child node. Append its bracket list. */
2798 num_children++;
2799 sese->append (t_sese);
2800
2801 /* Compare its hi value. */
2802 int t_hi = t_sese->high.second;
2803
2804 if (basic_block child_hi_block = t_sese->high.first)
2805 t_hi += BB_GET_SESE (child_hi_block)->node;
2806
2807 if (hi_child > t_hi)
2808 {
2809 hi_child = t_hi;
2810 node_child = t_sese->high;
2811 child = target;
2812 }
2813 }
2814 else if (t_sese->node < sese->node + dir
2815 && !(dir < 0 && sese->parent == t_sese->node))
2816 {
2817 /* Non-parental ancestor node -- a backlink. */
2818 int d = usd * t_sese->dir;
2819 int back = t_sese->node + d;
2820
2821 if (hi_back > back)
2822 {
2823 hi_back = back;
2824 node_back = pseudo_node_t (target, d);
2825 }
2826 }
2827 }
2828 else
2829 { /* Fallen off graph, backlink to entry node. */
2830 hi_back = 0;
2831 node_back = pseudo_node_t (0, 0);
2832 }
2833 }
2834
2835 /* Remove any brackets that terminate at this pseudo node. */
2836 sese->remove (pseudo_node_t (me, dir));
2837
2838 /* Now push any backlinks from this pseudo node. */
2839 FOR_EACH_EDGE (e, ei, edges)
2840 {
2841 basic_block target = *(basic_block *)((char *)e + offset);
2842 if (bb_sese *t_sese = BB_GET_SESE (target))
2843 {
2844 if (t_sese->node < sese->node + dir
2845 && !(dir < 0 && sese->parent == t_sese->node))
2846 /* Non-parental ancestor node - backedge from me. */
2847 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2848 }
2849 else
2850 {
2851 /* Back edge to entry node. */
2852 sese->push (pseudo_node_t (0, 0));
2853 }
2854 }
2855
2856 /* If this node leads directly or indirectly to a no-return region of
2857 the graph, then fake a backedge to entry node. */
2858 if (!sese->brackets.length () || !edges || !edges->length ())
2859 {
2860 hi_back = 0;
2861 node_back = pseudo_node_t (0, 0);
2862 sese->push (node_back);
2863 }
2864
2865 /* Record the highest reaching backedge from us or a descendant. */
2866 sese->high = hi_back < hi_child ? node_back : node_child;
2867
2868 if (num_children > 1)
2869 {
2870 /* There is more than one child -- this is a Y shaped piece of
2871 spanning tree. We have to insert a fake backedge from this
2872 node to the highest ancestor reached by not-the-highest
2873 reaching child. Note that there may be multiple children
2874 with backedges to the same highest node. That's ok and we
2875 insert the edge to that highest node. */
2876 hi_child = depth;
2877 if (dir < 0 && child)
2878 {
2879 node_child = sese->high;
2880 hi_child = node_child.second;
2881 if (node_child.first)
2882 hi_child += BB_GET_SESE (node_child.first)->node;
2883 }
2884
2885 FOR_EACH_EDGE (e, ei, edges)
2886 {
2887 basic_block target = *(basic_block *)((char *)e + offset);
2888
2889 if (target == child)
2890 /* Ignore the highest child. */
2891 continue;
2892
2893 bb_sese *t_sese = BB_GET_SESE (target);
2894 if (!t_sese)
2895 continue;
2896 if (t_sese->parent != sese->node)
2897 /* Not a child. */
2898 continue;
2899
2900 /* Compare its hi value. */
2901 int t_hi = t_sese->high.second;
2902
2903 if (basic_block child_hi_block = t_sese->high.first)
2904 t_hi += BB_GET_SESE (child_hi_block)->node;
2905
2906 if (hi_child > t_hi)
2907 {
2908 hi_child = t_hi;
2909 node_child = t_sese->high;
2910 }
2911 }
2912
2913 sese->push (node_child);
2914 }
2915 }
2916
2917
2918 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
2919 proceed to successors. Set SESE entry and exit nodes of
2920 REGIONS. */
2921
2922 static void
2923 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2924 basic_block block, int coloring)
2925 {
2926 bb_sese *sese = BB_GET_SESE (block);
2927
2928 if (block->flags & BB_VISITED)
2929 {
2930 /* If we've already encountered this block, either we must not
2931 be coloring, or it must have been colored the current color. */
2932 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
2933 return;
2934 }
2935
2936 block->flags |= BB_VISITED;
2937
2938 if (sese)
2939 {
2940 if (coloring < 0)
2941 {
2942 /* Start coloring a region. */
2943 regions[sese->color].first = block;
2944 coloring = sese->color;
2945 }
2946
2947 if (!--color_counts[sese->color] && sese->color == coloring)
2948 {
2949 /* Found final block of SESE region. */
2950 regions[sese->color].second = block;
2951 coloring = -1;
2952 }
2953 else
2954 /* Color the node, so we can assert on revisiting the node
2955 that the graph is indeed SESE. */
2956 sese->color = coloring;
2957 }
2958 else
2959 /* Fallen off the subgraph, we cannot be coloring. */
2960 gcc_assert (coloring < 0);
2961
2962 /* Walk each successor block. */
2963 if (block->succs && block->succs->length ())
2964 {
2965 edge e;
2966 edge_iterator ei;
2967
2968 FOR_EACH_EDGE (e, ei, block->succs)
2969 nvptx_sese_color (color_counts, regions, e->dest, coloring);
2970 }
2971 else
2972 gcc_assert (coloring < 0);
2973 }
2974
2975 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
2976 end up with NULL entries in it. */
2977
2978 static void
2979 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
2980 {
2981 basic_block block;
2982 int ix;
2983
2984 /* First clear each BB of the whole function. */
2985 FOR_EACH_BB_FN (block, cfun)
2986 {
2987 block->flags &= ~BB_VISITED;
2988 BB_SET_SESE (block, 0);
2989 }
2990 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2991 block->flags &= ~BB_VISITED;
2992 BB_SET_SESE (block, 0);
2993 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2994 block->flags &= ~BB_VISITED;
2995 BB_SET_SESE (block, 0);
2996
2997 /* Mark blocks in the function that are in this graph. */
2998 for (ix = 0; blocks.iterate (ix, &block); ix++)
2999 block->flags |= BB_VISITED;
3000
3001 /* Counts of nodes assigned to each color. There cannot be more
3002 colors than blocks (and hopefully there will be fewer). */
3003 auto_vec<unsigned> color_counts;
3004 color_counts.reserve (blocks.length ());
3005
3006 /* Worklist of nodes in the spanning tree. Again, there cannot be
3007 more nodes in the tree than blocks (there will be fewer if the
3008 CFG of blocks is disjoint). */
3009 auto_vec<basic_block> spanlist;
3010 spanlist.reserve (blocks.length ());
3011
3012 /* Make sure every block has its cycle class determined. */
3013 for (ix = 0; blocks.iterate (ix, &block); ix++)
3014 {
3015 if (BB_GET_SESE (block))
3016 /* We already met this block in an earlier graph solve. */
3017 continue;
3018
3019 if (dump_file)
3020 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3021
3022 /* Number the nodes reachable from BLOCK in initial DFS order. */
3023 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3024
3025 /* Now walk in reverse DFS order to find cycle equivalents. */
3026 while (spanlist.length ())
3027 {
3028 block = spanlist.pop ();
3029 bb_sese *sese = BB_GET_SESE (block);
3030
3031 /* Do the pseudo node below. */
3032 nvptx_sese_pseudo (block, sese, depth, +1,
3033 sese->dir > 0 ? block->succs : block->preds,
3034 (sese->dir > 0 ? offsetof (edge_def, dest)
3035 : offsetof (edge_def, src)));
3036 sese->set_color (color_counts);
3037 /* Do the pseudo node above. */
3038 nvptx_sese_pseudo (block, sese, depth, -1,
3039 sese->dir < 0 ? block->succs : block->preds,
3040 (sese->dir < 0 ? offsetof (edge_def, dest)
3041 : offsetof (edge_def, src)));
3042 }
3043 if (dump_file)
3044 fprintf (dump_file, "\n");
3045 }
3046
3047 if (dump_file)
3048 {
3049 unsigned count;
3050 const char *comma = "";
3051
3052 fprintf (dump_file, "Found %d cycle equivalents\n",
3053 color_counts.length ());
3054 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3055 {
3056 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3057
3058 comma = "";
3059 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3060 if (BB_GET_SESE (block)->color == ix)
3061 {
3062 block->flags |= BB_VISITED;
3063 fprintf (dump_file, "%s%d", comma, block->index);
3064 comma = ",";
3065 }
3066 fprintf (dump_file, "}");
3067 comma = ", ";
3068 }
3069 fprintf (dump_file, "\n");
3070 }
3071
3072 /* Now we've colored every block in the subgraph. We now need to
3073 determine the minimal set of SESE regions that cover that
3074 subgraph. Do this with a DFS walk of the complete function.
3075 During the walk we're either 'looking' or 'coloring'. When we
3076 reach the last node of a particular color, we stop coloring and
3077 return to looking. */
3078
3079 /* There cannot be more SESE regions than colors. */
3080 regions.reserve (color_counts.length ());
3081 for (ix = color_counts.length (); ix--;)
3082 regions.quick_push (bb_pair_t (0, 0));
3083
3084 for (ix = 0; blocks.iterate (ix, &block); ix++)
3085 block->flags &= ~BB_VISITED;
3086
3087 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3088
3089 if (dump_file)
3090 {
3091 const char *comma = "";
3092 int len = regions.length ();
3093
3094 fprintf (dump_file, "SESE regions:");
3095 for (ix = 0; ix != len; ix++)
3096 {
3097 basic_block from = regions[ix].first;
3098 basic_block to = regions[ix].second;
3099
3100 if (from)
3101 {
3102 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3103 if (to != from)
3104 fprintf (dump_file, "->%d", to->index);
3105
3106 int color = BB_GET_SESE (from)->color;
3107
3108 /* Print the blocks within the region (excluding ends). */
3109 FOR_EACH_BB_FN (block, cfun)
3110 {
3111 bb_sese *sese = BB_GET_SESE (block);
3112
3113 if (sese && sese->color == color
3114 && block != from && block != to)
3115 fprintf (dump_file, ".%d", block->index);
3116 }
3117 fprintf (dump_file, "}");
3118 }
3119 comma = ",";
3120 }
3121 fprintf (dump_file, "\n\n");
3122 }
3123
3124 for (ix = 0; blocks.iterate (ix, &block); ix++)
3125 delete BB_GET_SESE (block);
3126 }
3127
3128 #undef BB_SET_SESE
3129 #undef BB_GET_SESE
3130
3131 /* Propagate live state at the start of a partitioned region. BLOCK
3132 provides the live register information, and might not contain
3133 INSN. Propagation is inserted just after INSN. RW indicates whether
3134 we are reading and/or writing state. This
3135 separation is needed for worker-level propagation where we
3136 essentially do a spill & fill. FN is the underlying worker
3137 function to generate the propagation instructions for a single
3138 register. DATA is user data.
3139
3140 We propagate the live register set and the entire frame. We could
3141 do better by (a) propagating just the live set that is used within
3142 the partitioned regions and (b) only propagating stack entries that
3143 are used. The latter might be quite hard to determine. */
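
/* Schematically, the emitted frame-copying loop looks like this
   (an illustrative sketch, assuming FS > 1 doublewords):

     ptr = frame;  idx = fs;
   loop:
     idx -= 1;
     tmp = *ptr;      // if RW includes PM_read
     fn (tmp, ...);   // shuffle or shared-memory transfer
     *ptr = tmp;      // if RW includes PM_write
     pred = idx != 0;  ptr += 8;
     if (pred) goto loop;  */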
3144
3145 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3146
3147 static void
3148 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3149 propagator_fn fn, void *data)
3150 {
3151 bitmap live = DF_LIVE_IN (block);
3152 bitmap_iterator iterator;
3153 unsigned ix;
3154
3155 /* Copy the frame array. */
3156 HOST_WIDE_INT fs = get_frame_size ();
3157 if (fs)
3158 {
3159 rtx tmp = gen_reg_rtx (DImode);
3160 rtx idx = NULL_RTX;
3161 rtx ptr = gen_reg_rtx (Pmode);
3162 rtx pred = NULL_RTX;
3163 rtx_code_label *label = NULL;
3164
3165 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3166 fs /= GET_MODE_SIZE (DImode);
3167 /* Detect single iteration loop. */
3168 if (fs == 1)
3169 fs = 0;
3170
3171 start_sequence ();
3172 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3173 if (fs)
3174 {
3175 idx = gen_reg_rtx (SImode);
3176 pred = gen_reg_rtx (BImode);
3177 label = gen_label_rtx ();
3178
3179 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3180 /* Allow worker function to initialize anything needed. */
3181 rtx init = fn (tmp, PM_loop_begin, fs, data);
3182 if (init)
3183 emit_insn (init);
3184 emit_label (label);
3185 LABEL_NUSES (label)++;
3186 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3187 }
3188 if (rw & PM_read)
3189 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3190 emit_insn (fn (tmp, rw, fs, data));
3191 if (rw & PM_write)
3192 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3193 if (fs)
3194 {
3195 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3196 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3197 emit_insn (gen_br_true_uni (pred, label));
3198 rtx fini = fn (tmp, PM_loop_end, fs, data);
3199 if (fini)
3200 emit_insn (fini);
3201 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3202 }
3203 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3204 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3205 rtx cpy = get_insns ();
3206 end_sequence ();
3207 insn = emit_insn_after (cpy, insn);
3208 }
3209
3210 /* Copy live registers. */
3211 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3212 {
3213 rtx reg = regno_reg_rtx[ix];
3214
3215 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3216 {
3217 rtx bcast = fn (reg, rw, 0, data);
3218
3219 insn = emit_insn_after (bcast, insn);
3220 }
3221 }
3222 }
3223
3224 /* Worker for nvptx_vpropagate. */
3225
3226 static rtx
3227 vprop_gen (rtx reg, propagate_mask pm,
3228 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3229 {
3230 if (!(pm & PM_read_write))
3231 return 0;
3232
3233 return nvptx_gen_vcast (reg);
3234 }
3235
3236 /* Propagate state that is live at start of BLOCK across the vectors
3237 of a single warp. Propagation is inserted just after INSN. */
3238
3239 static void
3240 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3241 {
3242 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3243 }
3244
3245 /* Worker for nvptx_wpropagate. */
3246
3247 static rtx
3248 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3249 {
3250 wcast_data_t *data = (wcast_data_t *)data_;
3251
3252 if (pm & PM_loop_begin)
3253 {
3254 /* Starting a loop, initialize pointer. */
3255 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3256
3257 if (align > worker_bcast_align)
3258 worker_bcast_align = align;
3259 data->offset = (data->offset + align - 1) & ~(align - 1);
3260
3261 data->ptr = gen_reg_rtx (Pmode);
3262
3263 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3264 }
3265 else if (pm & PM_loop_end)
3266 {
3267 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3268 data->ptr = NULL_RTX;
3269 return clobber;
3270 }
3271 else
3272 return nvptx_gen_wcast (reg, pm, rep, data);
3273 }
3274
3275 /* Spill or fill the state that is live at the start of BLOCK. PRE_P
3276 indicates if this is just before partitioned mode (do spill), or
3277 just after it starts (do fill). Sequence is inserted just after
3278 INSN. */
3279
3280 static void
3281 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3282 {
3283 wcast_data_t data;
3284
3285 data.base = gen_reg_rtx (Pmode);
3286 data.offset = 0;
3287 data.ptr = NULL_RTX;
3288
3289 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3290 if (data.offset)
3291 {
3292 /* Stuff was emitted, initialize the base pointer now. */
3293 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3294 emit_insn_after (init, insn);
3295
3296 if (worker_bcast_size < data.offset)
3297 worker_bcast_size = data.offset;
3298 }
3299 }
3300
3301 /* Emit a worker-level synchronization barrier. We use different
3302 markers for before and after synchronizations. */
3303
3304 static rtx
3305 nvptx_wsync (bool after)
3306 {
3307 return gen_nvptx_barsync (GEN_INT (after));
3308 }
3309
3310 /* Single neutering according to MASK. FROM is the incoming block and
3311 TO is the outgoing block. These may be the same block. Insert at
3312 start of FROM:
3313
3314 if (tid.<axis>) goto end.
3315
3316 and insert before ending branch of TO (if there is such an insn):
3317
3318 end:
3319 <possibly-broadcast-cond>
3320 <branch>
3321
3322 We currently only use different FROM and TO when skipping an entire
3323 loop. We could do more if we detected superblocks. */
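
/* Schematically the result is (illustrative, not literal PTX):

     @ %worker-pred  bra.uni  $w_end;  // worker neutering, if requested
     @ %vector-pred  bra      $v_end;  // vector neutering, if requested
     ... code now executed by a single thread ...
   $v_end:
   $w_end:
     <possibly-broadcast-cond>
     <branch>  */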
3324
3325 static void
3326 nvptx_single (unsigned mask, basic_block from, basic_block to)
3327 {
3328 rtx_insn *head = BB_HEAD (from);
3329 rtx_insn *tail = BB_END (to);
3330 unsigned skip_mask = mask;
3331
3332 /* Find first insn of FROM block. */
3333 while (head != BB_END (from) && !INSN_P (head))
3334 head = NEXT_INSN (head);
3335
3336 /* Find last insn of TO block. */
3337 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3338 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3339 tail = PREV_INSN (tail);
3340
3341 /* Detect if tail is a branch. */
3342 rtx tail_branch = NULL_RTX;
3343 rtx cond_branch = NULL_RTX;
3344 if (tail && INSN_P (tail))
3345 {
3346 tail_branch = PATTERN (tail);
3347 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3348 tail_branch = NULL_RTX;
3349 else
3350 {
3351 cond_branch = SET_SRC (tail_branch);
3352 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3353 cond_branch = NULL_RTX;
3354 }
3355 }
3356
3357 if (tail == head)
3358 {
3359 /* If this is empty, do nothing. */
3360 if (!head || !INSN_P (head))
3361 return;
3362
3363 /* If this is a dummy insn, do nothing. */
3364 switch (recog_memoized (head))
3365 {
3366 default:
3367 break;
3368 case CODE_FOR_nvptx_fork:
3369 case CODE_FOR_nvptx_forked:
3370 case CODE_FOR_nvptx_joining:
3371 case CODE_FOR_nvptx_join:
3372 return;
3373 }
3374
3375 if (cond_branch)
3376 {
3377 /* If we're only doing vector single, there's no need to
3378 emit skip code because we'll not insert anything. */
3379 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3380 skip_mask = 0;
3381 }
3382 else if (tail_branch)
3383 /* Block with only unconditional branch. Nothing to do. */
3384 return;
3385 }
3386
3387 /* Insert the vector test inside the worker test. */
3388 unsigned mode;
3389 rtx_insn *before = tail;
3390 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3391 if (GOMP_DIM_MASK (mode) & skip_mask)
3392 {
3393 rtx_code_label *label = gen_label_rtx ();
3394 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3395
3396 if (!pred)
3397 {
3398 pred = gen_reg_rtx (BImode);
3399 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3400 }
3401
3402 rtx br;
3403 if (mode == GOMP_DIM_VECTOR)
3404 br = gen_br_true (pred, label);
3405 else
3406 br = gen_br_true_uni (pred, label);
3407 emit_insn_before (br, head);
3408
3409 LABEL_NUSES (label)++;
3410 if (tail_branch)
3411 before = emit_label_before (label, before);
3412 else
3413 emit_label_after (label, tail);
3414 }
3415
3416 /* Now deal with propagating the branch condition. */
3417 if (cond_branch)
3418 {
3419 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3420
3421 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3422 {
3423 /* Vector mode only, do a shuffle. */
3424 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3425 }
3426 else
3427 {
3428 /* Includes worker mode, do spill & fill. By construction
3429 we should never have worker mode only. */
3430 wcast_data_t data;
3431
3432 data.base = worker_bcast_sym;
3433 data.ptr = 0;
3434
3435 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3436 worker_bcast_size = GET_MODE_SIZE (SImode);
3437
3438 data.offset = 0;
3439 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3440 before);
3441 /* Barrier so other workers can see the write. */
3442 emit_insn_before (nvptx_wsync (false), tail);
3443 data.offset = 0;
3444 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3445 /* This barrier is needed to avoid worker zero clobbering
3446 the broadcast buffer before all the other workers have
3447 had a chance to read this instance of it. */
3448 emit_insn_before (nvptx_wsync (true), tail);
3449 }
3450
3451 extract_insn (tail);
3452 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3453 UNSPEC_BR_UNIFIED);
3454 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3455 }
3456 }
3457
3458 /* PAR is a parallel that is being skipped in its entirety according to
3459 MASK. Treat this as skipping a superblock starting at forked
3460 and ending at joining. */
3461
3462 static void
3463 nvptx_skip_par (unsigned mask, parallel *par)
3464 {
3465 basic_block tail = par->join_block;
3466 gcc_assert (tail->preds->length () == 1);
3467
3468 basic_block pre_tail = (*tail->preds)[0]->src;
3469 gcc_assert (pre_tail->succs->length () == 1);
3470
3471 nvptx_single (mask, par->forked_block, pre_tail);
3472 }
3473
3474 /* If PAR has a single inner parallel and PAR itself only contains
3475 empty entry and exit blocks, swallow the inner PAR. */
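
/* E.g. (illustrative) a worker par whose only two blocks are the head
   and tail of a single inner vector par becomes one worker+vector
   par, so later propagation and neutering treat it as a single
   partitioned region.  */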
3476
3477 static void
3478 nvptx_optimize_inner (parallel *par)
3479 {
3480 parallel *inner = par->inner;
3481
3482 /* We mustn't be the outer dummy par. */
3483 if (!par->mask)
3484 return;
3485
3486 /* We must have a single inner par. */
3487 if (!inner || inner->next)
3488 return;
3489
3490 /* We must only contain 2 blocks ourselves -- the head and tail of
3491 the inner par. */
3492 if (par->blocks.length () != 2)
3493 return;
3494
3495 /* The two partitionings must be disjoint. As we only have vector and
3496 worker partitioning, this is sufficient to guarantee the pars
3497 have adjacent partitioning. */
3498 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3499 /* This indicates malformed code generation. */
3500 return;
3501
3502 /* The outer forked insn should be immediately followed by the inner
3503 fork insn. */
3504 rtx_insn *forked = par->forked_insn;
3505 rtx_insn *fork = BB_END (par->forked_block);
3506
3507 if (NEXT_INSN (forked) != fork)
3508 return;
3509 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3510
3511 /* The outer joining insn must immediately follow the inner join
3512 insn. */
3513 rtx_insn *joining = par->joining_insn;
3514 rtx_insn *join = inner->join_insn;
3515 if (NEXT_INSN (join) != joining)
3516 return;
3517
3518 /* Preconditions met. Swallow the inner par. */
3519 if (dump_file)
3520 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3521 inner->mask, inner->forked_block->index,
3522 inner->join_block->index,
3523 par->mask, par->forked_block->index, par->join_block->index);
3524
3525 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3526
3527 par->blocks.reserve (inner->blocks.length ());
3528 while (inner->blocks.length ())
3529 par->blocks.quick_push (inner->blocks.pop ());
3530
3531 par->inner = inner->inner;
3532 inner->inner = NULL;
3533
3534 delete inner;
3535 }
3536
3537 /* Process the parallel PAR and all its contained
3538 parallels. We do everything but the neutering. Return mask of
3539 partitioned modes used within this parallel. */
3540
3541 static unsigned
3542 nvptx_process_pars (parallel *par)
3543 {
3544 if (nvptx_optimize)
3545 nvptx_optimize_inner (par);
3546
3547 unsigned inner_mask = par->mask;
3548
3549 /* Do the inner parallels first. */
3550 if (par->inner)
3551 {
3552 par->inner_mask = nvptx_process_pars (par->inner);
3553 inner_mask |= par->inner_mask;
3554 }
3555
3556 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3557 /* No propagation needed for a call. */;
3558 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3559 {
3560 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3561 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3562 /* Insert begin and end synchronizations. */
3563 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3564 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3565 }
3566 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3567 nvptx_vpropagate (par->forked_block, par->forked_insn);
3568
3569 /* Now do siblings. */
3570 if (par->next)
3571 inner_mask |= nvptx_process_pars (par->next);
3572 return inner_mask;
3573 }
3574
3575 /* Neuter the parallel described by PAR. We recurse in depth-first
3576 order. MODES are the partitioning of the execution and OUTER is
3577 the partitioning of the parallels we are contained in. */
3578
3579 static void
3580 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3581 {
3582 unsigned me = (par->mask
3583 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3584 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3585 unsigned skip_mask = 0, neuter_mask = 0;
3586
3587 if (par->inner)
3588 nvptx_neuter_pars (par->inner, modes, outer | me);
3589
3590 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3591 {
3592 if ((outer | me) & GOMP_DIM_MASK (mode))
3593 {} /* Mode is partitioned: no neutering. */
3594 else if (!(modes & GOMP_DIM_MASK (mode)))
3595 {} /* Mode is not used: nothing to do. */
3596 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3597 || !par->forked_insn)
3598 /* Partitioned in inner parallels, or we're not partitioned
3599 at all: neuter individual blocks. */
3600 neuter_mask |= GOMP_DIM_MASK (mode);
3601 else if (!par->parent || !par->parent->forked_insn
3602 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3603 /* Parent isn't a parallel, or already contains this partitioning:
3604 skip the parallel at this level. */
3605 skip_mask |= GOMP_DIM_MASK (mode);
3606 else
3607 {} /* Parent will skip this parallel itself. */
3608 }
3609
3610 if (neuter_mask)
3611 {
3612 int ix, len;
3613
3614 if (nvptx_optimize)
3615 {
3616 /* Neuter whole SESE regions. */
3617 bb_pair_vec_t regions;
3618
3619 nvptx_find_sese (par->blocks, regions);
3620 len = regions.length ();
3621 for (ix = 0; ix != len; ix++)
3622 {
3623 basic_block from = regions[ix].first;
3624 basic_block to = regions[ix].second;
3625
3626 if (from)
3627 nvptx_single (neuter_mask, from, to);
3628 else
3629 gcc_assert (!to);
3630 }
3631 }
3632 else
3633 {
3634 /* Neuter each BB individually. */
3635 len = par->blocks.length ();
3636 for (ix = 0; ix != len; ix++)
3637 {
3638 basic_block block = par->blocks[ix];
3639
3640 nvptx_single (neuter_mask, block, block);
3641 }
3642 }
3643 }
3644
3645 if (skip_mask)
3646 nvptx_skip_par (skip_mask, par);
3647
3648 if (par->next)
3649 nvptx_neuter_pars (par->next, modes, outer);
3650 }
3651
3652 /* PTX-specific reorganization
3653 - Split blocks at fork and join instructions
3654 - Compute live registers
3655 - Mark now-unused registers, so function begin doesn't declare
3656 unused registers.
3657 - Insert state propagation when entering partitioned mode
3658 - Insert neutering instructions when in single mode
3659 - Replace subregs with suitable sequences.
3660 */
3661
3662 static void
3663 nvptx_reorg (void)
3664 {
3665 /* We are freeing block_for_insn in the toplev to keep compatibility
3666 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3667 compute_bb_for_insn ();
3668
3669 thread_prologue_and_epilogue_insns ();
3670
3671 /* Split blocks and record interesting unspecs. */
3672 bb_insn_map_t bb_insn_map;
3673
3674 nvptx_split_blocks (&bb_insn_map);
3675
3676 /* Compute live regs */
3677 df_clear_flags (DF_LR_RUN_DCE);
3678 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3679 df_live_add_problem ();
3680 df_live_set_all_dirty ();
3681 df_analyze ();
3682 regstat_init_n_sets_and_refs ();
3683
3684 if (dump_file)
3685 df_dump (dump_file);
3686
3687 /* Mark unused regs as unused. */
3688 int max_regs = max_reg_num ();
3689 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3690 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3691 regno_reg_rtx[i] = const0_rtx;
3692
3693 /* Determine launch dimensions of the function. If it is not an
3694 offloaded function (i.e. this is a regular compiler), the
3695 function has no neutering. */
3696 tree attr = get_oacc_fn_attrib (current_function_decl);
3697 if (attr)
3698 {
3699 /* If we determined this mask before RTL expansion, we could
3700 elide emission of some levels of forks and joins. */
3701 unsigned mask = 0;
3702 tree dims = TREE_VALUE (attr);
3703 unsigned ix;
3704
3705 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3706 {
3707 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3708 tree allowed = TREE_PURPOSE (dims);
3709
3710 if (size != 1 && !(allowed && integer_zerop (allowed)))
3711 mask |= GOMP_DIM_MASK (ix);
3712 }
3713 /* If there is worker neutering, there must be vector
3714 neutering. Otherwise the hardware will fail. */
3715 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3716 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3717
3718 /* Discover & process partitioned regions. */
3719 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3720 nvptx_process_pars (pars);
3721 nvptx_neuter_pars (pars, mask, 0);
3722 delete pars;
3723 }
3724
3725 /* Replace subregs. */
3726 nvptx_reorg_subreg ();
3727
3728 regstat_free_n_sets_and_refs ();
3729
3730 df_finish_pass (true);
3731 }
3732 \f
3733 /* Handle a "kernel" attribute; arguments as in
3734 struct attribute_spec.handler. */
3735
3736 static tree
3737 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3738 int ARG_UNUSED (flags), bool *no_add_attrs)
3739 {
3740 tree decl = *node;
3741
3742 if (TREE_CODE (decl) != FUNCTION_DECL)
3743 {
3744 error ("%qE attribute only applies to functions", name);
3745 *no_add_attrs = true;
3746 }
3747
3748 else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
3749 {
3750 error ("%qE attribute requires a void return type", name);
3751 *no_add_attrs = true;
3752 }
3753
3754 return NULL_TREE;
3755 }
3756
3757 /* Table of valid machine attributes. */
3758 static const struct attribute_spec nvptx_attribute_table[] =
3759 {
3760 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3761 affects_type_identity } */
3762 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3763 { NULL, 0, 0, false, false, false, NULL, false }
3764 };
3765 \f
3766 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3767
3768 static HOST_WIDE_INT
3769 nvptx_vector_alignment (const_tree type)
3770 {
3771 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3772
3773 return MIN (align, BIGGEST_ALIGNMENT);
3774 }
3775
3776 /* Indicate that INSN cannot be duplicated. */
3777
3778 static bool
3779 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3780 {
3781 switch (recog_memoized (insn))
3782 {
3783 case CODE_FOR_nvptx_shufflesi:
3784 case CODE_FOR_nvptx_shufflesf:
3785 case CODE_FOR_nvptx_barsync:
3786 case CODE_FOR_nvptx_fork:
3787 case CODE_FOR_nvptx_forked:
3788 case CODE_FOR_nvptx_joining:
3789 case CODE_FOR_nvptx_join:
3790 return true;
3791 default:
3792 return false;
3793 }
3794 }
3795
3796 /* Section anchors do not work. Initialization for flag_section_anchor
3797 probes the existence of the anchoring target hooks and prevents
3798 anchoring if they don't exist. However, we may be being used with
3799 a host-side compiler that does support anchoring, and hence see
3800 the anchor flag set (as it's not recalculated). So provide an
3801 implementation denying anchoring. */
3802
3803 static bool
3804 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3805 {
3806 return false;
3807 }
3808 \f
3809 /* Record a symbol for mkoffload to enter into the mapping table. */
3810
3811 static void
3812 nvptx_record_offload_symbol (tree decl)
3813 {
3814 switch (TREE_CODE (decl))
3815 {
3816 case VAR_DECL:
3817 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3818 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3819 break;
3820
3821 case FUNCTION_DECL:
3822 {
3823 tree attr = get_oacc_fn_attrib (decl);
3824 tree dims = TREE_VALUE (attr);
3825 unsigned ix;
3826
3827 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3828 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3829
3830 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3831 {
3832 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3833
3834 gcc_assert (!TREE_PURPOSE (dims));
3835 fprintf (asm_out_file, ", %#x", size);
3836 }
3837
3838 fprintf (asm_out_file, "\n");
3839 }
3840 break;
3841
3842 default:
3843 gcc_unreachable ();
3844 }
3845 }
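
/* For example (a hypothetical decl), an offloaded function "foo" with
   launch dimensions (1, 8, 32) emits
     //:FUNC_MAP "foo", 0x1, 0x8, 0x20
   which mkoffload parses into its mapping table.  */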
3846
3847 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3848 at the start of a file. */
3849
3850 static void
3851 nvptx_file_start (void)
3852 {
3853 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3854 fputs ("\t.version\t3.1\n", asm_out_file);
3855 fputs ("\t.target\tsm_30\n", asm_out_file);
3856 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3857 fputs ("// END PREAMBLE\n", asm_out_file);
3858 }
3859
3860 /* Write out the function declarations we've collected and declare storage
3861 for the broadcast buffer. */
3862
3863 static void
3864 nvptx_file_end (void)
3865 {
3866 hash_table<tree_hasher>::iterator iter;
3867 tree decl;
3868 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3869 nvptx_record_fndecl (decl);
3870 fputs (func_decls.str().c_str(), asm_out_file);
3871
3872 if (worker_bcast_size)
3873 {
3874 /* Define the broadcast buffer. */
3875
3876 worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
3877 & ~(worker_bcast_align - 1);
3878
3879 fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_bcast_name);
3880 fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
3881 worker_bcast_align,
3882 worker_bcast_name, worker_bcast_size);
3883 }
3884
3885 if (worker_red_size)
3886 {
3887 /* Define the reduction buffer. */
3888
3889 worker_red_size = ((worker_red_size + worker_red_align - 1)
3890 & ~(worker_red_align - 1));
3891
3892 fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_red_name);
3893 fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
3894 worker_red_align,
3895 worker_red_name, worker_red_size);
3896 }
3897 }
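
/* E.g. (illustrative) a TU that needed a 64-byte broadcast buffer
   aligned to 8 bytes ends with a definition of the form
     .shared .align 8 .u8 <worker_bcast_name>[64];  */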
3898
3899 /* Expander for the shuffle builtins. */
3900
3901 static rtx
3902 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3903 {
3904 if (ignore)
3905 return target;
3906
3907 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3908 NULL_RTX, mode, EXPAND_NORMAL);
3909 if (!REG_P (src))
3910 src = copy_to_mode_reg (mode, src);
3911
3912 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3913 NULL_RTX, SImode, EXPAND_NORMAL);
3914 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3915 NULL_RTX, SImode, EXPAND_NORMAL);
3916
3917 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
3918 idx = copy_to_mode_reg (SImode, idx);
3919
3920 rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
3921 if (pat)
3922 emit_insn (pat);
3923
3924 return target;
3925 }
3926
3927 /* Worker reduction address expander. */
3928
3929 static rtx
3930 nvptx_expand_worker_addr (tree exp, rtx target,
3931 machine_mode ARG_UNUSED (mode), int ignore)
3932 {
3933 if (ignore)
3934 return target;
3935
3936 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
3937 if (align > worker_red_align)
3938 worker_red_align = align;
3939
3940 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
3941 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
3942 if (size + offset > worker_red_size)
3943 worker_red_size = size + offset;
3944
3945 emit_insn (gen_rtx_SET (target, worker_red_sym));
3946
3947 if (offset)
3948 emit_insn (gen_rtx_SET (target,
3949 gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));
3950
3951 emit_insn (gen_rtx_SET (target,
3952 gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
3953 UNSPEC_FROM_SHARED)));
3954
3955 return target;
3956 }
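/* A hedged sketch of the bookkeeping above: expanding
   __builtin_nvptx_worker_addr (8, 4, 4) grows worker_red_size to at
   least 12 and worker_red_align to at least 4, and emits roughly

     target = <worker-reduction symbol>;
     target = target + 8;
     target = unspec FROM_SHARED (target);   // generic-address view

   so TARGET ends up holding a generic-space pointer into the shared
   reduction buffer.  */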
3957
3958 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
3959 not require taking the address of any object, other than the memory
3960 cell being operated on. */
3961
3962 static rtx
3963 nvptx_expand_cmp_swap (tree exp, rtx target,
3964 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
3965 {
3966 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3967
3968 if (!target)
3969 target = gen_reg_rtx (mode);
3970
3971 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
3972 NULL_RTX, Pmode, EXPAND_NORMAL);
3973 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3974 NULL_RTX, mode, EXPAND_NORMAL);
3975 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3976 NULL_RTX, mode, EXPAND_NORMAL);
3977 rtx pat;
3978
3979 mem = gen_rtx_MEM (mode, mem);
3980 if (!REG_P (cmp))
3981 cmp = copy_to_mode_reg (mode, cmp);
3982 if (!REG_P (src))
3983 src = copy_to_mode_reg (mode, src);
3984
3985 if (mode == SImode)
3986 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
3987 else
3988 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
3989
3990 emit_insn (pat);
3991
3992 return target;
3993 }
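/* The SImode path above typically assembles to a PTX instruction of
   the shape (a sketch, not the literal insn template):

     atom.cas.b32 %old, [%ptr], %cmp, %src;

   %old receives the previous contents of the cell, which is exactly
   what the builtin returns; the store of %src happens only when the
   old value equalled %cmp.  */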
3994
3995
3996 /* Codes for all the NVPTX builtins. */
3997 enum nvptx_builtins
3998 {
3999 NVPTX_BUILTIN_SHUFFLE,
4000 NVPTX_BUILTIN_SHUFFLELL,
4001 NVPTX_BUILTIN_WORKER_ADDR,
4002 NVPTX_BUILTIN_CMP_SWAP,
4003 NVPTX_BUILTIN_CMP_SWAPLL,
4004 NVPTX_BUILTIN_MAX
4005 };
4006
4007 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4008
4009 /* Return the NVPTX builtin for CODE. */
4010
4011 static tree
4012 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4013 {
4014 if (code >= NVPTX_BUILTIN_MAX)
4015 return error_mark_node;
4016
4017 return nvptx_builtin_decls[code];
4018 }
4019
4020 /* Set up all builtin functions for this target. */
4021
4022 static void
4023 nvptx_init_builtins (void)
4024 {
4025 #define DEF(ID, NAME, T) \
4026 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4027 = add_builtin_function ("__builtin_nvptx_" NAME, \
4028 build_function_type_list T, \
4029 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4030 #define ST sizetype
4031 #define UINT unsigned_type_node
4032 #define LLUINT long_long_unsigned_type_node
4033 #define PTRVOID ptr_type_node
4034
4035 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4036 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4037 DEF (WORKER_ADDR, "worker_addr",
4038 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4039 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4040 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4041
4042 #undef DEF
4043 #undef ST
4044 #undef UINT
4045 #undef LLUINT
4046 #undef PTRVOID
4047 }
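/* A hedged usage sketch of these builtins (names and values are
   illustrative; user code never calls them directly, the expanders
   below emit them):

     unsigned v = ...;
     unsigned w = __builtin_nvptx_shuffle (v, 16, 1);   // 1 == SHUFFLE_DOWN
     void *slot = __builtin_nvptx_worker_addr (0, sizeof (v),
						__alignof__ (v));
     unsigned old = __builtin_nvptx_cmp_swap (slot, 0, 1);
*/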
4048
4049 /* Expand an expression EXP that calls a built-in function,
4050 with result going to TARGET if that's convenient
4051 (and in mode MODE if that's convenient).
4052 SUBTARGET may be used as the target for computing one of EXP's operands.
4053 IGNORE is nonzero if the value is to be ignored. */
4054
4055 static rtx
4056 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4057 machine_mode mode, int ignore)
4058 {
4059 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4060 switch (DECL_FUNCTION_CODE (fndecl))
4061 {
4062 case NVPTX_BUILTIN_SHUFFLE:
4063 case NVPTX_BUILTIN_SHUFFLELL:
4064 return nvptx_expand_shuffle (exp, target, mode, ignore);
4065
4066 case NVPTX_BUILTIN_WORKER_ADDR:
4067 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4068
4069 case NVPTX_BUILTIN_CMP_SWAP:
4070 case NVPTX_BUILTIN_CMP_SWAPLL:
4071 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4072
4073 default: gcc_unreachable ();
4074 }
4075 }
4076 \f
4077 /* Define dimension sizes for known hardware. */
4078 #define PTX_VECTOR_LENGTH 32
4079 #define PTX_WORKER_LENGTH 32
4080
4081 /* Validate compute dimensions of an OpenACC offload or routine, fill in
4082 non-unity defaults, and return true if anything changed. FN_LEVEL is the
4083 level at which a routine might spawn a loop; it is negative for non-routines. */
4084
4085 static bool
4086 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4087 {
4088 bool changed = false;
4089
4090 /* The vector size must be 32, unless this is a SEQ routine. */
4091 if (fn_level <= GOMP_DIM_VECTOR
4092 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4093 {
4094 if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
4095 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4096 dims[GOMP_DIM_VECTOR]
4097 ? "using vector_length (%d), ignoring %d"
4098 : "using vector_length (%d), ignoring runtime setting",
4099 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4100 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4101 changed = true;
4102 }
4103
4104 /* Check that num_workers is not too large. */
4105 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4106 {
4107 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4108 "using num_workers (%d), ignoring %d",
4109 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4110 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4111 changed = true;
4112 }
4113
4114 return changed;
4115 }
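/* For example, a hypothetical "#pragma acc parallel vector_length (64)"
   would produce the warning "using vector_length (32), ignoring 64"
   and the vector dimension is rewritten to 32; num_workers values
   above 32 are clamped the same way.  */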
4116
4117 /* Return maximum dimension size, or zero for unbounded. */
4118
4119 static int
4120 nvptx_dim_limit (int axis)
4121 {
4122 switch (axis)
4123 {
4124 case GOMP_DIM_WORKER:
4125 return PTX_WORKER_LENGTH;
4126
4127 case GOMP_DIM_VECTOR:
4128 return PTX_VECTOR_LENGTH;
4129
4130 default:
4131 break;
4132 }
4133 return 0;
4134 }
4135
4136 /* Determine whether fork & joins are needed. */
4137
4138 static bool
4139 nvptx_goacc_fork_join (gcall *call, const int dims[],
4140 bool ARG_UNUSED (is_fork))
4141 {
4142 tree arg = gimple_call_arg (call, 2);
4143 unsigned axis = TREE_INT_CST_LOW (arg);
4144
4145 /* We only care about worker and vector partitioning. */
4146 if (axis < GOMP_DIM_WORKER)
4147 return false;
4148
4149 /* If the size is 1, there's no partitioning. */
4150 if (dims[axis] == 1)
4151 return false;
4152
4153 return true;
4154 }
4155
4156 /* Generate a PTX builtin function call that returns the address in
4157 the worker reduction buffer at OFFSET. TYPE is the type of the
4158 data at that location. */
4159
4160 static tree
4161 nvptx_get_worker_red_addr (tree type, tree offset)
4162 {
4163 machine_mode mode = TYPE_MODE (type);
4164 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4165 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4166 tree align = build_int_cst (unsigned_type_node,
4167 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4168 tree call = build_call_expr (fndecl, 3, offset, size, align);
4169
4170 return fold_convert (build_pointer_type (type), call);
4171 }
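/* The tree built above is morally the C fragment

     (TYPE *) __builtin_nvptx_worker_addr (offset, sizeof (TYPE),
					   __alignof__ (TYPE))

   (a sketch; the size and alignment really come from TYPE's machine
   mode).  */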
4172
4173 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR. This function
4174 will cast the variable if necessary. */
4175
4176 static void
4177 nvptx_generate_vector_shuffle (location_t loc,
4178 tree dest_var, tree var, unsigned shift,
4179 gimple_seq *seq)
4180 {
4181 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4182 tree_code code = NOP_EXPR;
4183 tree arg_type = unsigned_type_node;
4184 tree var_type = TREE_TYPE (var);
4185 tree dest_type = var_type;
4186
4187 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4188 var_type = TREE_TYPE (var_type);
4189
4190 if (TREE_CODE (var_type) == REAL_TYPE)
4191 code = VIEW_CONVERT_EXPR;
4192
4193 if (TYPE_SIZE (var_type)
4194 == TYPE_SIZE (long_long_unsigned_type_node))
4195 {
4196 fn = NVPTX_BUILTIN_SHUFFLELL;
4197 arg_type = long_long_unsigned_type_node;
4198 }
4199
4200 tree call = nvptx_builtin_decl (fn, true);
4201 tree bits = build_int_cst (unsigned_type_node, shift);
4202 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4203 tree expr;
4204
4205 if (var_type != dest_type)
4206 {
4207 /* Do real and imaginary parts separately. */
4208 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4209 real = fold_build1 (code, arg_type, real);
4210 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4211 real = fold_build1 (code, var_type, real);
4212
4213 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4214 imag = fold_build1 (code, arg_type, imag);
4215 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4216 imag = fold_build1 (code, var_type, imag);
4217
4218 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4219 }
4220 else
4221 {
4222 expr = fold_build1 (code, arg_type, var);
4223 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4224 expr = fold_build1 (code, dest_type, expr);
4225 }
4226
4227 gimplify_assign (dest_var, expr, seq);
4228 }
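/* For a float VAR and a SHIFT of 16, the gimplified sequence is
   approximately (a C-like sketch of the GIMPLE):

     unsigned u = VIEW_CONVERT_EXPR<unsigned> (var);
     unsigned s = __builtin_nvptx_shuffle (u, 16, SHUFFLE_DOWN);
     dest_var   = VIEW_CONVERT_EXPR<float> (s);

   Complex values run the same dance twice, once per component.  */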
4229
4230 /* Lazily generate the global lock var decl and return its address. */
4231
4232 static tree
4233 nvptx_global_lock_addr ()
4234 {
4235 tree v = global_lock_var;
4236
4237 if (!v)
4238 {
4239 tree name = get_identifier ("__reduction_lock");
4240 tree type = build_qualified_type (unsigned_type_node,
4241 TYPE_QUAL_VOLATILE);
4242 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4243 global_lock_var = v;
4244 DECL_ARTIFICIAL (v) = 1;
4245 DECL_EXTERNAL (v) = 1;
4246 TREE_STATIC (v) = 1;
4247 TREE_PUBLIC (v) = 1;
4248 TREE_USED (v) = 1;
4249 mark_addressable (v);
4250 mark_decl_referenced (v);
4251 }
4252
4253 return build_fold_addr_expr (v);
4254 }
4255
4256 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4257 GSI. We use a lockless scheme for nearly all cases, which looks
4258 like:
4259 actual = initval(OP);
4260 do {
4261 guess = actual;
4262 write = guess OP myval;
4263 actual = cmp&swap (ptr, guess, write)
4264 } while (actual bit-different-to guess);
4265 return write;
4266
4267 This relies on a cmp&swap instruction, which is available for 32-
4268 and 64-bit types. Larger types must use a locking scheme. */
4269
4270 static tree
4271 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4272 tree ptr, tree var, tree_code op)
4273 {
4274 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4275 tree_code code = NOP_EXPR;
4276 tree arg_type = unsigned_type_node;
4277 tree var_type = TREE_TYPE (var);
4278
4279 if (TREE_CODE (var_type) == COMPLEX_TYPE
4280 || TREE_CODE (var_type) == REAL_TYPE)
4281 code = VIEW_CONVERT_EXPR;
4282
4283 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4284 {
4285 arg_type = long_long_unsigned_type_node;
4286 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4287 }
4288
4289 tree swap_fn = nvptx_builtin_decl (fn, true);
4290
4291 gimple_seq init_seq = NULL;
4292 tree init_var = make_ssa_name (arg_type);
4293 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4294 init_expr = fold_build1 (code, arg_type, init_expr);
4295 gimplify_assign (init_var, init_expr, &init_seq);
4296 gimple *init_end = gimple_seq_last (init_seq);
4297
4298 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4299
4300 /* Split the block just after the init stmts. */
4301 basic_block pre_bb = gsi_bb (*gsi);
4302 edge pre_edge = split_block (pre_bb, init_end);
4303 basic_block loop_bb = pre_edge->dest;
4304 pre_bb = pre_edge->src;
4305 /* Reset the iterator. */
4306 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4307
4308 tree expect_var = make_ssa_name (arg_type);
4309 tree actual_var = make_ssa_name (arg_type);
4310 tree write_var = make_ssa_name (arg_type);
4311
4312 /* Build and insert the reduction calculation. */
4313 gimple_seq red_seq = NULL;
4314 tree write_expr = fold_build1 (code, var_type, expect_var);
4315 write_expr = fold_build2 (op, var_type, write_expr, var);
4316 write_expr = fold_build1 (code, arg_type, write_expr);
4317 gimplify_assign (write_var, write_expr, &red_seq);
4318
4319 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4320
4321 /* Build & insert the cmp&swap sequence. */
4322 gimple_seq latch_seq = NULL;
4323 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4324 ptr, expect_var, write_var);
4325 gimplify_assign (actual_var, swap_expr, &latch_seq);
4326
4327 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4328 NULL_TREE, NULL_TREE);
4329 gimple_seq_add_stmt (&latch_seq, cond);
4330
4331 gimple *latch_end = gimple_seq_last (latch_seq);
4332 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4333
4334 /* Split the block just after the latch stmts. */
4335 edge post_edge = split_block (loop_bb, latch_end);
4336 basic_block post_bb = post_edge->dest;
4337 loop_bb = post_edge->src;
4338 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4339
4340 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4341 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4342 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4343 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4344
4345 gphi *phi = create_phi_node (expect_var, loop_bb);
4346 add_phi_arg (phi, init_var, pre_edge, loc);
4347 add_phi_arg (phi, actual_var, loop_edge, loc);
4348
4349 loop *loop = alloc_loop ();
4350 loop->header = loop_bb;
4351 loop->latch = loop_bb;
4352 add_loop (loop, loop_bb->loop_father);
4353
4354 return fold_build1 (code, var_type, write_var);
4355 }
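/* Instantiating the scheme for "*ptr += var" on unsigned int gives,
   roughly (a sketch; EXPECT really flows around the loop via the PHI
   node created above):

     expect = 0;				// initval of PLUS
     do
       {
	 write = expect + var;
	 actual = __builtin_nvptx_cmp_swap (ptr, expect, write);
	 done = (actual == expect);
	 expect = actual;			// next guess
       }
     while (!done);
     return write;

   Floating-point types go through VIEW_CONVERT_EXPR, so the equality
   test is bitwise.  */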
4356
4357 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4358 GSI. This is necessary for types larger than 64 bits, where there
4359 is no cmp&swap instruction to implement a lockless scheme. We use
4360 a lock variable in global memory.
4361
4362 while (cmp&swap (&lock_var, 0, 1))
4363 continue;
4364 T accum = *ptr;
4365 accum = accum OP var;
4366 *ptr = accum;
4367 cmp&swap (&lock_var, 1, 0);
4368 return accum;
4369
4370 A lock in global memory is necessary to force execution engine
4371 descheduling and avoid resource starvation that can occur if the
4372 lock is in .shared memory. */
4373
4374 static tree
4375 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4376 tree ptr, tree var, tree_code op)
4377 {
4378 tree var_type = TREE_TYPE (var);
4379 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4380 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4381 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4382
4383 /* Split the block just before the gsi. Insert a gimple nop to make
4384 this easier. */
4385 gimple *nop = gimple_build_nop ();
4386 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4387 basic_block entry_bb = gsi_bb (*gsi);
4388 edge entry_edge = split_block (entry_bb, nop);
4389 basic_block lock_bb = entry_edge->dest;
4390 /* Reset the iterator. */
4391 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4392
4393 /* Build and insert the locking sequence. */
4394 gimple_seq lock_seq = NULL;
4395 tree lock_var = make_ssa_name (unsigned_type_node);
4396 tree lock_expr = nvptx_global_lock_addr ();
4397 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4398 uns_unlocked, uns_locked);
4399 gimplify_assign (lock_var, lock_expr, &lock_seq);
4400 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4401 NULL_TREE, NULL_TREE);
4402 gimple_seq_add_stmt (&lock_seq, cond);
4403 gimple *lock_end = gimple_seq_last (lock_seq);
4404 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4405
4406 /* Split the block just after the lock sequence. */
4407 edge locked_edge = split_block (lock_bb, lock_end);
4408 basic_block update_bb = locked_edge->dest;
4409 lock_bb = locked_edge->src;
4410 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4411
4412 /* Create the lock loop ... */
4413 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4414 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4415 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4416 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4417
4418 /* ... and the loop structure. */
4419 loop *lock_loop = alloc_loop ();
4420 lock_loop->header = lock_bb;
4421 lock_loop->latch = lock_bb;
4422 lock_loop->nb_iterations_estimate = 1;
4423 lock_loop->any_estimate = true;
4424 add_loop (lock_loop, entry_bb->loop_father);
4425
4426 /* Build and insert the reduction calculation. */
4427 gimple_seq red_seq = NULL;
4428 tree acc_in = make_ssa_name (var_type);
4429 tree ref_in = build_simple_mem_ref (ptr);
4430 TREE_THIS_VOLATILE (ref_in) = 1;
4431 gimplify_assign (acc_in, ref_in, &red_seq);
4432
4433 tree acc_out = make_ssa_name (var_type);
4434 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4435 gimplify_assign (acc_out, update_expr, &red_seq);
4436
4437 tree ref_out = build_simple_mem_ref (ptr);
4438 TREE_THIS_VOLATILE (ref_out) = 1;
4439 gimplify_assign (ref_out, acc_out, &red_seq);
4440
4441 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4442
4443 /* Build & insert the unlock sequence. */
4444 gimple_seq unlock_seq = NULL;
4445 tree unlock_expr = nvptx_global_lock_addr ();
4446 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4447 uns_locked, uns_unlocked);
4448 gimplify_and_add (unlock_expr, &unlock_seq);
4449 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4450
4451 return acc_out;
4452 }
4453
4454 /* Emit a sequence to update a reduction accumulator at *PTR with the
4455 value held in VAR using operator OP. Return the updated value.
4456
4457 TODO: optimize for atomic ops and independent complex ops. */
4458
4459 static tree
4460 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4461 tree ptr, tree var, tree_code op)
4462 {
4463 tree type = TREE_TYPE (var);
4464 tree size = TYPE_SIZE (type);
4465
4466 if (size == TYPE_SIZE (unsigned_type_node)
4467 || size == TYPE_SIZE (long_long_unsigned_type_node))
4468 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4469 else
4470 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4471 }
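/* So int, unsigned, float, long long and double all take the
   lockless path, while a wider type such as complex double (128
   bits, for which no cmp&swap is available) falls back to the global
   lock.  */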
4472
4473 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4474
4475 static void
4476 nvptx_goacc_reduction_setup (gcall *call)
4477 {
4478 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4479 tree lhs = gimple_call_lhs (call);
4480 tree var = gimple_call_arg (call, 2);
4481 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4482 gimple_seq seq = NULL;
4483
4484 push_gimplify_context (true);
4485
4486 if (level != GOMP_DIM_GANG)
4487 {
4488 /* Copy the receiver object. */
4489 tree ref_to_res = gimple_call_arg (call, 1);
4490
4491 if (!integer_zerop (ref_to_res))
4492 var = build_simple_mem_ref (ref_to_res);
4493 }
4494
4495 if (level == GOMP_DIM_WORKER)
4496 {
4497 /* Store incoming value to worker reduction buffer. */
4498 tree offset = gimple_call_arg (call, 5);
4499 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4500 tree ptr = make_ssa_name (TREE_TYPE (call));
4501
4502 gimplify_assign (ptr, call, &seq);
4503 tree ref = build_simple_mem_ref (ptr);
4504 TREE_THIS_VOLATILE (ref) = 1;
4505 gimplify_assign (ref, var, &seq);
4506 }
4507
4508 if (lhs)
4509 gimplify_assign (lhs, var, &seq);
4510
4511 pop_gimplify_context (NULL);
4512 gsi_replace_with_seq (&gsi, seq, true);
4513 }
4514
4515 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4516
4517 static void
4518 nvptx_goacc_reduction_init (gcall *call)
4519 {
4520 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4521 tree lhs = gimple_call_lhs (call);
4522 tree var = gimple_call_arg (call, 2);
4523 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4524 enum tree_code rcode
4525 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4526 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4527 TREE_TYPE (var));
4528 gimple_seq seq = NULL;
4529
4530 push_gimplify_context (true);
4531
4532 if (level == GOMP_DIM_VECTOR)
4533 {
4534 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
4535 tree tid = make_ssa_name (integer_type_node);
4536 tree dim_vector = gimple_call_arg (call, 3);
4537 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4538 dim_vector);
4539 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4540 NULL_TREE, NULL_TREE);
4541
4542 gimple_call_set_lhs (tid_call, tid);
4543 gimple_seq_add_stmt (&seq, tid_call);
4544 gimple_seq_add_stmt (&seq, cond_stmt);
4545
4546 /* Split the block just after the call. */
4547 edge init_edge = split_block (gsi_bb (gsi), call);
4548 basic_block init_bb = init_edge->dest;
4549 basic_block call_bb = init_edge->src;
4550
4551 /* Fixup flags from call_bb to init_bb. */
4552 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4553
4554 /* Set the initialization stmts. */
4555 gimple_seq init_seq = NULL;
4556 tree init_var = make_ssa_name (TREE_TYPE (var));
4557 gimplify_assign (init_var, init, &init_seq);
4558 gsi = gsi_start_bb (init_bb);
4559 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4560
4561 /* Split block just after the init stmt. */
4562 gsi_prev (&gsi);
4563 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4564 basic_block dst_bb = inited_edge->dest;
4565
4566 /* Create false edge from call_bb to dst_bb. */
4567 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4568
4569 /* Create phi node in dst block. */
4570 gphi *phi = create_phi_node (lhs, dst_bb);
4571 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4572 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4573
4574 /* Reset dominator of dst bb. */
4575 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4576
4577 /* Reset the gsi. */
4578 gsi = gsi_for_stmt (call);
4579 }
4580 else
4581 {
4582 if (level == GOMP_DIM_GANG)
4583 {
4584 /* If there's no receiver object, propagate the incoming VAR. */
4585 tree ref_to_res = gimple_call_arg (call, 1);
4586 if (integer_zerop (ref_to_res))
4587 init = var;
4588 }
4589
4590 gimplify_assign (lhs, init, &seq);
4591 }
4592
4593 pop_gimplify_context (NULL);
4594 gsi_replace_with_seq (&gsi, seq, true);
4595 }
4596
4597 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4598
4599 static void
4600 nvptx_goacc_reduction_fini (gcall *call)
4601 {
4602 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4603 tree lhs = gimple_call_lhs (call);
4604 tree ref_to_res = gimple_call_arg (call, 1);
4605 tree var = gimple_call_arg (call, 2);
4606 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4607 enum tree_code op
4608 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4609 gimple_seq seq = NULL;
4610 tree r = NULL_TREE;
4611
4612 push_gimplify_context (true);
4613
4614 if (level == GOMP_DIM_VECTOR)
4615 {
4616 /* Emit a binary shuffle tree. TODO: emit this as an actual loop,
4617 but that requires a method of emitting a unified jump at the
4618 gimple level. */
4619 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4620 {
4621 tree other_var = make_ssa_name (TREE_TYPE (var));
4622 nvptx_generate_vector_shuffle (gimple_location (call),
4623 other_var, var, shfl, &seq);
4624
4625 r = make_ssa_name (TREE_TYPE (var));
4626 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4627 var, other_var), &seq);
4628 var = r;
4629 }
4630 }
4631 else
4632 {
4633 tree accum = NULL_TREE;
4634
4635 if (level == GOMP_DIM_WORKER)
4636 {
4637 /* Get reduction buffer address. */
4638 tree offset = gimple_call_arg (call, 5);
4639 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4640 tree ptr = make_ssa_name (TREE_TYPE (call));
4641
4642 gimplify_assign (ptr, call, &seq);
4643 accum = ptr;
4644 }
4645 else if (integer_zerop (ref_to_res))
4646 r = var;
4647 else
4648 accum = ref_to_res;
4649
4650 if (accum)
4651 {
4652 /* UPDATE the accumulator. */
4653 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4654 seq = NULL;
4655 r = nvptx_reduction_update (gimple_location (call), &gsi,
4656 accum, var, op);
4657 }
4658 }
4659
4660 if (lhs)
4661 gimplify_assign (lhs, r, &seq);
4662 pop_gimplify_context (NULL);
4663
4664 gsi_replace_with_seq (&gsi, seq, true);
4665 }
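/* At the vector level the loop above unrolls into a log2-depth tree;
   for PTX_VECTOR_LENGTH == 32 and PLUS it is, schematically:

     var += shfl.down (var, 16);
     var += shfl.down (var, 8);
     var += shfl.down (var, 4);
     var += shfl.down (var, 2);
     var += shfl.down (var, 1);

   after which lane 0 holds the reduction of all 32 lanes.  */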
4666
4667 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4668
4669 static void
4670 nvptx_goacc_reduction_teardown (gcall *call)
4671 {
4672 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4673 tree lhs = gimple_call_lhs (call);
4674 tree var = gimple_call_arg (call, 2);
4675 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4676 gimple_seq seq = NULL;
4677
4678 push_gimplify_context (true);
4679 if (level == GOMP_DIM_WORKER)
4680 {
4681 /* Read the worker reduction buffer. */
4682 tree offset = gimple_call_arg (call, 5);
4683 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4684 tree ptr = make_ssa_name (TREE_TYPE (call));
4685
4686 gimplify_assign (ptr, call, &seq);
4687 var = build_simple_mem_ref (ptr);
4688 TREE_THIS_VOLATILE (var) = 1;
4689 }
4690
4691 if (level != GOMP_DIM_GANG)
4692 {
4693 /* Write to the receiver object. */
4694 tree ref_to_res = gimple_call_arg (call, 1);
4695
4696 if (!integer_zerop (ref_to_res))
4697 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4698 }
4699
4700 if (lhs)
4701 gimplify_assign (lhs, var, &seq);
4702
4703 pop_gimplify_context (NULL);
4704
4705 gsi_replace_with_seq (&gsi, seq, true);
4706 }
4707
4708 /* NVPTX reduction expander. */
4709
4710 void
4711 nvptx_goacc_reduction (gcall *call)
4712 {
4713 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4714
4715 switch (code)
4716 {
4717 case IFN_GOACC_REDUCTION_SETUP:
4718 nvptx_goacc_reduction_setup (call);
4719 break;
4720
4721 case IFN_GOACC_REDUCTION_INIT:
4722 nvptx_goacc_reduction_init (call);
4723 break;
4724
4725 case IFN_GOACC_REDUCTION_FINI:
4726 nvptx_goacc_reduction_fini (call);
4727 break;
4728
4729 case IFN_GOACC_REDUCTION_TEARDOWN:
4730 nvptx_goacc_reduction_teardown (call);
4731 break;
4732
4733 default:
4734 gcc_unreachable ();
4735 }
4736 }
4737
4738 #undef TARGET_OPTION_OVERRIDE
4739 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4740
4741 #undef TARGET_ATTRIBUTE_TABLE
4742 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4743
4744 #undef TARGET_LEGITIMATE_ADDRESS_P
4745 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4746
4747 #undef TARGET_PROMOTE_FUNCTION_MODE
4748 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4749
4750 #undef TARGET_FUNCTION_ARG
4751 #define TARGET_FUNCTION_ARG nvptx_function_arg
4752 #undef TARGET_FUNCTION_INCOMING_ARG
4753 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4754 #undef TARGET_FUNCTION_ARG_ADVANCE
4755 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4756 #undef TARGET_FUNCTION_ARG_BOUNDARY
4757 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
4758 #undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
4759 #define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
4760 #undef TARGET_PASS_BY_REFERENCE
4761 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4762 #undef TARGET_FUNCTION_VALUE_REGNO_P
4763 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4764 #undef TARGET_FUNCTION_VALUE
4765 #define TARGET_FUNCTION_VALUE nvptx_function_value
4766 #undef TARGET_LIBCALL_VALUE
4767 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4768 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4769 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4770 #undef TARGET_GET_DRAP_RTX
4771 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4772 #undef TARGET_SPLIT_COMPLEX_ARG
4773 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4774 #undef TARGET_RETURN_IN_MEMORY
4775 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4776 #undef TARGET_OMIT_STRUCT_RETURN_REG
4777 #define TARGET_OMIT_STRUCT_RETURN_REG true
4778 #undef TARGET_STRICT_ARGUMENT_NAMING
4779 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4780 #undef TARGET_STATIC_CHAIN
4781 #define TARGET_STATIC_CHAIN nvptx_static_chain
4782
4783 #undef TARGET_CALL_ARGS
4784 #define TARGET_CALL_ARGS nvptx_call_args
4785 #undef TARGET_END_CALL_ARGS
4786 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4787
4788 #undef TARGET_ASM_FILE_START
4789 #define TARGET_ASM_FILE_START nvptx_file_start
4790 #undef TARGET_ASM_FILE_END
4791 #define TARGET_ASM_FILE_END nvptx_file_end
4792 #undef TARGET_ASM_GLOBALIZE_LABEL
4793 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4794 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4795 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4796 #undef TARGET_PRINT_OPERAND
4797 #define TARGET_PRINT_OPERAND nvptx_print_operand
4798 #undef TARGET_PRINT_OPERAND_ADDRESS
4799 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4800 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4801 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4802 #undef TARGET_ASM_INTEGER
4803 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4804 #undef TARGET_ASM_DECL_END
4805 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4806 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4807 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4808 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4809 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4810 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4811 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4812
4813 #undef TARGET_MACHINE_DEPENDENT_REORG
4814 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4815 #undef TARGET_NO_REGISTER_ALLOCATION
4816 #define TARGET_NO_REGISTER_ALLOCATION true
4817
4818 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4819 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4820
4821 #undef TARGET_VECTOR_ALIGNMENT
4822 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4823
4824 #undef TARGET_CANNOT_COPY_INSN_P
4825 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4826
4827 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4828 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4829
4830 #undef TARGET_INIT_BUILTINS
4831 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4832 #undef TARGET_EXPAND_BUILTIN
4833 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4834 #undef TARGET_BUILTIN_DECL
4835 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4836
4837 #undef TARGET_GOACC_VALIDATE_DIMS
4838 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4839
4840 #undef TARGET_GOACC_DIM_LIMIT
4841 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4842
4843 #undef TARGET_GOACC_FORK_JOIN
4844 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4845
4846 #undef TARGET_GOACC_REDUCTION
4847 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4848
4849 struct gcc_target targetm = TARGET_INITIALIZER;
4850
4851 #include "gt-nvptx.h"