nvptx.c (nvptx_function_arg, [...]): Move earlier.
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2015 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "expmed.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic.h"
38 #include "alias.h"
39 #include "insn-flags.h"
40 #include "output.h"
41 #include "insn-attr.h"
42 #include "flags.h"
43 #include "dojump.h"
44 #include "explow.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "stmt.h"
48 #include "expr.h"
49 #include "tm-preds.h"
50 #include "tm-constrs.h"
51 #include "langhooks.h"
52 #include "dbxout.h"
53 #include "cfgrtl.h"
54 #include "gimple.h"
55 #include "stor-layout.h"
56 #include "builtins.h"
57 #include "omp-low.h"
58 #include "gomp-constants.h"
59 #include "dumpfile.h"
60 #include "internal-fn.h"
61 #include "gimple-iterator.h"
62 #include "stringpool.h"
63 #include "tree-ssa-operands.h"
64 #include "tree-ssanames.h"
65 #include "gimplify.h"
66 #include "tree-phinodes.h"
67 #include "cfgloop.h"
68 #include "fold-const.h"
69
70 /* This file should be included last. */
71 #include "target-def.h"
72
73 /* The kind of shuffle instruction. */
74 enum nvptx_shuffle_kind
75 {
76 SHUFFLE_UP,
77 SHUFFLE_DOWN,
78 SHUFFLE_BFLY,
79 SHUFFLE_IDX,
80 SHUFFLE_MAX
81 };
82
83 /* The various PTX memory areas an object might reside in. */
84 enum nvptx_data_area
85 {
86 DATA_AREA_GENERIC,
87 DATA_AREA_GLOBAL,
88 DATA_AREA_SHARED,
89 DATA_AREA_LOCAL,
90 DATA_AREA_CONST,
91 DATA_AREA_PARAM,
92 DATA_AREA_MAX
93 };
94
95 /* We record the data area in the target symbol flags. */
96 #define SYMBOL_DATA_AREA(SYM) \
97 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
98 & 7)
99 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
100 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
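
/* Illustrative sketch only: with the two macros above, a symbol destined
   for shared memory would be tagged roughly as
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
   i.e. the area is OR'd into SYMBOL_REF_FLAGS above
   SYMBOL_FLAG_MACH_DEP_SHIFT, and SYMBOL_DATA_AREA recovers it by shifting
   back down and masking with 7.  */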
101
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls;
105
106 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
107 {
108 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
109 static bool equal (rtx a, rtx b) { return a == b; }
110 };
111
112 static GTY((cache))
113 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
114
115 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
116 {
117 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
118 static bool equal (tree a, tree b) { return a == b; }
119 };
120
121 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
122 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
123
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size;
130 static unsigned worker_bcast_align;
131 #define worker_bcast_name "__worker_bcast"
132 static GTY(()) rtx worker_bcast_sym;
133
134 /* Buffer needed for worker reductions. This has to be distinct from
135 the worker broadcast array, as both may be live concurrently. */
136 static unsigned worker_red_size;
137 static unsigned worker_red_align;
138 #define worker_red_name "__worker_red"
139 static GTY(()) rtx worker_red_sym;
140
141 /* Global lock variable, needed for 128bit worker & gang reductions. */
142 static GTY(()) tree global_lock_var;
143
144 /* Allocate a new, cleared machine_function structure. */
145
146 static struct machine_function *
147 nvptx_init_machine_status (void)
148 {
149 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
150 p->ret_reg_mode = VOIDmode;
151 return p;
152 }
153
154 /* Implement TARGET_OPTION_OVERRIDE. */
155
156 static void
157 nvptx_option_override (void)
158 {
159 init_machine_status = nvptx_init_machine_status;
160 /* Gives us a predictable order, which we need especially for variables. */
161 flag_toplevel_reorder = 1;
162 /* Assumes that it will see only hard registers. */
163 flag_var_tracking = 0;
164 write_symbols = NO_DEBUG;
165 debug_info_level = DINFO_LEVEL_NONE;
166
167 if (nvptx_optimize < 0)
168 nvptx_optimize = optimize > 0;
169
170 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
171 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
172 declared_libfuncs_htab
173 = hash_table<declared_libfunc_hasher>::create_ggc (17);
174
175 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
176 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
177 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
178
179 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
180 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
181 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
182 }
183
184 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
185 deal with ptx idiosyncrasies. */
186
187 const char *
188 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
189 {
190 switch (mode)
191 {
192 case BLKmode:
193 return ".b8";
194 case BImode:
195 return ".pred";
196 case QImode:
197 if (promote)
198 return ".u32";
199 else
200 return ".u8";
201 case HImode:
202 return ".u16";
203 case SImode:
204 return ".u32";
205 case DImode:
206 return ".u64";
207
208 case SFmode:
209 return ".f32";
210 case DFmode:
211 return ".f64";
212
213 default:
214 gcc_unreachable ();
215 }
216 }
217
218 /* Encode the PTX data area that DECL (which might not actually be a
219 _DECL) should reside in. */
220
221 static void
222 nvptx_encode_section_info (tree decl, rtx rtl, int first)
223 {
224 default_encode_section_info (decl, rtl, first);
225 if (first && MEM_P (rtl))
226 {
227 nvptx_data_area area = DATA_AREA_GENERIC;
228
229 if (TREE_CONSTANT (decl))
230 area = DATA_AREA_CONST;
231 else if (TREE_CODE (decl) == VAR_DECL)
232 /* TODO: This would be a good place to check for a .shared or
233 other section name. */
234 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
235
236 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
237 }
238 }
239
240 /* Return the PTX name of the data area in which SYM should be
241 placed. The symbol must have already been processed by
242 nvptx_encode_section_info, or equivalent. */
243
244 static const char *
245 section_for_sym (rtx sym)
246 {
247 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
248 /* Same order as nvptx_data_area enum. */
249 static char const *const areas[] =
250 {"", ".global", ".shared", ".local", ".const", ".param"};
251
252 return areas[area];
253 }
254
255 /* Similarly for a decl. */
256
257 static const char *
258 section_for_decl (const_tree decl)
259 {
260 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
261 }
262
263 /* Check NAME for special function names and redirect them by returning a
264 replacement. This applies to malloc, free and realloc, for which we
265 want to use libgcc wrappers, and call, which triggers a bug in ptxas. */
266
267 static const char *
268 nvptx_name_replacement (const char *name)
269 {
270 if (strcmp (name, "call") == 0)
271 return "__nvptx_call";
272 if (strcmp (name, "malloc") == 0)
273 return "__nvptx_malloc";
274 if (strcmp (name, "free") == 0)
275 return "__nvptx_free";
276 if (strcmp (name, "realloc") == 0)
277 return "__nvptx_realloc";
278 return name;
279 }
280
281 /* If MODE should be treated as two registers of an inner mode, return
282 that inner mode. Otherwise return VOIDmode. */
283
284 static machine_mode
285 maybe_split_mode (machine_mode mode)
286 {
287 if (COMPLEX_MODE_P (mode))
288 return GET_MODE_INNER (mode);
289
290 if (mode == TImode)
291 return DImode;
292
293 return VOIDmode;
294 }
295
296 /* Output a register, subreg, or register pair (with optional
297 enclosing braces). */
298
299 static void
300 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
301 int subreg_offset = -1)
302 {
303 if (inner_mode == VOIDmode)
304 {
305 if (HARD_REGISTER_NUM_P (regno))
306 fprintf (file, "%s", reg_names[regno]);
307 else
308 fprintf (file, "%%r%d", regno);
309 }
310 else if (subreg_offset >= 0)
311 {
312 output_reg (file, regno, VOIDmode);
313 fprintf (file, "$%d", subreg_offset);
314 }
315 else
316 {
317 if (subreg_offset == -1)
318 fprintf (file, "{");
319 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
320 fprintf (file, ",");
321 output_reg (file, regno, inner_mode, 0);
322 if (subreg_offset == -1)
323 fprintf (file, "}");
324 }
325 }
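
/* Illustrative only (register numbers are made up): a plain pseudo 23
   prints as "%r23"; with INNER_MODE DImode and no explicit subreg offset
   the register pair "{%r23$8,%r23$0}" is printed; an explicit
   SUBREG_OFFSET prints "%r23$<offset>".  */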
326
327 /* Emit forking instructions for MASK. */
328
329 static void
330 nvptx_emit_forking (unsigned mask, bool is_call)
331 {
332 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
333 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
334 if (mask)
335 {
336 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
337
338 /* Emit fork at all levels. This helps form SESE regions, as
339 it creates a block with a single successor before entering a
340 partitioned region. That is a good candidate for the end of
341 an SESE region. */
342 if (!is_call)
343 emit_insn (gen_nvptx_fork (op));
344 emit_insn (gen_nvptx_forked (op));
345 }
346 }
347
348 /* Emit joining instructions for MASK. */
349
350 static void
351 nvptx_emit_joining (unsigned mask, bool is_call)
352 {
353 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
354 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
355 if (mask)
356 {
357 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
358
359 /* Emit joining for all non-call pars to ensure there's a single
360 predecessor for the block the join insn ends up in. This is
361 needed for skipping entire loops. */
362 if (!is_call)
363 emit_insn (gen_nvptx_joining (op));
364 emit_insn (gen_nvptx_join (op));
365 }
366 }
367
368 #define PASS_IN_REG_P(MODE, TYPE) \
369 ((GET_MODE_CLASS (MODE) == MODE_INT \
370 || GET_MODE_CLASS (MODE) == MODE_FLOAT \
371 || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT \
372 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT) \
373 && !AGGREGATE_TYPE_P (TYPE))) \
374 && (MODE) != TImode)
375
376 #define RETURN_IN_REG_P(MODE) \
377 ((GET_MODE_CLASS (MODE) == MODE_INT \
378 || GET_MODE_CLASS (MODE) == MODE_FLOAT) \
379 && GET_MODE_SIZE (MODE) <= 8)
380 \f
381 /* Perform a mode promotion for a function argument with MODE. Return
382 the promoted mode. */
383
384 static machine_mode
385 arg_promotion (machine_mode mode)
386 {
387 if (mode == QImode || mode == HImode)
388 return SImode;
389 return mode;
390 }
391
392 /* Implement TARGET_FUNCTION_ARG. */
393
394 static rtx
395 nvptx_function_arg (cumulative_args_t, machine_mode mode,
396 const_tree, bool named)
397 {
398 if (mode == VOIDmode)
399 return NULL_RTX;
400
401 if (named)
402 return gen_reg_rtx (mode);
403 return NULL_RTX;
404 }
405
406 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
407
408 static rtx
409 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
410 const_tree, bool named)
411 {
412 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
413 if (mode == VOIDmode)
414 return NULL_RTX;
415
416 if (!named)
417 return NULL_RTX;
418
419 /* No need to deal with split modes here, the only case that can
420 happen is complex modes and those are dealt with by
421 TARGET_SPLIT_COMPLEX_ARG. */
422 return gen_rtx_UNSPEC (mode,
423 gen_rtvec (1, GEN_INT (cum->count)),
424 UNSPEC_ARG_REG);
425 }
426
427 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
428
429 static void
430 nvptx_function_arg_advance (cumulative_args_t cum_v,
431 machine_mode ARG_UNUSED (mode),
432 const_tree ARG_UNUSED (type),
433 bool ARG_UNUSED (named))
434 {
435 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
436 cum->count++;
437 }
438
439 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
440
441 For nvptx, we know how to handle functions declared as stdarg: by
442 passing an extra pointer to the unnamed arguments. However, the
443 Fortran frontend can produce a different situation, where a
444 function pointer is declared with no arguments, but the actual
445 function and calls to it take more arguments. In that case, we
446 want to ensure the call matches the definition of the function. */
447
448 static bool
449 nvptx_strict_argument_naming (cumulative_args_t cum_v)
450 {
451 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
452 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
453 }
454
455 /* Implement TARGET_FUNCTION_ARG_BOUNDARY. */
456
457 static unsigned int
458 nvptx_function_arg_boundary (machine_mode mode, const_tree type)
459 {
460 unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);
461
462 if (boundary > BITS_PER_WORD)
463 return 2 * BITS_PER_WORD;
464
465 if (mode == BLKmode)
466 {
467 HOST_WIDE_INT size = int_size_in_bytes (type);
468 if (size > 4)
469 return 2 * BITS_PER_WORD;
470 if (boundary < BITS_PER_WORD)
471 {
472 if (size >= 3)
473 return BITS_PER_WORD;
474 if (size >= 2)
475 return 2 * BITS_PER_UNIT;
476 }
477 }
478 return boundary;
479 }
480
481 /* Implement TARGET_LIBCALL_VALUE. */
482
483 static rtx
484 nvptx_libcall_value (machine_mode mode, const_rtx)
485 {
486 if (cfun->machine->start_call == NULL_RTX)
487 /* Pretend to return in a hard reg for early uses before pseudos can be
488 generated. */
489 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
490 return gen_reg_rtx (mode);
491 }
492
493 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
494 where function FUNC returns or receives a value of data type TYPE. */
495
496 static rtx
497 nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
498 bool outgoing)
499 {
500 int unsignedp = TYPE_UNSIGNED (type);
501 machine_mode orig_mode = TYPE_MODE (type);
502 machine_mode mode = promote_function_mode (type, orig_mode,
503 &unsignedp, NULL_TREE, 1);
504 if (outgoing)
505 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
506 if (cfun->machine->start_call == NULL_RTX)
507 /* Pretend to return in a hard reg for early uses before pseudos can be
508 generated. */
509 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
510 return gen_reg_rtx (mode);
511 }
512
513 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
514
515 static bool
516 nvptx_function_value_regno_p (const unsigned int regno)
517 {
518 return regno == NVPTX_RETURN_REGNUM;
519 }
520
521 /* Types with a mode other than those supported by the machine are passed by
522 reference in memory. */
523
524 static bool
525 nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
526 const_tree type, bool)
527 {
528 return !PASS_IN_REG_P (mode, type);
529 }
530
531 /* Implement TARGET_RETURN_IN_MEMORY. */
532
533 static bool
534 nvptx_return_in_memory (const_tree type, const_tree)
535 {
536 machine_mode mode = TYPE_MODE (type);
537 if (!RETURN_IN_REG_P (mode))
538 return true;
539 return false;
540 }
541
542 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
543
544 static machine_mode
545 nvptx_promote_function_mode (const_tree type, machine_mode mode,
546 int *punsignedp,
547 const_tree funtype, int for_return)
548 {
549 if (type == NULL_TREE)
550 return mode;
551 if (for_return)
552 return promote_mode (type, mode, punsignedp);
553 /* For K&R-style functions, try to match the language promotion rules to
554 minimize type mismatches at assembly time. */
555 if (TYPE_ARG_TYPES (funtype) == NULL_TREE
556 && type != NULL_TREE
557 && !AGGREGATE_TYPE_P (type))
558 {
559 if (mode == SFmode)
560 mode = DFmode;
561 mode = arg_promotion (mode);
562 }
563
564 return mode;
565 }
566
567 /* Implement TARGET_STATIC_CHAIN. */
568
569 static rtx
570 nvptx_static_chain (const_tree fndecl, bool incoming_p)
571 {
572 if (!DECL_STATIC_CHAIN (fndecl))
573 return NULL;
574
575 if (incoming_p)
576 return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
577 else
578 return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
579 }
580
581 /* Helper for write_arg. Emit a single PTX argument of MODE, either
582 in a prototype, or as copy in a function prologue. ARGNO is the
583 index of this argument in the PTX function. FOR_REG is negative
584 if we're emitting the PTX prototype. It is zero if we're copying
585 to an argument register and it is greater than zero if we're
586 copying to a specific hard register. */
587
588 static int
589 write_one_arg (std::stringstream &s, int for_reg, int argno, machine_mode mode)
590 {
591 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
592
593 if (for_reg < 0)
594 {
595 /* Writing PTX prototype. */
596 s << (argno ? ", " : " (");
597 s << ".param" << ptx_type << " %in_ar" << argno;
598 if (mode == QImode || mode == HImode)
599 s << "[1]";
600 }
601 else
602 {
603 s << "\t.reg" << ptx_type << " ";
604 if (for_reg)
605 s << reg_names[for_reg];
606 else
607 s << "%ar" << argno;
608 s << ";\n";
609 s << "\tld.param" << ptx_type << " ";
610 if (for_reg)
611 s << reg_names[for_reg];
612 else
613 s << "%ar" << argno;
614 s << ", [%in_ar" << argno << "];\n";
615 }
616 return argno + 1;
617 }
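
/* For a hypothetical SImode argument 0, the prototype branch above emits
   " (.param.u32 %in_ar0", and the prologue branch (FOR_REG == 0) emits
     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];
   (a sketch derived from the format strings above).  */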
618
619 /* Process function parameter TYPE to emit one or more PTX
620 arguments. PROTOTYPED is true if this is a prototyped function,
621 rather than an old-style C declaration.
622
623 The promotion behaviour here must match the regular GCC function
624 parameter marshalling machinery. */
625
626 static int
627 write_arg (std::stringstream &s, int for_reg, int argno,
628 tree type, bool prototyped)
629 {
630 machine_mode mode = TYPE_MODE (type);
631
632 if (mode == VOIDmode)
633 return argno;
634
635 if (!PASS_IN_REG_P (mode, type))
636 mode = Pmode;
637
638 machine_mode split = maybe_split_mode (mode);
639 if (split != VOIDmode)
640 mode = split;
641
642 if (!prototyped && !AGGREGATE_TYPE_P (type))
643 {
644 if (mode == SFmode)
645 mode = DFmode;
646 mode = arg_promotion (mode);
647 }
648 else if (for_reg >= 0)
649 mode = arg_promotion (mode);
650
651 if (split != VOIDmode)
652 argno = write_one_arg (s, for_reg, argno, mode);
653 return write_one_arg (s, for_reg, argno, mode);
654 }
655
656 static bool
657 write_return (std::stringstream &s, bool for_proto, tree type,
658 machine_mode ret_mode)
659 {
660 machine_mode mode = TYPE_MODE (type);
661 bool return_in_mem = mode != VOIDmode && !RETURN_IN_REG_P (mode);
662
663 mode = arg_promotion (mode);
664 if (for_proto)
665 {
666 if (!return_in_mem && mode != VOIDmode)
667 s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
668 << " %out_retval) ";
669 }
670 else
671 {
672 /* Prologue. C++11 ABI causes us to return a reference to the
673 passed in pointer for return_in_mem. */
674 ret_mode = arg_promotion (ret_mode);
675 if (ret_mode != VOIDmode)
676 s << "\t.reg" << nvptx_ptx_type_from_mode (ret_mode, false)
677 << " %retval;\n";
678 }
679
680 return return_in_mem;
681 }
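
/* Sketch: for an SImode return value this writes "(.param.u32 %out_retval) "
   into a prototype, and ".reg.u32 %retval;" into a prologue, following the
   two branches above.  */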
682
683 /* Look for attributes in ATTRS that would indicate we must write a function
684 as a .entry kernel rather than a .func. Return true if one is found. */
685
686 static bool
687 write_as_kernel (tree attrs)
688 {
689 return (lookup_attribute ("kernel", attrs) != NULL_TREE
690 || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
691 }
692
693 /* Emit a linker marker for a function decl or defn. */
694
695 static void
696 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
697 const char *name)
698 {
699 s << "\n// BEGIN";
700 if (globalize)
701 s << " GLOBAL";
702 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
703 s << name << "\n";
704 }
705
706 /* Emit a linker marker for a variable decl or defn. */
707
708 static void
709 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
710 {
711 fprintf (file, "\n// BEGIN%s VAR %s: ",
712 globalize ? " GLOBAL" : "",
713 is_defn ? "DEF" : "DECL");
714 assemble_name_raw (file, name);
715 fputs ("\n", file);
716 }
717
718 /* Write a .func or .kernel declaration or definition along with
719 a helper comment for use by ld. S is the stream to write to, DECL
720 the decl for the function with name NAME. For definitions, emit
721 a declaration too. */
722
723 static const char *
724 write_fn_proto (std::stringstream &s, bool is_defn,
725 const char *name, const_tree decl)
726 {
727 if (is_defn)
728 /* Emit a declaration. The PTX assembler gets upset without it. */
729 name = write_fn_proto (s, false, name, decl);
730 else
731 {
732 /* Avoid repeating the name replacement. */
733 name = nvptx_name_replacement (name);
734 if (name[0] == '*')
735 name++;
736 }
737
738 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
739
740 /* PTX declaration. */
741 if (DECL_EXTERNAL (decl))
742 s << ".extern ";
743 else if (TREE_PUBLIC (decl))
744 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
745 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
746
747 tree fntype = TREE_TYPE (decl);
748 tree result_type = TREE_TYPE (fntype);
749
750 /* Declare the result. */
751 bool return_in_mem = write_return (s, true, result_type, VOIDmode);
752
753 s << name;
754
755 int argno = 0;
756
757 /* Emit argument list. */
758 if (return_in_mem)
759 argno = write_arg (s, -1, argno, ptr_type_node, true);
760
761 /* We get:
762 NULL in TYPE_ARG_TYPES, for old-style functions
763 NULL in DECL_ARGUMENTS, for builtin functions without another
764 declaration.
765 So we have to pick the best one we have. */
766 tree args = TYPE_ARG_TYPES (fntype);
767 bool prototyped = true;
768 if (!args)
769 {
770 args = DECL_ARGUMENTS (decl);
771 prototyped = false;
772 }
773
774 for (; args; args = TREE_CHAIN (args))
775 {
776 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
777
778 argno = write_arg (s, -1, argno, type, prototyped);
779 }
780
781 if (stdarg_p (fntype))
782 argno = write_arg (s, -1, argno, ptr_type_node, true);
783
784 if (DECL_STATIC_CHAIN (decl))
785 argno = write_arg (s, -1, argno, ptr_type_node, true);
786
787 if (!argno && strcmp (name, "main") == 0)
788 {
789 argno = write_arg (s, -1, argno, integer_type_node, true);
790 argno = write_arg (s, -1, argno, ptr_type_node, true);
791 }
792
793 if (argno)
794 s << ")";
795
796 s << (is_defn ? "\n" : ";\n");
797
798 return name;
799 }
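
/* As an illustration, a public prototyped "int foo (int)" would be declared
   roughly as
     .visible .func (.param.u32 %out_retval) foo (.param.u32 %in_ar0);
   preceded by its "// BEGIN GLOBAL FUNCTION DECL: foo" linker marker
   (a sketch; the exact text follows the helpers above).  */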
800
801 /* Construct a function declaration from a call insn. This can be
802 necessary for two reasons - either we have an indirect call which
803 requires a .callprototype declaration, or we have a libcall
804 generated by emit_library_call for which no decl exists. */
805
806 static void
807 write_fn_proto_from_insn (std::stringstream &s, const char *name,
808 rtx result, rtx pat)
809 {
810 if (!name)
811 {
812 s << "\t.callprototype ";
813 name = "_";
814 }
815 else
816 {
817 name = nvptx_name_replacement (name);
818 write_fn_marker (s, false, true, name);
819 s << "\t.extern .func ";
820 }
821
822 if (result != NULL_RTX)
823 s << "(.param"
824 << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)), false)
825 << " %rval) ";
826
827 s << name;
828
829 const char *sep = " (";
830 int arg_end = XVECLEN (pat, 0);
831 for (int i = 1; i < arg_end; i++)
832 {
833 /* We don't have to deal with mode splitting here, as that was
834 already done when generating the call sequence. */
835 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
836
837 s << sep
838 << ".param"
839 << nvptx_ptx_type_from_mode (mode, false)
840 << " %arg"
841 << i;
842 if (mode == QImode || mode == HImode)
843 s << "[1]";
844 sep = ", ";
845 }
846 if (arg_end != 1)
847 s << ")";
848 s << ";\n";
849 }
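
/* E.g. an indirect call returning SImode and taking one SImode argument
   would produce something like
     .callprototype (.param.u32 %rval) _ (.param.u32 %arg1);
   (illustrative; assembled from the pieces emitted above).  */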
850
851 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
852 table and write a ptx prototype. These are emitted at end of
853 compilation. */
854
855 static void
856 nvptx_record_fndecl (tree decl)
857 {
858 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
859 if (*slot == NULL)
860 {
861 *slot = decl;
862 const char *name = get_fnname_from_decl (decl);
863 write_fn_proto (func_decls, false, name, decl);
864 }
865 }
866
867 /* Record a libcall or unprototyped external function. CALLEE is the
868 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
869 declaration for it. */
870
871 static void
872 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
873 {
874 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
875 if (*slot == NULL)
876 {
877 *slot = callee;
878
879 const char *name = XSTR (callee, 0);
880 write_fn_proto_from_insn (func_decls, name, retval, pat);
881 }
882 }
883
884 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
885 is prototyped, record it now. Otherwise record it as needed at end
886 of compilation, when we might have more information about it. */
887
888 void
889 nvptx_record_needed_fndecl (tree decl)
890 {
891 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
892 {
893 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
894 if (*slot == NULL)
895 *slot = decl;
896 }
897 else
898 nvptx_record_fndecl (decl);
899 }
900
901 /* SYM is a SYMBOL_REF. If it refers to an external function, record
902 it as needed. */
903
904 static void
905 nvptx_maybe_record_fnsym (rtx sym)
906 {
907 tree decl = SYMBOL_REF_DECL (sym);
908
909 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
910 nvptx_record_needed_fndecl (decl);
911 }
912
913 /* Emit code to initialize the REGNO predicate register to indicate
914 whether we are not lane zero on the NAME axis. */
915
916 static void
917 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
918 {
919 fprintf (file, "\t{\n");
920 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
921 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
922 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
923 fprintf (file, "\t}\n");
924 }
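
/* For example, REGNO 20 on the "y" axis results in
     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r20, %y, 0;
     }
   (taken directly from the format strings above; the register number is
   made up).  */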
925
926 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
927 function, including local var decls and copies from the arguments to
928 local regs. */
929
930 void
931 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
932 {
933 tree fntype = TREE_TYPE (decl);
934 tree result_type = TREE_TYPE (fntype);
935 int argno = 0;
936
937 /* We construct the initial part of the function into a string
938 stream, in order to share the prototype writing code. */
939 std::stringstream s;
940 write_fn_proto (s, true, name, decl);
941 s << "{\n";
942
943 bool return_in_mem = write_return (s, false, result_type,
944 (machine_mode)cfun->machine->ret_reg_mode);
945 if (return_in_mem)
946 argno = write_arg (s, 0, argno, ptr_type_node, true);
947
948 /* Declare and initialize incoming arguments. */
949 tree args = TYPE_ARG_TYPES (fntype);
950 bool prototyped = true;
951 if (!args)
952 {
953 args = DECL_ARGUMENTS (decl);
954 prototyped = false;
955 }
956
957 for (; args != NULL_TREE; args = TREE_CHAIN (args))
958 {
959 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
960
961 argno = write_arg (s, 0, argno, type, prototyped);
962 }
963
964 if (stdarg_p (fntype))
965 argno = write_arg (s, ARG_POINTER_REGNUM, argno, ptr_type_node, true);
966
967 if (DECL_STATIC_CHAIN (decl))
968 argno = write_arg (s, STATIC_CHAIN_REGNUM, argno, ptr_type_node, true);
969
970 fprintf (file, "%s", s.str().c_str());
971
972 fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
973 reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);
974
975 /* Declare the pseudos we have as ptx registers. */
976 int maxregs = max_reg_num ();
977 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
978 {
979 if (regno_reg_rtx[i] != const0_rtx)
980 {
981 machine_mode mode = PSEUDO_REGNO_MODE (i);
982 machine_mode split = maybe_split_mode (mode);
983
984 if (split != VOIDmode)
985 mode = split;
986 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
987 output_reg (file, i, split, -2);
988 fprintf (file, ";\n");
989 }
990 }
991
992 /* The only reason we might be using outgoing args is if we call a stdargs
993 function. Allocate the space for this. If we called varargs functions
994 without passing any variadic arguments, we'll see a reference to outargs
995 even with a zero outgoing_args_size. */
996 HOST_WIDE_INT sz = crtl->outgoing_args_size;
997 if (sz == 0)
998 sz = 1;
999 if (cfun->machine->has_call_with_varargs)
1000 {
1001 fprintf (file, "\t.reg.u%d %%outargs;\n"
1002 "\t.local.align 8 .b8 %%outargs_ar["
1003 HOST_WIDE_INT_PRINT_DEC"];\n",
1004 BITS_PER_WORD, sz);
1005 fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
1006 BITS_PER_WORD);
1007 }
1008
1009 /* Declare a local variable for the frame. */
1010 sz = get_frame_size ();
1011 if (sz > 0 || cfun->machine->has_call_with_sc)
1012 {
1013 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1014
1015 fprintf (file, "\t.reg.u%d %%frame;\n"
1016 "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
1017 BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
1018 fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
1019 BITS_PER_WORD);
1020 }
1021
1022 /* Emit axis predicates. */
1023 if (cfun->machine->axis_predicate[0])
1024 nvptx_init_axis_predicate (file,
1025 REGNO (cfun->machine->axis_predicate[0]), "y");
1026 if (cfun->machine->axis_predicate[1])
1027 nvptx_init_axis_predicate (file,
1028 REGNO (cfun->machine->axis_predicate[1]), "x");
1029 }
1030
1031 /* Output a return instruction. Also copy the return value to its outgoing
1032 location. */
1033
1034 const char *
1035 nvptx_output_return (void)
1036 {
1037 machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;
1038
1039 if (mode != VOIDmode)
1040 {
1041 mode = arg_promotion (mode);
1042 fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
1043 nvptx_ptx_type_from_mode (mode, false));
1044 }
1045
1046 return "ret;";
1047 }
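
/* Sketch: for an SImode return value the effect is
     st.param.u32 [%out_retval], %retval;
     ret;
   the first line coming from the fprintf above, the second from the
   returned template.  */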
1048
1049 /* Terminate a function by writing a closing brace to FILE. */
1050
1051 void
1052 nvptx_function_end (FILE *file)
1053 {
1054 fprintf (file, "}\n");
1055 }
1056 \f
1057 /* Decide whether we can make a sibling call to a function. For ptx, we
1058 can't. */
1059
1060 static bool
1061 nvptx_function_ok_for_sibcall (tree, tree)
1062 {
1063 return false;
1064 }
1065
1066 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1067
1068 static rtx
1069 nvptx_get_drap_rtx (void)
1070 {
1071 return NULL_RTX;
1072 }
1073
1074 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1075 argument to the next call. */
1076
1077 static void
1078 nvptx_call_args (rtx arg, tree funtype)
1079 {
1080 if (cfun->machine->start_call == NULL_RTX)
1081 {
1082 cfun->machine->call_args = NULL;
1083 cfun->machine->funtype = funtype;
1084 cfun->machine->start_call = const0_rtx;
1085 }
1086 if (arg == pc_rtx)
1087 return;
1088
1089 rtx_expr_list *args_so_far = cfun->machine->call_args;
1090 if (REG_P (arg))
1091 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
1092 }
1093
1094 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1095 information we recorded. */
1096
1097 static void
1098 nvptx_end_call_args (void)
1099 {
1100 cfun->machine->start_call = NULL_RTX;
1101 free_EXPR_LIST_list (&cfun->machine->call_args);
1102 }
1103
1104 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1105 track of whether calls involving static chains or varargs were seen
1106 in the current function.
1107 For libcalls, maintain a hash table of decls we have seen, and
1108 record a function decl for later when encountering a new one. */
1109
1110 void
1111 nvptx_expand_call (rtx retval, rtx address)
1112 {
1113 int nargs = 0;
1114 rtx callee = XEXP (address, 0);
1115 rtx pat, t;
1116 rtvec vec;
1117 rtx varargs = NULL_RTX;
1118 unsigned parallel = 0;
1119
1120 for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
1121 nargs++;
1122
1123 if (!call_insn_operand (callee, Pmode))
1124 {
1125 callee = force_reg (Pmode, callee);
1126 address = change_address (address, QImode, callee);
1127 }
1128
1129 if (GET_CODE (callee) == SYMBOL_REF)
1130 {
1131 tree decl = SYMBOL_REF_DECL (callee);
1132 if (decl != NULL_TREE)
1133 {
1134 if (DECL_STATIC_CHAIN (decl))
1135 cfun->machine->has_call_with_sc = true;
1136
1137 tree attr = get_oacc_fn_attrib (decl);
1138 if (attr)
1139 {
1140 tree dims = TREE_VALUE (attr);
1141
1142 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1143 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1144 {
1145 if (TREE_PURPOSE (dims)
1146 && !integer_zerop (TREE_PURPOSE (dims)))
1147 break;
1148 /* Not on this axis. */
1149 parallel ^= GOMP_DIM_MASK (ix);
1150 dims = TREE_CHAIN (dims);
1151 }
1152 }
1153 }
1154 }
1155
1156 if (cfun->machine->funtype
1157 /* It's possible to construct testcases where we call a variable.
1158 See compile/20020129-1.c. stdarg_p will crash so avoid calling it
1159 in such a case. */
1160 && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
1161 || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
1162 && stdarg_p (cfun->machine->funtype))
1163 {
1164 varargs = gen_reg_rtx (Pmode);
1165 emit_move_insn (varargs, stack_pointer_rtx);
1166 cfun->machine->has_call_with_varargs = true;
1167 }
1168 vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
1169 pat = gen_rtx_PARALLEL (VOIDmode, vec);
1170
1171 int vec_pos = 0;
1172
1173 rtx tmp_retval = retval;
1174 t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1175 if (retval != NULL_RTX)
1176 {
1177 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1178 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1179 t = gen_rtx_SET (tmp_retval, t);
1180 }
1181 XVECEXP (pat, 0, vec_pos++) = t;
1182
1183 /* Construct the call insn, including a USE for each argument pseudo
1184 register. These will be used when printing the insn. */
1185 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1186 {
1187 rtx this_arg = XEXP (arg, 0);
1188 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
1189 }
1190
1191 if (varargs)
1192 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1193
1194 gcc_assert (vec_pos == XVECLEN (pat, 0));
1195
1196 nvptx_emit_forking (parallel, true);
1197 emit_call_insn (pat);
1198 nvptx_emit_joining (parallel, true);
1199
1200 if (tmp_retval != retval)
1201 emit_move_insn (retval, tmp_retval);
1202 }
1203 /* Emit a comparison COMPARE, and return the new test to be used in the
1204 jump. */
1205
1206 rtx
1207 nvptx_expand_compare (rtx compare)
1208 {
1209 rtx pred = gen_reg_rtx (BImode);
1210 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1211 XEXP (compare, 0), XEXP (compare, 1));
1212 emit_insn (gen_rtx_SET (pred, cmp));
1213 return gen_rtx_NE (BImode, pred, const0_rtx);
1214 }
1215
1216 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1217
1218 void
1219 nvptx_expand_oacc_fork (unsigned mode)
1220 {
1221 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1222 }
1223
1224 void
1225 nvptx_expand_oacc_join (unsigned mode)
1226 {
1227 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1228 }
1229
1230 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1231 objects. */
1232
1233 static rtx
1234 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1235 {
1236 rtx res;
1237
1238 switch (GET_MODE (src))
1239 {
1240 case DImode:
1241 res = gen_unpackdisi2 (dst0, dst1, src);
1242 break;
1243 case DFmode:
1244 res = gen_unpackdfsi2 (dst0, dst1, src);
1245 break;
1246 default: gcc_unreachable ();
1247 }
1248 return res;
1249 }
1250
1251 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1252 object. */
1253
1254 static rtx
1255 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1256 {
1257 rtx res;
1258
1259 switch (GET_MODE (dst))
1260 {
1261 case DImode:
1262 res = gen_packsidi2 (dst, src0, src1);
1263 break;
1264 case DFmode:
1265 res = gen_packsidf2 (dst, src0, src1);
1266 break;
1267 default: gcc_unreachable ();
1268 }
1269 return res;
1270 }
1271
1272 /* Generate an instruction or sequence to shuffle SRC into DST within
1273 a single warp, using shuffle kind KIND and lane index IDX. */
1274
1275 static rtx
1276 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1277 {
1278 rtx res;
1279
1280 switch (GET_MODE (dst))
1281 {
1282 case SImode:
1283 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1284 break;
1285 case SFmode:
1286 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1287 break;
1288 case DImode:
1289 case DFmode:
1290 {
1291 rtx tmp0 = gen_reg_rtx (SImode);
1292 rtx tmp1 = gen_reg_rtx (SImode);
1293
1294 start_sequence ();
1295 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1296 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1297 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1298 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1299 res = get_insns ();
1300 end_sequence ();
1301 }
1302 break;
1303 case BImode:
1304 {
1305 rtx tmp = gen_reg_rtx (SImode);
1306
1307 start_sequence ();
1308 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1309 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1310 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1311 res = get_insns ();
1312 end_sequence ();
1313 }
1314 break;
1315
1316 default:
1317 gcc_unreachable ();
1318 }
1319 return res;
1320 }
1321
1322 /* Generate an instruction or sequence to broadcast register REG
1323 across the vectors of a single warp. */
1324
1325 static rtx
1326 nvptx_gen_vcast (rtx reg)
1327 {
1328 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1329 }
1330
1331 /* Structure used when generating a worker-level spill or fill. */
1332
1333 struct wcast_data_t
1334 {
1335 rtx base; /* Register holding base addr of buffer. */
1336 rtx ptr; /* Iteration var, if needed. */
1337 unsigned offset; /* Offset into worker buffer. */
1338 };
1339
1340 /* Direction of the spill/fill and looping setup/teardown indicator. */
1341
1342 enum propagate_mask
1343 {
1344 PM_read = 1 << 0,
1345 PM_write = 1 << 1,
1346 PM_loop_begin = 1 << 2,
1347 PM_loop_end = 1 << 3,
1348
1349 PM_read_write = PM_read | PM_write
1350 };
1351
1352 /* Generate instruction(s) to spill or fill register REG to/from the
1353 worker broadcast array. PM indicates what is to be done, REP
1354 how many loop iterations will be executed (0 for not a loop). */
1355
1356 static rtx
1357 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1358 {
1359 rtx res;
1360 machine_mode mode = GET_MODE (reg);
1361
1362 switch (mode)
1363 {
1364 case BImode:
1365 {
1366 rtx tmp = gen_reg_rtx (SImode);
1367
1368 start_sequence ();
1369 if (pm & PM_read)
1370 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1371 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1372 if (pm & PM_write)
1373 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1374 res = get_insns ();
1375 end_sequence ();
1376 }
1377 break;
1378
1379 default:
1380 {
1381 rtx addr = data->ptr;
1382
1383 if (!addr)
1384 {
1385 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1386
1387 if (align > worker_bcast_align)
1388 worker_bcast_align = align;
1389 data->offset = (data->offset + align - 1) & ~(align - 1);
1390 addr = data->base;
1391 if (data->offset)
1392 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1393 }
1394
1395 addr = gen_rtx_MEM (mode, addr);
1396 addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
1397 if (pm == PM_read)
1398 res = gen_rtx_SET (addr, reg);
1399 else if (pm == PM_write)
1400 res = gen_rtx_SET (reg, addr);
1401 else
1402 gcc_unreachable ();
1403
1404 if (data->ptr)
1405 {
1406 /* We're using a ptr, increment it. */
1407 start_sequence ();
1408
1409 emit_insn (res);
1410 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1411 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1412 res = get_insns ();
1413 end_sequence ();
1414 }
1415 else
1416 rep = 1;
1417 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1418 }
1419 break;
1420 }
1421 return res;
1422 }
1423
1424 /* When loading an operand ORIG_OP, verify whether an address space
1425 conversion to generic is required, and if so, perform it. Check
1426 for SYMBOL_REFs and record them if needed. Return either the
1427 original operand, or the converted one. */
1428
1429 rtx
1430 nvptx_maybe_convert_symbolic_operand (rtx op)
1431 {
1432 if (GET_MODE (op) != Pmode)
1433 return op;
1434
1435 rtx sym = op;
1436 if (GET_CODE (sym) == CONST)
1437 sym = XEXP (sym, 0);
1438 if (GET_CODE (sym) == PLUS)
1439 sym = XEXP (sym, 0);
1440
1441 if (GET_CODE (sym) != SYMBOL_REF)
1442 return op;
1443
1444 nvptx_maybe_record_fnsym (sym);
1445
1446 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
1447 if (area == DATA_AREA_GENERIC)
1448 return op;
1449
1450 rtx dest = gen_reg_rtx (Pmode);
1451 emit_insn (gen_rtx_SET (dest,
1452 gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op),
1453 UNSPEC_TO_GENERIC)));
1454 return dest;
1455 }
1456 \f
1457 /* Returns true if X is a valid address for use in a memory reference. */
1458
1459 static bool
1460 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1461 {
1462 enum rtx_code code = GET_CODE (x);
1463
1464 switch (code)
1465 {
1466 case REG:
1467 return true;
1468
1469 case PLUS:
1470 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1471 return true;
1472 return false;
1473
1474 case CONST:
1475 case SYMBOL_REF:
1476 case LABEL_REF:
1477 return true;
1478
1479 default:
1480 return false;
1481 }
1482 }
1483
1484 /* Implement HARD_REGNO_MODE_OK. We barely use hard regs, but we want
1485 to ensure that the return register's mode isn't changed. */
1486
1487 bool
1488 nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
1489 {
1490 if (regno != NVPTX_RETURN_REGNUM
1491 || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
1492 return true;
1493 return mode == cfun->machine->ret_reg_mode;
1494 }
1495 \f
1496 /* Machinery to output constant initializers. When beginning an
1497 initializer, we decide on a fragment size (which is visible in ptx
1498 in the type used), and then all initializer data is buffered until
1499 a fragment is filled and ready to be written out. */
1500
1501 static struct
1502 {
1503 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1504 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1505 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1506 out. */
1507 unsigned size; /* Fragment size to accumulate. */
1508 unsigned offset; /* Offset within current fragment. */
1509 bool started; /* Whether we've output any initializer. */
1510 } init_frag;
1511
1512 /* The current fragment is full, write it out. SYM may provide a
1513 symbolic reference we should output, in which case the fragment
1514 value is the addend. */
1515
1516 static void
1517 output_init_frag (rtx sym)
1518 {
1519 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1520 unsigned HOST_WIDE_INT val = init_frag.val;
1521
1522 init_frag.started = true;
1523 init_frag.val = 0;
1524 init_frag.offset = 0;
1525 init_frag.remaining--;
1526
1527 if (sym)
1528 {
1529 fprintf (asm_out_file, "generic(");
1530 output_address (VOIDmode, sym);
1531 fprintf (asm_out_file, val ? ") + " : ")");
1532 }
1533
1534 if (!sym || val)
1535 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1536 }
1537
1538 /* Add value VAL of size SIZE to the data we're emitting, and keep
1539 writing out chunks as they fill up. */
1540
1541 static void
1542 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1543 {
1544 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1545
1546 for (unsigned part = 0; size; size -= part)
1547 {
1548 val >>= part * BITS_PER_UNIT;
1549 part = init_frag.size - init_frag.offset;
1550 if (part > size)
1551 part = size;
1552
1553 unsigned HOST_WIDE_INT partial
1554 = val << (init_frag.offset * BITS_PER_UNIT);
1555 init_frag.val |= partial & init_frag.mask;
1556 init_frag.offset += part;
1557
1558 if (init_frag.offset == init_frag.size)
1559 output_init_frag (NULL);
1560 }
1561 }
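
/* Worked example (illustrative): with a 4-byte fragment, emitting the two
   HImode values 0x1234 then 0x5678 packs them little-endian so that
   init_frag.val becomes 0x56781234; the second call fills the fragment and
   triggers output_init_frag.  */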
1562
1563 /* Target hook for assembling integer object X of size SIZE. */
1564
1565 static bool
1566 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1567 {
1568 HOST_WIDE_INT val = 0;
1569
1570 switch (GET_CODE (x))
1571 {
1572 default:
1573 gcc_unreachable ();
1574
1575 case CONST_INT:
1576 nvptx_assemble_value (INTVAL (x), size);
1577 break;
1578
1579 case CONST:
1580 x = XEXP (x, 0);
1581 gcc_assert (GET_CODE (x) == PLUS);
1582 val = INTVAL (XEXP (x, 1));
1583 x = XEXP (x, 0);
1584 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1585 /* FALLTHROUGH */
1586
1587 case SYMBOL_REF:
1588 gcc_assert (size == init_frag.size);
1589 if (init_frag.offset)
1590 sorry ("cannot emit unaligned pointers in ptx assembly");
1591
1592 nvptx_maybe_record_fnsym (x);
1593 init_frag.val = val;
1594 output_init_frag (x);
1595 break;
1596 }
1597
1598 return true;
1599 }
1600
1601 /* Output SIZE zero bytes. We ignore the FILE argument since the
1602 functions we're calling to perform the output just use
1603 asm_out_file. */
1604
1605 void
1606 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1607 {
1608 /* Finish the current fragment, if it's started. */
1609 if (init_frag.offset)
1610 {
1611 unsigned part = init_frag.size - init_frag.offset;
1612 if (part > size)
1613 part = (unsigned) size;
1614 size -= part;
1615 nvptx_assemble_value (0, part);
1616 }
1617
1618 /* If this skip doesn't terminate the initializer, write as many
1619 remaining pieces as possible directly. */
1620 if (size < init_frag.remaining * init_frag.size)
1621 {
1622 while (size >= init_frag.size)
1623 {
1624 size -= init_frag.size;
1625 output_init_frag (NULL_RTX);
1626 }
1627 if (size)
1628 nvptx_assemble_value (0, size);
1629 }
1630 }
1631
1632 /* Output a string STR with length SIZE. As in nvptx_output_skip we
1633 ignore the FILE arg. */
1634
1635 void
1636 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1637 {
1638 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1639 nvptx_assemble_value (str[i], 1);
1640 }
1641
1642 /* Emit a PTX variable decl and prepare for emission of its
1643 initializer. NAME is the symbol name and SECTION the PTX data
1644 area. The type is TYPE, object size SIZE and alignment is ALIGN.
1645 The caller has already emitted any indentation and linkage
1646 specifier. It is responsible for any initializer, terminating ;
1647 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
1648 this is the opposite way round from how PTX wants them! */
1649
1650 static void
1651 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
1652 const_tree type, HOST_WIDE_INT size, unsigned align)
1653 {
1654 while (TREE_CODE (type) == ARRAY_TYPE)
1655 type = TREE_TYPE (type);
1656
1657 if (TREE_CODE (type) == VECTOR_TYPE
1658 || TREE_CODE (type) == COMPLEX_TYPE)
1659 /* Neither vector nor complex types can contain the other. */
1660 type = TREE_TYPE (type);
1661
1662 unsigned elt_size = int_size_in_bytes (type);
1663
1664 /* Largest mode we're prepared to accept. For BLKmode types we
1665 don't know if it'll contain pointer constants, so have to choose
1666 pointer size, otherwise we can choose DImode. */
1667 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
1668
1669 elt_size |= GET_MODE_SIZE (elt_mode);
1670 elt_size &= -elt_size; /* Extract LSB set. */
1671
1672 init_frag.size = elt_size;
1673 /* Avoid undefined shift behaviour by using '2'. */
1674 init_frag.mask = ((unsigned HOST_WIDE_INT)2
1675 << (elt_size * BITS_PER_UNIT - 1)) - 1;
1676 init_frag.val = 0;
1677 init_frag.offset = 0;
1678 init_frag.started = false;
1679 /* Size might not be a multiple of elt size, if there's an
1680 initialized trailing struct array with smaller type than
1681 elt_size. */
1682 init_frag.remaining = (size + elt_size - 1) / elt_size;
1683
1684 fprintf (file, "%s .align %d .u%d ",
1685 section, align / BITS_PER_UNIT,
1686 elt_size * BITS_PER_UNIT);
1687 assemble_name (file, name);
1688
1689 if (size)
1690 /* We make everything an array, to simplify any initialization
1691 emission. */
1692 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
1693 }
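
/* Sketch: a hypothetical global "int arr[4]" (4-byte elements, 4-byte
   alignment) would begin as
     .global .align 4 .u32 arr[4]
   with init_frag prepared for four 4-byte fragments, per the size and
   alignment computation above.  */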
1694
1695 /* Called when the initializer for a decl has been completely output through
1696 combinations of the three functions above. */
1697
1698 static void
1699 nvptx_assemble_decl_end (void)
1700 {
1701 if (init_frag.offset)
1702 /* This can happen with a packed struct with trailing array member. */
1703 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
1704 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
1705 }
1706
1707 /* Output an uninitialized common or file-scope variable. */
1708
1709 void
1710 nvptx_output_aligned_decl (FILE *file, const char *name,
1711 const_tree decl, HOST_WIDE_INT size, unsigned align)
1712 {
1713 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1714
1715 /* If this is public, it is common. The nearest thing we have to
1716 common is weak. */
1717 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
1718
1719 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1720 TREE_TYPE (decl), size, align);
1721 nvptx_assemble_decl_end ();
1722 }
1723
1724 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1725 writing a constant variable EXP with NAME and SIZE and its
1726 initializer to FILE. */
1727
1728 static void
1729 nvptx_asm_declare_constant_name (FILE *file, const char *name,
1730 const_tree exp, HOST_WIDE_INT obj_size)
1731 {
1732 write_var_marker (file, true, false, name);
1733
1734 fprintf (file, "\t");
1735
1736 tree type = TREE_TYPE (exp);
1737 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
1738 TYPE_ALIGN (type));
1739 }
1740
1741 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1742 a variable DECL with NAME to FILE. */
1743
1744 void
1745 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
1746 {
1747 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1748
1749 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
1750 : DECL_WEAK (decl) ? ".weak " : ".visible "));
1751
1752 tree type = TREE_TYPE (decl);
1753 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
1754 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1755 type, obj_size, DECL_ALIGN (decl));
1756 }
1757
1758 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
1759
1760 static void
1761 nvptx_globalize_label (FILE *, const char *)
1762 {
1763 }
1764
1765 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1766 declaration only for variable DECL with NAME to FILE. */
1767
1768 static void
1769 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1770 {
1771 write_var_marker (file, false, TREE_PUBLIC (decl), name);
1772
1773 fprintf (file, "\t.extern ");
1774 tree size = DECL_SIZE_UNIT (decl);
1775 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1776 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
1777 DECL_ALIGN (decl));
1778 fprintf (file, ";\n");
1779 }
1780
1781 /* Output a pattern for a move instruction. */
1782
1783 const char *
1784 nvptx_output_mov_insn (rtx dst, rtx src)
1785 {
1786 machine_mode dst_mode = GET_MODE (dst);
1787 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
1788 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
1789 machine_mode src_inner = (GET_CODE (src) == SUBREG
1790 ? GET_MODE (XEXP (src, 0)) : dst_mode);
1791
1792 if (REG_P (dst) && REGNO (dst) == NVPTX_RETURN_REGNUM && dst_mode == HImode)
1793 /* Special handling for the return register. It's never really an
1794 HI object, and only occurs as the destination of a move
1795 insn. */
1796 dst_inner = SImode;
1797
1798 if (src_inner == dst_inner)
1799 return "%.\tmov%t0\t%0, %1;";
1800
1801 if (CONSTANT_P (src))
1802 return (GET_MODE_CLASS (dst_inner) == MODE_INT
1803 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
1804 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
1805
1806 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
1807 return "%.\tmov.b%T0\t%0, %1;";
1808
1809 return "%.\tcvt%t0%t1\t%0, %1;";
1810 }
1811
1812 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
1813 involves writing .param declarations and in/out copies into them. For
1814 indirect calls, also write the .callprototype. */
1815
1816 const char *
1817 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1818 {
1819 char buf[16];
1820 static int labelno;
1821 bool needs_tgt = register_operand (callee, Pmode);
1822 rtx pat = PATTERN (insn);
1823 int arg_end = XVECLEN (pat, 0);
1824 tree decl = NULL_TREE;
1825
1826 fprintf (asm_out_file, "\t{\n");
1827 if (result != NULL)
1828 fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
1829 nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
1830 false));
1831
1832 /* Ensure we have a ptx declaration in the output if necessary. */
1833 if (GET_CODE (callee) == SYMBOL_REF)
1834 {
1835 decl = SYMBOL_REF_DECL (callee);
1836 if (!decl
1837 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
1838 nvptx_record_libfunc (callee, result, pat);
1839 else if (DECL_EXTERNAL (decl))
1840 nvptx_record_fndecl (decl);
1841 }
1842
1843 if (needs_tgt)
1844 {
1845 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1846 labelno++;
1847 ASM_OUTPUT_LABEL (asm_out_file, buf);
1848 std::stringstream s;
1849 write_fn_proto_from_insn (s, NULL, result, pat);
1850 fputs (s.str().c_str(), asm_out_file);
1851 }
1852
1853 for (int argno = 1; argno < arg_end; argno++)
1854 {
1855 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
1856 machine_mode mode = GET_MODE (t);
1857
1858 /* Mode splitting has already been done. */
1859 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
1860 nvptx_ptx_type_from_mode (mode, false), argno,
1861 mode == QImode || mode == HImode ? "[1]" : "");
1862 fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
1863 nvptx_ptx_type_from_mode (mode, false), argno,
1864 REGNO (t));
1865 }
1866
1867 fprintf (asm_out_file, "\t\tcall ");
1868 if (result != NULL_RTX)
1869 fprintf (asm_out_file, "(%%retval_in), ");
1870
1871 if (decl)
1872 {
1873 const char *name = get_fnname_from_decl (decl);
1874 name = nvptx_name_replacement (name);
1875 assemble_name (asm_out_file, name);
1876 }
1877 else
1878 output_address (VOIDmode, callee);
1879
1880 const char *open = "(";
1881 for (int argno = 1; argno < arg_end; argno++)
1882 {
1883 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
1884 open = "";
1885 }
1886 if (decl && DECL_STATIC_CHAIN (decl))
1887 {
1888 fprintf (asm_out_file, ", %s%s", open,
1889 reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);
1890 open = "";
1891 }
1892 if (!open[0])
1893 fprintf (asm_out_file, ")");
1894
1895 if (needs_tgt)
1896 {
1897 fprintf (asm_out_file, ", ");
1898 assemble_name (asm_out_file, buf);
1899 }
1900 fprintf (asm_out_file, ";\n");
1901
1902 if (find_reg_note (insn, REG_NORETURN, NULL))
1903 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
1904 the flow control barrier they imply. It can seg fault if it
1905 encounters what looks like an unexitable loop. Emit a trailing
1906 trap, which it does grok. */
1907 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
1908
1909 return result != NULL_RTX ? "\tld.param%t0\t%0, [%%retval_in];\n\t}" : "}";
1910 }
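
/* Illustrative shape of the emitted sequence for "r = foo (x)" with SImode
   values (names and register numbers are made up):
     {
       .param.u32 %retval_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r23;
       call (%retval_in), foo, (%out_arg1);
       ld.param.u32 %r24, [%retval_in];
     }
   following the fprintf calls and the returned template above.  */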
1911
1912 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1913
1914 static bool
1915 nvptx_print_operand_punct_valid_p (unsigned char c)
1916 {
1917 return c == '.' || c== '#';
1918 }
1919
1920 static void nvptx_print_operand (FILE *, rtx, int);
1921
1922 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1923
1924 static void
1925 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1926 {
1927 rtx off;
1928 if (GET_CODE (x) == CONST)
1929 x = XEXP (x, 0);
1930 switch (GET_CODE (x))
1931 {
1932 case PLUS:
1933 off = XEXP (x, 1);
1934 output_address (VOIDmode, XEXP (x, 0));
1935 fprintf (file, "+");
1936 output_address (VOIDmode, off);
1937 break;
1938
1939 case SYMBOL_REF:
1940 case LABEL_REF:
1941 output_addr_const (file, x);
1942 break;
1943
1944 default:
1945 gcc_assert (GET_CODE (x) != MEM);
1946 nvptx_print_operand (file, x, 0);
1947 break;
1948 }
1949 }
1950
1951 /* Write assembly language output for the address ADDR to FILE. */
1952
1953 static void
1954 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1955 {
1956 nvptx_print_address_operand (file, addr, mode);
1957 }
1958
1959 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1960
1961 Meaning of CODE:
1962 . -- print the predicate for the instruction or an empty string for an
1963 unconditional one.
1964 # -- print a rounding mode for the instruction
1965
1966 A -- print a data area for a MEM
1967 c -- print an opcode suffix for a comparison operator, including a type code
1968 D -- print a data area for a MEM operand
1969 S -- print a shuffle kind specified by CONST_INT
1970 t -- print a type opcode suffix, promoting QImode to 32 bits
1971 T -- print a type size in bits
1972 u -- print a type opcode suffix without promotions. */
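/* The code below also handles the internal 'j' and 'J' codes, which
   print a '@' or '@!' predication prefix and then fall through to the
   default operand printing. For illustration (assuming the usual
   mapping in nvptx_ptx_type_from_mode): for an SImode register
   operand, "%t0" and "%u0" both print ".u32" and "%T0" prints "32",
   while for a QImode operand "%t0" prints the promoted ".u32" and
   "%u0" prints ".u8".  */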
1973
1974 static void
1975 nvptx_print_operand (FILE *file, rtx x, int code)
1976 {
1977 if (code == '.')
1978 {
1979 x = current_insn_predicate;
1980 if (x)
1981 {
1982 unsigned int regno = REGNO (XEXP (x, 0));
1983 fputs ("[", file);
1984 if (GET_CODE (x) == EQ)
1985 fputs ("!", file);
1986 fputs (reg_names [regno], file);
1987 fputs ("]", file);
1988 }
1989 return;
1990 }
1991 else if (code == '#')
1992 {
1993 fputs (".rn", file);
1994 return;
1995 }
1996
1997 enum rtx_code x_code = GET_CODE (x);
1998 machine_mode mode = GET_MODE (x);
1999
2000 switch (code)
2001 {
2002 case 'A':
2003 x = XEXP (x, 0);
2004 /* FALLTHROUGH. */
2005
2006 case 'D':
2007 if (GET_CODE (x) == CONST)
2008 x = XEXP (x, 0);
2009 if (GET_CODE (x) == PLUS)
2010 x = XEXP (x, 0);
2011
2012 if (GET_CODE (x) == SYMBOL_REF)
2013 fputs (section_for_sym (x), file);
2014 break;
2015
2016 case 't':
2017 case 'u':
2018 if (x_code == SUBREG)
2019 {
2020 mode = GET_MODE (SUBREG_REG (x));
2021 if (mode == TImode)
2022 mode = DImode;
2023 else if (COMPLEX_MODE_P (mode))
2024 mode = GET_MODE_INNER (mode);
2025 }
2026 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2027 break;
2028
2029 case 'S':
2030 {
2031 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2032 /* Same order as nvptx_shuffle_kind. */
2033 static const char *const kinds[] =
2034 {".up", ".down", ".bfly", ".idx"};
2035 fputs (kinds[kind], file);
2036 }
2037 break;
2038
2039 case 'T':
2040 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2041 break;
2042
2043 case 'j':
2044 fprintf (file, "@");
2045 goto common;
2046
2047 case 'J':
2048 fprintf (file, "@!");
2049 goto common;
2050
2051 case 'c':
2052 mode = GET_MODE (XEXP (x, 0));
2053 switch (x_code)
2054 {
2055 case EQ:
2056 fputs (".eq", file);
2057 break;
2058 case NE:
2059 if (FLOAT_MODE_P (mode))
2060 fputs (".neu", file);
2061 else
2062 fputs (".ne", file);
2063 break;
2064 case LE:
2065 fputs (".le", file);
2066 break;
2067 case GE:
2068 fputs (".ge", file);
2069 break;
2070 case LT:
2071 fputs (".lt", file);
2072 break;
2073 case GT:
2074 fputs (".gt", file);
2075 break;
2076 case LEU:
2077 fputs (".ls", file);
2078 break;
2079 case GEU:
2080 fputs (".hs", file);
2081 break;
2082 case LTU:
2083 fputs (".lo", file);
2084 break;
2085 case GTU:
2086 fputs (".hi", file);
2087 break;
2088 case LTGT:
2089 fputs (".ne", file);
2090 break;
2091 case UNEQ:
2092 fputs (".equ", file);
2093 break;
2094 case UNLE:
2095 fputs (".leu", file);
2096 break;
2097 case UNGE:
2098 fputs (".geu", file);
2099 break;
2100 case UNLT:
2101 fputs (".ltu", file);
2102 break;
2103 case UNGT:
2104 fputs (".gtu", file);
2105 break;
2106 case UNORDERED:
2107 fputs (".nan", file);
2108 break;
2109 case ORDERED:
2110 fputs (".num", file);
2111 break;
2112 default:
2113 gcc_unreachable ();
2114 }
2115 if (FLOAT_MODE_P (mode)
2116 || x_code == EQ || x_code == NE
2117 || x_code == GEU || x_code == GTU
2118 || x_code == LEU || x_code == LTU)
2119 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2120 else
2121 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2122 break;
2123 default:
2124 common:
2125 switch (x_code)
2126 {
2127 case SUBREG:
2128 {
2129 rtx inner_x = SUBREG_REG (x);
2130 machine_mode inner_mode = GET_MODE (inner_x);
2131 machine_mode split = maybe_split_mode (inner_mode);
2132
2133 if (split != VOIDmode
2134 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2135 output_reg (file, REGNO (inner_x), split);
2136 else
2137 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2138 }
2139 break;
2140
2141 case REG:
2142 output_reg (file, REGNO (x), maybe_split_mode (mode));
2143 break;
2144
2145 case MEM:
2146 fputc ('[', file);
2147 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2148 fputc (']', file);
2149 break;
2150
2151 case CONST_INT:
2152 output_addr_const (file, x);
2153 break;
2154
2155 case CONST:
2156 case SYMBOL_REF:
2157 case LABEL_REF:
2158 /* We could use output_addr_const, but that can print things like
2159 "x-8", which breaks ptxas. Need to ensure it is output as
2160 "x+-8". */
2161 nvptx_print_address_operand (file, x, VOIDmode);
2162 break;
2163
2164 case CONST_DOUBLE:
2165 long vals[2];
2166 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2167 vals[0] &= 0xffffffff;
2168 vals[1] &= 0xffffffff;
2169 if (mode == SFmode)
2170 fprintf (file, "0f%08lx", vals[0]);
2171 else
2172 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2173 break;
2174
2175 default:
2176 output_addr_const (file, x);
2177 }
2178 }
2179 }
2180 \f
2181 /* Record replacement regs used to deal with subreg operands. */
2182 struct reg_replace
2183 {
2184 rtx replacement[MAX_RECOG_OPERANDS];
2185 machine_mode mode;
2186 int n_allocated;
2187 int n_in_use;
2188 };
2189
2190 /* Allocate or reuse a replacement in R and return the rtx. */
2191
2192 static rtx
2193 get_replacement (struct reg_replace *r)
2194 {
2195 if (r->n_allocated == r->n_in_use)
2196 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2197 return r->replacement[r->n_in_use++];
2198 }
2199
2200 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2201 the presence of subregs would break the rules for most instructions.
2202 Replace them with a suitable new register of the right size, plus
2203 conversion copyin/copyout instructions. */
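/* An illustrative sketch, with hypothetical register numbers: an input
   operand (subreg:SI (reg:QI 23) 0) is replaced by a fresh SImode
   register, say (reg:SI 99), and a copy
	(set (reg:SI 99) (zero_extend:SI (reg:QI 23)))
   is emitted before the insn; for an output operand the copy runs the
   other way, after the insn, using truncate.  */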
2204
2205 static void
2206 nvptx_reorg_subreg (void)
2207 {
2208 struct reg_replace qiregs, hiregs, siregs, diregs;
2209 rtx_insn *insn, *next;
2210
2211 qiregs.n_allocated = 0;
2212 hiregs.n_allocated = 0;
2213 siregs.n_allocated = 0;
2214 diregs.n_allocated = 0;
2215 qiregs.mode = QImode;
2216 hiregs.mode = HImode;
2217 siregs.mode = SImode;
2218 diregs.mode = DImode;
2219
2220 for (insn = get_insns (); insn; insn = next)
2221 {
2222 next = NEXT_INSN (insn);
2223 if (!NONDEBUG_INSN_P (insn)
2224 || asm_noperands (PATTERN (insn)) >= 0
2225 || GET_CODE (PATTERN (insn)) == USE
2226 || GET_CODE (PATTERN (insn)) == CLOBBER)
2227 continue;
2228
2229 qiregs.n_in_use = 0;
2230 hiregs.n_in_use = 0;
2231 siregs.n_in_use = 0;
2232 diregs.n_in_use = 0;
2233 extract_insn (insn);
2234 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2235
2236 for (int i = 0; i < recog_data.n_operands; i++)
2237 {
2238 rtx op = recog_data.operand[i];
2239 if (GET_CODE (op) != SUBREG)
2240 continue;
2241
2242 rtx inner = SUBREG_REG (op);
2243
2244 machine_mode outer_mode = GET_MODE (op);
2245 machine_mode inner_mode = GET_MODE (inner);
2246 gcc_assert (s_ok);
2247 if (s_ok
2248 && (GET_MODE_PRECISION (inner_mode)
2249 >= GET_MODE_PRECISION (outer_mode)))
2250 continue;
2251 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2252 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2253 : outer_mode == HImode ? &hiregs
2254 : outer_mode == SImode ? &siregs
2255 : &diregs);
2256 rtx new_reg = get_replacement (r);
2257
2258 if (recog_data.operand_type[i] != OP_OUT)
2259 {
2260 enum rtx_code code;
2261 if (GET_MODE_PRECISION (inner_mode)
2262 < GET_MODE_PRECISION (outer_mode))
2263 code = ZERO_EXTEND;
2264 else
2265 code = TRUNCATE;
2266
2267 rtx pat = gen_rtx_SET (new_reg,
2268 gen_rtx_fmt_e (code, outer_mode, inner));
2269 emit_insn_before (pat, insn);
2270 }
2271
2272 if (recog_data.operand_type[i] != OP_IN)
2273 {
2274 enum rtx_code code;
2275 if (GET_MODE_PRECISION (inner_mode)
2276 < GET_MODE_PRECISION (outer_mode))
2277 code = TRUNCATE;
2278 else
2279 code = ZERO_EXTEND;
2280
2281 rtx pat = gen_rtx_SET (inner,
2282 gen_rtx_fmt_e (code, inner_mode, new_reg));
2283 emit_insn_after (pat, insn);
2284 }
2285 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2286 }
2287 }
2288 }
2289
2290 /* Loop structure of the function. The entire function is described as
2291 a NULL loop. */
2292
2293 struct parallel
2294 {
2295 /* Parent parallel. */
2296 parallel *parent;
2297
2298 /* Next sibling parallel. */
2299 parallel *next;
2300
2301 /* First child parallel. */
2302 parallel *inner;
2303
2304 /* Partitioning mask of the parallel. */
2305 unsigned mask;
2306
2307 /* Partitioning used within inner parallels. */
2308 unsigned inner_mask;
2309
2310 /* Location of parallel forked and join. The forked is the first
2311 block in the parallel and the join is the first block after
2312 the partition. */
2313 basic_block forked_block;
2314 basic_block join_block;
2315
2316 rtx_insn *forked_insn;
2317 rtx_insn *join_insn;
2318
2319 rtx_insn *fork_insn;
2320 rtx_insn *joining_insn;
2321
2322 /* Basic blocks in this parallel, but not in child parallels. The
2323 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2324 blocks are not. */
2325 auto_vec<basic_block> blocks;
2326
2327 public:
2328 parallel (parallel *parent, unsigned mode);
2329 ~parallel ();
2330 };
2331
2332 /* Constructor links the new parallel into its parent's chain of
2333 children. */
2334
2335 parallel::parallel (parallel *parent_, unsigned mask_)
2336 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2337 {
2338 forked_block = join_block = 0;
2339 forked_insn = join_insn = 0;
2340 fork_insn = joining_insn = 0;
2341
2342 if (parent)
2343 {
2344 next = parent->inner;
2345 parent->inner = this;
2346 }
2347 }
2348
2349 parallel::~parallel ()
2350 {
2351 delete inner;
2352 delete next;
2353 }
2354
2355 /* Map of basic blocks to insns */
2356 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2357
2358 /* A tuple of an insn of interest and the BB in which it resides. */
2359 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2360 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2361
2362 /* Split basic blocks such that each forked and join unspec is at
2363 the start of its basic block, so that afterwards each block will
2364 have a single partitioning mode. We do the same for return
2365 insns, as they need executing by every thread, but the block
2366 they are in probably does not. Populate MAP with head and tail
2367 blocks. We also clear the BB visited flag, which is used when
2368 finding partitions. */
2369
2370 static void
2371 nvptx_split_blocks (bb_insn_map_t *map)
2372 {
2373 insn_bb_vec_t worklist;
2374 basic_block block;
2375 rtx_insn *insn;
2376
2377 /* Locate all the reorg instructions of interest. */
2378 FOR_ALL_BB_FN (block, cfun)
2379 {
2380 bool seen_insn = false;
2381
2382 /* Clear visited flag, for use by parallel locator */
2383 block->flags &= ~BB_VISITED;
2384
2385 FOR_BB_INSNS (block, insn)
2386 {
2387 if (!INSN_P (insn))
2388 continue;
2389 switch (recog_memoized (insn))
2390 {
2391 default:
2392 seen_insn = true;
2393 continue;
2394 case CODE_FOR_nvptx_forked:
2395 case CODE_FOR_nvptx_join:
2396 break;
2397
2398 case CODE_FOR_return:
2399 /* We also need to split just before return insns, as
2400 that insn needs executing by all threads, but the
2401 block it is in probably does not. */
2402 break;
2403 }
2404
2405 if (seen_insn)
2406 /* We've found an instruction that must be at the start of
2407 a block, but isn't. Add it to the worklist. */
2408 worklist.safe_push (insn_bb_t (insn, block));
2409 else
2410 /* It was already the first instruction. Just add it to
2411 the map. */
2412 map->get_or_insert (block) = insn;
2413 seen_insn = true;
2414 }
2415 }
2416
2417 /* Split blocks on the worklist. */
2418 unsigned ix;
2419 insn_bb_t *elt;
2420 basic_block remap = 0;
2421 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2422 {
2423 if (remap != elt->second)
2424 {
2425 block = elt->second;
2426 remap = block;
2427 }
2428
2429 /* Split block before insn. The insn is in the new block */
2430 edge e = split_block (block, PREV_INSN (elt->first));
2431
2432 block = e->dest;
2433 map->get_or_insert (block) = elt->first;
2434 }
2435 }
2436
2437 /* BLOCK is a basic block containing a head or tail instruction.
2438 Locate the associated prehead or pretail instruction, which must be
2439 in the single predecessor block. */
2440
2441 static rtx_insn *
2442 nvptx_discover_pre (basic_block block, int expected)
2443 {
2444 gcc_assert (block->preds->length () == 1);
2445 basic_block pre_block = (*block->preds)[0]->src;
2446 rtx_insn *pre_insn;
2447
2448 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2449 pre_insn = PREV_INSN (pre_insn))
2450 gcc_assert (pre_insn != BB_HEAD (pre_block));
2451
2452 gcc_assert (recog_memoized (pre_insn) == expected);
2453 return pre_insn;
2454 }
2455
2456 /* Dump this parallel and all its inner parallels. */
2457
2458 static void
2459 nvptx_dump_pars (parallel *par, unsigned depth)
2460 {
2461 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2462 depth, par->mask,
2463 par->forked_block ? par->forked_block->index : -1,
2464 par->join_block ? par->join_block->index : -1);
2465
2466 fprintf (dump_file, " blocks:");
2467
2468 basic_block block;
2469 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2470 fprintf (dump_file, " %d", block->index);
2471 fprintf (dump_file, "\n");
2472 if (par->inner)
2473 nvptx_dump_pars (par->inner, depth + 1);
2474
2475 if (par->next)
2476 nvptx_dump_pars (par->next, depth);
2477 }
2478
2479 /* If BLOCK contains a fork/join marker, process it to create or
2480 terminate a loop structure. Add this block to the current loop,
2481 and then walk successor blocks. */
2482
2483 static parallel *
2484 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2485 {
2486 if (block->flags & BB_VISITED)
2487 return par;
2488 block->flags |= BB_VISITED;
2489
2490 if (rtx_insn **endp = map->get (block))
2491 {
2492 rtx_insn *end = *endp;
2493
2494 /* This is a block head or tail, or return instruction. */
2495 switch (recog_memoized (end))
2496 {
2497 case CODE_FOR_return:
2498 /* Return instructions are in their own block, and we
2499 don't need to do anything more. */
2500 return par;
2501
2502 case CODE_FOR_nvptx_forked:
2503 /* Loop head, create a new inner loop and add it into
2504 our parent's child list. */
2505 {
2506 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2507
2508 gcc_assert (mask);
2509 par = new parallel (par, mask);
2510 par->forked_block = block;
2511 par->forked_insn = end;
2512 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2513 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2514 par->fork_insn
2515 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2516 }
2517 break;
2518
2519 case CODE_FOR_nvptx_join:
2520 /* A loop tail. Finish the current loop and return to
2521 parent. */
2522 {
2523 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2524
2525 gcc_assert (par->mask == mask);
2526 par->join_block = block;
2527 par->join_insn = end;
2528 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2529 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2530 par->joining_insn
2531 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2532 par = par->parent;
2533 }
2534 break;
2535
2536 default:
2537 gcc_unreachable ();
2538 }
2539 }
2540
2541 if (par)
2542 /* Add this block onto the current loop's list of blocks. */
2543 par->blocks.safe_push (block);
2544 else
2545 /* This must be the entry block. Create a NULL parallel. */
2546 par = new parallel (0, 0);
2547
2548 /* Walk successor blocks. */
2549 edge e;
2550 edge_iterator ei;
2551
2552 FOR_EACH_EDGE (e, ei, block->succs)
2553 nvptx_find_par (map, par, e->dest);
2554
2555 return par;
2556 }
2557
2558 /* DFS walk the CFG looking for fork & join markers. Construct
2559 loop structures as we go. MAP is a mapping of basic blocks
2560 to head & tail markers, discovered when splitting blocks. This
2561 speeds up the discovery. We rely on the BB visited flag having
2562 been cleared when splitting blocks. */
2563
2564 static parallel *
2565 nvptx_discover_pars (bb_insn_map_t *map)
2566 {
2567 basic_block block;
2568
2569 /* Mark exit blocks as visited. */
2570 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2571 block->flags |= BB_VISITED;
2572
2573 /* And entry block as not. */
2574 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2575 block->flags &= ~BB_VISITED;
2576
2577 parallel *par = nvptx_find_par (map, 0, block);
2578
2579 if (dump_file)
2580 {
2581 fprintf (dump_file, "\nLoops\n");
2582 nvptx_dump_pars (par, 0);
2583 fprintf (dump_file, "\n");
2584 }
2585
2586 return par;
2587 }
2588
2589 /* Analyse a group of BBs within a partitioned region and create N
2590 Single-Entry-Single-Exit regions. Some of those regions will be
2591 trivial ones consisting of a single BB. The blocks of a
2592 partitioned region might form a set of disjoint graphs -- because
2593 the region encloses a differently partitioned sub-region.
2594
2595 We use the linear time algorithm described in 'Finding Regions Fast:
2596 Single Entry Single Exit and Control Regions in Linear Time'
2597 Johnson, Pearson & Pingali. That algorithm deals with complete
2598 CFGs, where a back edge is inserted from END to START, and thus the
2599 problem becomes one of finding equivalent loops.
2600
2601 In this case we have a partial CFG. We complete it by redirecting
2602 any incoming edge to the graph to be from an arbitrary external BB,
2603 and similarly redirecting any outgoing edge to be to that BB.
2604 Thus we end up with a closed graph.
2605
2606 The algorithm works by building a spanning tree of an undirected
2607 graph and keeping track of back edges from nodes further from the
2608 root in the tree to nodes nearer to the root in the tree. In the
2609 description below, the root is up and the tree grows downwards.
2610
2611 We avoid having to deal with degenerate back-edges to the same
2612 block, by splitting each BB into 3 -- one for input edges, one for
2613 the node itself and one for the output edges. Such back edges are
2614 referred to as 'Brackets'. Cycle equivalent nodes will have the
2615 same set of brackets.
2616
2617 Determining bracket equivalency is done by maintaining a list of
2618 brackets in such a manner that the list length and final bracket
2619 uniquely identify the set.
2620
2621 We use coloring to mark all BBs with cycle equivalency with the
2622 same color. This is the output of the 'Finding Regions Fast'
2623 algorithm. Notice it doesn't actually find the set of nodes within
2624 a particular region, just unordered sets of nodes that are the
2625 entries and exits of SESE regions.
2626
2627 After determining cycle equivalency, we need to find the minimal
2628 set of SESE regions. Do this with a DFS coloring walk of the
2629 complete graph. We're either 'looking' or 'coloring'. When
2630 looking, and we're in the subgraph, we start coloring the color of
2631 the current node, and remember that node as the start of the
2632 current color's SESE region. Every time we go to a new node, we
2633 decrement the count of nodes with that color. If it reaches zero,
2634 we remember that node as the end of the current color's SESE region
2635 and return to 'looking'. Otherwise we color the node the current
2636 color.
2637
2638 This way we end up with coloring the inside of non-trivial SESE
2639 regions with the color of that region. */
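/* As a rough example: for a diamond A -> {B, C} -> D, completed with
   the back edge from exit to entry, A and D are cycle equivalent and
   delimit the single non-trivial SESE region, while B and C each end
   up as trivial single-block regions.  */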
2640
2641 /* A pair of BBs. We use this to represent SESE regions. */
2642 typedef std::pair<basic_block, basic_block> bb_pair_t;
2643 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2644
2645 /* A node in the undirected CFG. The discriminator SECOND indicates just
2646 above or just below the BB indicated by FIRST. */
2647 typedef std::pair<basic_block, int> pseudo_node_t;
2648
2649 /* A bracket indicates an edge towards the root of the spanning tree of the
2650 undirected graph. Each bracket has a color, determined
2651 from the current set of brackets. */
2652 struct bracket
2653 {
2654 pseudo_node_t back; /* Back target */
2655
2656 /* Current color and size of set. */
2657 unsigned color;
2658 unsigned size;
2659
2660 bracket (pseudo_node_t back_)
2661 : back (back_), color (~0u), size (~0u)
2662 {
2663 }
2664
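  /* A bracket set is identified by its length together with its
     topmost bracket, so allocate a fresh color the first time this
     bracket is topmost of a list of the given length; otherwise reuse
     the cached color. Either way, bump that color's population
     count.  */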
2665 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2666 {
2667 if (length != size)
2668 {
2669 size = length;
2670 color = color_counts.length ();
2671 color_counts.quick_push (0);
2672 }
2673 color_counts[color]++;
2674 return color;
2675 }
2676 };
2677
2678 typedef auto_vec<bracket> bracket_vec_t;
2679
2680 /* Basic block info for finding SESE regions. */
2681
2682 struct bb_sese
2683 {
2684 int node; /* Node number in spanning tree. */
2685 int parent; /* Parent node number. */
2686
2687 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2688 edges arrive at pseudo-node Ai and the outgoing edges leave at
2689 pseudo-node Ao. We have to remember which way we arrived at a
2690 particular node when generating the spanning tree. dir > 0 means
2691 we arrived at Ai, dir < 0 means we arrived at Ao. */
2692 int dir;
2693
2694 /* Lowest numbered pseudo-node reached via a backedge from this
2695 node, or any descendant. */
2696 pseudo_node_t high;
2697
2698 int color; /* Cycle-equivalence color */
2699
2700 /* Stack of brackets for this node. */
2701 bracket_vec_t brackets;
2702
2703 bb_sese (unsigned node_, unsigned p, int dir_)
2704 :node (node_), parent (p), dir (dir_)
2705 {
2706 }
2707 ~bb_sese ();
2708
2709 /* Push a bracket ending at BACK. */
2710 void push (const pseudo_node_t &back)
2711 {
2712 if (dump_file)
2713 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2714 back.first ? back.first->index : 0, back.second);
2715 brackets.safe_push (bracket (back));
2716 }
2717
2718 void append (bb_sese *child);
2719 void remove (const pseudo_node_t &);
2720
2721 /* Set node's color. */
2722 void set_color (auto_vec<unsigned> &color_counts)
2723 {
2724 color = brackets.last ().get_color (color_counts, brackets.length ());
2725 }
2726 };
2727
2728 bb_sese::~bb_sese ()
2729 {
2730 }
2731
2732 /* Destructively append CHILD's brackets. */
2733
2734 void
2735 bb_sese::append (bb_sese *child)
2736 {
2737 if (int len = child->brackets.length ())
2738 {
2739 int ix;
2740
2741 if (dump_file)
2742 {
2743 for (ix = 0; ix < len; ix++)
2744 {
2745 const pseudo_node_t &pseudo = child->brackets[ix].back;
2746 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2747 child->node, pseudo.first ? pseudo.first->index : 0,
2748 pseudo.second);
2749 }
2750 }
2751 if (!brackets.length ())
2752 std::swap (brackets, child->brackets);
2753 else
2754 {
2755 brackets.reserve (len);
2756 for (ix = 0; ix < len; ix++)
2757 brackets.quick_push (child->brackets[ix]);
2758 }
2759 }
2760 }
2761
2762 /* Remove brackets that terminate at PSEUDO. */
2763
2764 void
2765 bb_sese::remove (const pseudo_node_t &pseudo)
2766 {
2767 unsigned removed = 0;
2768 int len = brackets.length ();
2769
2770 for (int ix = 0; ix < len; ix++)
2771 {
2772 if (brackets[ix].back == pseudo)
2773 {
2774 if (dump_file)
2775 fprintf (dump_file, "Removing backedge %d:%+d\n",
2776 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2777 removed++;
2778 }
2779 else if (removed)
2780 brackets[ix-removed] = brackets[ix];
2781 }
2782 while (removed--)
2783 brackets.pop ();
2784 }
2785
2786 /* Accessors for BB's aux pointer. */
2787 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2788 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2789
2790 /* DFS walk creating SESE data structures. Only cover nodes with
2791 BB_VISITED set. Append discovered blocks to LIST. We number in
2792 increments of 3 so that the above and below pseudo nodes can be
2793 implicitly numbered too. */
2794
2795 static int
2796 nvptx_sese_number (int n, int p, int dir, basic_block b,
2797 auto_vec<basic_block> *list)
2798 {
2799 if (BB_GET_SESE (b))
2800 return n;
2801
2802 if (dump_file)
2803 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2804 b->index, n, p, dir);
2805
2806 BB_SET_SESE (b, new bb_sese (n, p, dir));
2807 p = n;
2808
2809 n += 3;
2810 list->quick_push (b);
2811
2812 /* First walk the nodes on the 'other side' of this node, then walk
2813 the nodes on the same side. */
2814 for (unsigned ix = 2; ix; ix--)
2815 {
2816 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2817 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2818 : offsetof (edge_def, src));
2819 edge e;
2820 edge_iterator (ei);
2821
2822 FOR_EACH_EDGE (e, ei, edges)
2823 {
2824 basic_block target = *(basic_block *)((char *)e + offset);
2825
2826 if (target->flags & BB_VISITED)
2827 n = nvptx_sese_number (n, p, dir, target, list);
2828 }
2829 dir = -dir;
2830 }
2831 return n;
2832 }
2833
2834 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2835 EDGES are the outgoing edges and OFFSET is the offset to the src
2836 or dst block on the edges. */
2837
2838 static void
2839 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2840 vec<edge, va_gc> *edges, size_t offset)
2841 {
2842 edge e;
2843 edge_iterator (ei);
2844 int hi_back = depth;
2845 pseudo_node_t node_back (0, depth);
2846 int hi_child = depth;
2847 pseudo_node_t node_child (0, depth);
2848 basic_block child = NULL;
2849 unsigned num_children = 0;
2850 int usd = -dir * sese->dir;
2851
2852 if (dump_file)
2853 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2854 me->index, sese->node, dir);
2855
2856 if (dir < 0)
2857 {
2858 /* This is the above pseudo-child. It has the BB itself as an
2859 additional child node. */
2860 node_child = sese->high;
2861 hi_child = node_child.second;
2862 if (node_child.first)
2863 hi_child += BB_GET_SESE (node_child.first)->node;
2864 num_children++;
2865 }
2866
2867 /* Examine each edge.
2868 - if it is a child (a) append its bracket list and (b) record
2869 whether it is the child with the highest reaching bracket.
2870 - if it is an edge to ancestor, record whether it's the highest
2871 reaching backlink. */
2872 FOR_EACH_EDGE (e, ei, edges)
2873 {
2874 basic_block target = *(basic_block *)((char *)e + offset);
2875
2876 if (bb_sese *t_sese = BB_GET_SESE (target))
2877 {
2878 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2879 {
2880 /* Child node. Append its bracket list. */
2881 num_children++;
2882 sese->append (t_sese);
2883
2884 /* Compare its hi value. */
2885 int t_hi = t_sese->high.second;
2886
2887 if (basic_block child_hi_block = t_sese->high.first)
2888 t_hi += BB_GET_SESE (child_hi_block)->node;
2889
2890 if (hi_child > t_hi)
2891 {
2892 hi_child = t_hi;
2893 node_child = t_sese->high;
2894 child = target;
2895 }
2896 }
2897 else if (t_sese->node < sese->node + dir
2898 && !(dir < 0 && sese->parent == t_sese->node))
2899 {
2900 /* Non-parental ancestor node -- a backlink. */
2901 int d = usd * t_sese->dir;
2902 int back = t_sese->node + d;
2903
2904 if (hi_back > back)
2905 {
2906 hi_back = back;
2907 node_back = pseudo_node_t (target, d);
2908 }
2909 }
2910 }
2911 else
2912 { /* Fallen off graph, backlink to entry node. */
2913 hi_back = 0;
2914 node_back = pseudo_node_t (0, 0);
2915 }
2916 }
2917
2918 /* Remove any brackets that terminate at this pseudo node. */
2919 sese->remove (pseudo_node_t (me, dir));
2920
2921 /* Now push any backlinks from this pseudo node. */
2922 FOR_EACH_EDGE (e, ei, edges)
2923 {
2924 basic_block target = *(basic_block *)((char *)e + offset);
2925 if (bb_sese *t_sese = BB_GET_SESE (target))
2926 {
2927 if (t_sese->node < sese->node + dir
2928 && !(dir < 0 && sese->parent == t_sese->node))
2929 /* Non-parental ancestor node - backedge from me. */
2930 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2931 }
2932 else
2933 {
2934 /* back edge to entry node */
2935 sese->push (pseudo_node_t (0, 0));
2936 }
2937 }
2938
2939 /* If this node leads directly or indirectly to a no-return region of
2940 the graph, then fake a backedge to entry node. */
2941 if (!sese->brackets.length () || !edges || !edges->length ())
2942 {
2943 hi_back = 0;
2944 node_back = pseudo_node_t (0, 0);
2945 sese->push (node_back);
2946 }
2947
2948 /* Record the highest reaching backedge from us or a descendant. */
2949 sese->high = hi_back < hi_child ? node_back : node_child;
2950
2951 if (num_children > 1)
2952 {
2953 /* There is more than one child -- this is a Y shaped piece of
2954 spanning tree. We have to insert a fake backedge from this
2955 node to the highest ancestor reached by not-the-highest
2956 reaching child. Note that there may be multiple children
2957 with backedges to the same highest node. That's ok and we
2958 insert the edge to that highest node. */
2959 hi_child = depth;
2960 if (dir < 0 && child)
2961 {
2962 node_child = sese->high;
2963 hi_child = node_child.second;
2964 if (node_child.first)
2965 hi_child += BB_GET_SESE (node_child.first)->node;
2966 }
2967
2968 FOR_EACH_EDGE (e, ei, edges)
2969 {
2970 basic_block target = *(basic_block *)((char *)e + offset);
2971
2972 if (target == child)
2973 /* Ignore the highest child. */
2974 continue;
2975
2976 bb_sese *t_sese = BB_GET_SESE (target);
2977 if (!t_sese)
2978 continue;
2979 if (t_sese->parent != sese->node)
2980 /* Not a child. */
2981 continue;
2982
2983 /* Compare its hi value. */
2984 int t_hi = t_sese->high.second;
2985
2986 if (basic_block child_hi_block = t_sese->high.first)
2987 t_hi += BB_GET_SESE (child_hi_block)->node;
2988
2989 if (hi_child > t_hi)
2990 {
2991 hi_child = t_hi;
2992 node_child = t_sese->high;
2993 }
2994 }
2995
2996 sese->push (node_child);
2997 }
2998 }
2999
3000
3001 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3002 proceed to successors. Set SESE entry and exit nodes of
3003 REGIONS. */
3004
3005 static void
3006 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3007 basic_block block, int coloring)
3008 {
3009 bb_sese *sese = BB_GET_SESE (block);
3010
3011 if (block->flags & BB_VISITED)
3012 {
3013 /* If we've already encountered this block, either we must not
3014 be coloring, or it must have been colored the current color. */
3015 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3016 return;
3017 }
3018
3019 block->flags |= BB_VISITED;
3020
3021 if (sese)
3022 {
3023 if (coloring < 0)
3024 {
3025 /* Start coloring a region. */
3026 regions[sese->color].first = block;
3027 coloring = sese->color;
3028 }
3029
3030 if (!--color_counts[sese->color] && sese->color == coloring)
3031 {
3032 /* Found final block of SESE region. */
3033 regions[sese->color].second = block;
3034 coloring = -1;
3035 }
3036 else
3037 /* Color the node, so we can assert on revisiting the node
3038 that the graph is indeed SESE. */
3039 sese->color = coloring;
3040 }
3041 else
3042 /* Fallen off the subgraph, we cannot be coloring. */
3043 gcc_assert (coloring < 0);
3044
3045 /* Walk each successor block. */
3046 if (block->succs && block->succs->length ())
3047 {
3048 edge e;
3049 edge_iterator ei;
3050
3051 FOR_EACH_EDGE (e, ei, block->succs)
3052 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3053 }
3054 else
3055 gcc_assert (coloring < 0);
3056 }
3057
3058 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3059 end up with NULL entries in it. */
3060
3061 static void
3062 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3063 {
3064 basic_block block;
3065 int ix;
3066
3067 /* First clear each BB of the whole function. */
3068 FOR_EACH_BB_FN (block, cfun)
3069 {
3070 block->flags &= ~BB_VISITED;
3071 BB_SET_SESE (block, 0);
3072 }
3073 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3074 block->flags &= ~BB_VISITED;
3075 BB_SET_SESE (block, 0);
3076 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3077 block->flags &= ~BB_VISITED;
3078 BB_SET_SESE (block, 0);
3079
3080 /* Mark blocks in the function that are in this graph. */
3081 for (ix = 0; blocks.iterate (ix, &block); ix++)
3082 block->flags |= BB_VISITED;
3083
3084 /* Counts of nodes assigned to each color. There cannot be more
3085 colors than blocks (and hopefully there will be fewer). */
3086 auto_vec<unsigned> color_counts;
3087 color_counts.reserve (blocks.length ());
3088
3089 /* Worklist of nodes in the spanning tree. Again, there cannot be
3090 more nodes in the tree than blocks (there will be fewer if the
3091 CFG of blocks is disjoint). */
3092 auto_vec<basic_block> spanlist;
3093 spanlist.reserve (blocks.length ());
3094
3095 /* Make sure every block has its cycle class determined. */
3096 for (ix = 0; blocks.iterate (ix, &block); ix++)
3097 {
3098 if (BB_GET_SESE (block))
3099 /* We already met this block in an earlier graph solve. */
3100 continue;
3101
3102 if (dump_file)
3103 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3104
3105 /* Number the nodes reachable from block in initial DFS order. */
3106 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3107
3108 /* Now walk in reverse DFS order to find cycle equivalents. */
3109 while (spanlist.length ())
3110 {
3111 block = spanlist.pop ();
3112 bb_sese *sese = BB_GET_SESE (block);
3113
3114 /* Do the pseudo node below. */
3115 nvptx_sese_pseudo (block, sese, depth, +1,
3116 sese->dir > 0 ? block->succs : block->preds,
3117 (sese->dir > 0 ? offsetof (edge_def, dest)
3118 : offsetof (edge_def, src)));
3119 sese->set_color (color_counts);
3120 /* Do the pseudo node above. */
3121 nvptx_sese_pseudo (block, sese, depth, -1,
3122 sese->dir < 0 ? block->succs : block->preds,
3123 (sese->dir < 0 ? offsetof (edge_def, dest)
3124 : offsetof (edge_def, src)));
3125 }
3126 if (dump_file)
3127 fprintf (dump_file, "\n");
3128 }
3129
3130 if (dump_file)
3131 {
3132 unsigned count;
3133 const char *comma = "";
3134
3135 fprintf (dump_file, "Found %d cycle equivalents\n",
3136 color_counts.length ());
3137 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3138 {
3139 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3140
3141 comma = "";
3142 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3143 if (BB_GET_SESE (block)->color == ix)
3144 {
3145 block->flags |= BB_VISITED;
3146 fprintf (dump_file, "%s%d", comma, block->index);
3147 comma=",";
3148 }
3149 fprintf (dump_file, "}");
3150 comma = ", ";
3151 }
3152 fprintf (dump_file, "\n");
3153 }
3154
3155 /* Now we've colored every block in the subgraph. We now need to
3156 determine the minimal set of SESE regions that cover that
3157 subgraph. Do this with a DFS walk of the complete function.
3158 During the walk we're either 'looking' or 'coloring'. When we
3159 reach the last node of a particular color, we stop coloring and
3160 return to looking. */
3161
3162 /* There cannot be more SESE regions than colors. */
3163 regions.reserve (color_counts.length ());
3164 for (ix = color_counts.length (); ix--;)
3165 regions.quick_push (bb_pair_t (0, 0));
3166
3167 for (ix = 0; blocks.iterate (ix, &block); ix++)
3168 block->flags &= ~BB_VISITED;
3169
3170 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3171
3172 if (dump_file)
3173 {
3174 const char *comma = "";
3175 int len = regions.length ();
3176
3177 fprintf (dump_file, "SESE regions:");
3178 for (ix = 0; ix != len; ix++)
3179 {
3180 basic_block from = regions[ix].first;
3181 basic_block to = regions[ix].second;
3182
3183 if (from)
3184 {
3185 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3186 if (to != from)
3187 fprintf (dump_file, "->%d", to->index);
3188
3189 int color = BB_GET_SESE (from)->color;
3190
3191 /* Print the blocks within the region (excluding ends). */
3192 FOR_EACH_BB_FN (block, cfun)
3193 {
3194 bb_sese *sese = BB_GET_SESE (block);
3195
3196 if (sese && sese->color == color
3197 && block != from && block != to)
3198 fprintf (dump_file, ".%d", block->index);
3199 }
3200 fprintf (dump_file, "}");
3201 }
3202 comma = ",";
3203 }
3204 fprintf (dump_file, "\n\n");
3205 }
3206
3207 for (ix = 0; blocks.iterate (ix, &block); ix++)
3208 delete BB_GET_SESE (block);
3209 }
3210
3211 #undef BB_SET_SESE
3212 #undef BB_GET_SESE
3213
3214 /* Propagate live state at the start of a partitioned region. BLOCK
3215 provides the live register information, and might not contain
3216 INSN. Propagation is inserted just after INSN. RW indicates
3217 whether we are reading and/or writing state. This separation is
3218 needed for worker-level propagation where we essentially do a
3219 spill & fill. FN is the underlying worker function to generate
3220 the propagation instructions for a single register. DATA is
3221 user data.
3222
3223 We propagate the live register set and the entire frame. We could
3224 do better by (a) propagating just the live set that is used within
3225 the partitioned regions and (b) only propagating stack entries that
3226 are used. The latter might be quite hard to determine. */
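/* A rough sketch of the frame-copy sequence built below (assuming a
   multi-word frame, so the copy loop is not elided; FN also receives
   PM_loop_begin and PM_loop_end calls around it):

	ptr = frame_pointer
	idx = number of DImode words in the frame
    loop:
	idx -= 1
	tmp = [ptr]		(if reading)
	FN (tmp, ...)		(the per-datum propagation)
	[ptr] = tmp		(if writing)
	pred = idx != 0
	ptr += 8
	branch to loop if pred

   followed by one FN call per live pseudo register.  */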
3227
3228 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3229
3230 static void
3231 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3232 propagator_fn fn, void *data)
3233 {
3234 bitmap live = DF_LIVE_IN (block);
3235 bitmap_iterator iterator;
3236 unsigned ix;
3237
3238 /* Copy the frame array. */
3239 HOST_WIDE_INT fs = get_frame_size ();
3240 if (fs)
3241 {
3242 rtx tmp = gen_reg_rtx (DImode);
3243 rtx idx = NULL_RTX;
3244 rtx ptr = gen_reg_rtx (Pmode);
3245 rtx pred = NULL_RTX;
3246 rtx_code_label *label = NULL;
3247
3248 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3249 fs /= GET_MODE_SIZE (DImode);
3250 /* Detect single iteration loop. */
3251 if (fs == 1)
3252 fs = 0;
3253
3254 start_sequence ();
3255 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3256 if (fs)
3257 {
3258 idx = gen_reg_rtx (SImode);
3259 pred = gen_reg_rtx (BImode);
3260 label = gen_label_rtx ();
3261
3262 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3263 /* Allow worker function to initialize anything needed. */
3264 rtx init = fn (tmp, PM_loop_begin, fs, data);
3265 if (init)
3266 emit_insn (init);
3267 emit_label (label);
3268 LABEL_NUSES (label)++;
3269 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3270 }
3271 if (rw & PM_read)
3272 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3273 emit_insn (fn (tmp, rw, fs, data));
3274 if (rw & PM_write)
3275 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3276 if (fs)
3277 {
3278 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3279 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3280 emit_insn (gen_br_true_uni (pred, label));
3281 rtx fini = fn (tmp, PM_loop_end, fs, data);
3282 if (fini)
3283 emit_insn (fini);
3284 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3285 }
3286 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3287 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3288 rtx cpy = get_insns ();
3289 end_sequence ();
3290 insn = emit_insn_after (cpy, insn);
3291 }
3292
3293 /* Copy live registers. */
3294 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3295 {
3296 rtx reg = regno_reg_rtx[ix];
3297
3298 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3299 {
3300 rtx bcast = fn (reg, rw, 0, data);
3301
3302 insn = emit_insn_after (bcast, insn);
3303 }
3304 }
3305 }
3306
3307 /* Worker for nvptx_vpropagate. */
3308
3309 static rtx
3310 vprop_gen (rtx reg, propagate_mask pm,
3311 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3312 {
3313 if (!(pm & PM_read_write))
3314 return 0;
3315
3316 return nvptx_gen_vcast (reg);
3317 }
3318
3319 /* Propagate state that is live at start of BLOCK across the vectors
3320 of a single warp. Propagation is inserted just after INSN. */
3321
3322 static void
3323 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3324 {
3325 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3326 }
3327
3328 /* Worker for nvptx_wpropagate. */
3329
3330 static rtx
3331 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3332 {
3333 wcast_data_t *data = (wcast_data_t *)data_;
3334
3335 if (pm & PM_loop_begin)
3336 {
3337 /* Starting a loop, initialize pointer. */
3338 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3339
3340 if (align > worker_bcast_align)
3341 worker_bcast_align = align;
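	  /* Round the accumulated offset up to ALIGN; e.g. an offset of
	     5 with an alignment of 4 becomes 8.  */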
3342 data->offset = (data->offset + align - 1) & ~(align - 1);
3343
3344 data->ptr = gen_reg_rtx (Pmode);
3345
3346 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3347 }
3348 else if (pm & PM_loop_end)
3349 {
3350 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3351 data->ptr = NULL_RTX;
3352 return clobber;
3353 }
3354 else
3355 return nvptx_gen_wcast (reg, pm, rep, data);
3356 }
3357
3358 /* Spill or fill the state that is live at the start of BLOCK. PRE_P
3359 indicates if this is just before partitioned mode (do spill), or
3360 just after it starts (do fill). Sequence is inserted just after
3361 INSN. */
3362
3363 static void
3364 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3365 {
3366 wcast_data_t data;
3367
3368 data.base = gen_reg_rtx (Pmode);
3369 data.offset = 0;
3370 data.ptr = NULL_RTX;
3371
3372 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3373 if (data.offset)
3374 {
3375 /* Stuff was emitted, initialize the base pointer now. */
3376 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3377 emit_insn_after (init, insn);
3378
3379 if (worker_bcast_size < data.offset)
3380 worker_bcast_size = data.offset;
3381 }
3382 }
3383
3384 /* Emit a worker-level synchronization barrier. We use different
3385 markers for before and after synchronizations. */
3386
3387 static rtx
3388 nvptx_wsync (bool after)
3389 {
3390 return gen_nvptx_barsync (GEN_INT (after));
3391 }
3392
3393 /* Single neutering according to MASK. FROM is the incoming block and
3394 TO is the outgoing block. These may be the same block. Insert at
3395 start of FROM:
3396
3397 if (tid.<axis>) goto end.
3398
3399 and insert before ending branch of TO (if there is such an insn):
3400
3401 end:
3402 <possibly-broadcast-cond>
3403 <branch>
3404
3405 We currently only use different FROM and TO when skipping an entire
3406 loop. We could do more if we detected superblocks. */
3407
3408 static void
3409 nvptx_single (unsigned mask, basic_block from, basic_block to)
3410 {
3411 rtx_insn *head = BB_HEAD (from);
3412 rtx_insn *tail = BB_END (to);
3413 unsigned skip_mask = mask;
3414
3415 /* Find first insn of from block */
3416 while (head != BB_END (from) && !INSN_P (head))
3417 head = NEXT_INSN (head);
3418
3419 /* Find last insn of to block */
3420 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3421 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3422 tail = PREV_INSN (tail);
3423
3424 /* Detect if tail is a branch. */
3425 rtx tail_branch = NULL_RTX;
3426 rtx cond_branch = NULL_RTX;
3427 if (tail && INSN_P (tail))
3428 {
3429 tail_branch = PATTERN (tail);
3430 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3431 tail_branch = NULL_RTX;
3432 else
3433 {
3434 cond_branch = SET_SRC (tail_branch);
3435 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3436 cond_branch = NULL_RTX;
3437 }
3438 }
3439
3440 if (tail == head)
3441 {
3442 /* If this is empty, do nothing. */
3443 if (!head || !INSN_P (head))
3444 return;
3445
3446 /* If this is a dummy insn, do nothing. */
3447 switch (recog_memoized (head))
3448 {
3449 default:
3450 break;
3451 case CODE_FOR_nvptx_fork:
3452 case CODE_FOR_nvptx_forked:
3453 case CODE_FOR_nvptx_joining:
3454 case CODE_FOR_nvptx_join:
3455 return;
3456 }
3457
3458 if (cond_branch)
3459 {
3460 /* If we're only doing vector single, there's no need to
3461 emit skip code because we'll not insert anything. */
3462 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3463 skip_mask = 0;
3464 }
3465 else if (tail_branch)
3466 /* Block with only unconditional branch. Nothing to do. */
3467 return;
3468 }
3469
3470 /* Insert the vector test inside the worker test. */
3471 unsigned mode;
3472 rtx_insn *before = tail;
3473 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3474 if (GOMP_DIM_MASK (mode) & skip_mask)
3475 {
3476 rtx_code_label *label = gen_label_rtx ();
3477 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3478
3479 if (!pred)
3480 {
3481 pred = gen_reg_rtx (BImode);
3482 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3483 }
3484
3485 rtx br;
3486 if (mode == GOMP_DIM_VECTOR)
3487 br = gen_br_true (pred, label);
3488 else
3489 br = gen_br_true_uni (pred, label);
3490 emit_insn_before (br, head);
3491
3492 LABEL_NUSES (label)++;
3493 if (tail_branch)
3494 before = emit_label_before (label, before);
3495 else
3496 emit_label_after (label, tail);
3497 }
3498
3499 /* Now deal with propagating the branch condition. */
3500 if (cond_branch)
3501 {
3502 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3503
3504 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3505 {
3506 /* Vector mode only, do a shuffle. */
3507 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3508 }
3509 else
3510 {
3511 /* Includes worker mode, do spill & fill. By construction
3512 we should never have worker mode only. */
3513 wcast_data_t data;
3514
3515 data.base = worker_bcast_sym;
3516 data.ptr = 0;
3517
3518 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3519 worker_bcast_size = GET_MODE_SIZE (SImode);
3520
3521 data.offset = 0;
3522 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3523 before);
3524 /* Barrier so other workers can see the write. */
3525 emit_insn_before (nvptx_wsync (false), tail);
3526 data.offset = 0;
3527 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3528 /* This barrier is needed to avoid worker zero clobbering
3529 the broadcast buffer before all the other workers have
3530 had a chance to read this instance of it. */
3531 emit_insn_before (nvptx_wsync (true), tail);
3532 }
3533
3534 extract_insn (tail);
3535 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3536 UNSPEC_BR_UNIFIED);
3537 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3538 }
3539 }
3540
3541 /* PAR is a parallel that is being skipped in its entirety according to
3542 MASK. Treat this as skipping a superblock starting at forked
3543 and ending at joining. */
3544
3545 static void
3546 nvptx_skip_par (unsigned mask, parallel *par)
3547 {
3548 basic_block tail = par->join_block;
3549 gcc_assert (tail->preds->length () == 1);
3550
3551 basic_block pre_tail = (*tail->preds)[0]->src;
3552 gcc_assert (pre_tail->succs->length () == 1);
3553
3554 nvptx_single (mask, par->forked_block, pre_tail);
3555 }
3556
3557 /* If PAR has a single inner parallel and PAR itself only contains
3558 empty entry and exit blocks, swallow the inner PAR. */
3559
3560 static void
3561 nvptx_optimize_inner (parallel *par)
3562 {
3563 parallel *inner = par->inner;
3564
3565 /* We mustn't be the outer dummy par. */
3566 if (!par->mask)
3567 return;
3568
3569 /* We must have a single inner par. */
3570 if (!inner || inner->next)
3571 return;
3572
3573 /* We must only contain 2 blocks ourselves -- the head and tail of
3574 the inner par. */
3575 if (par->blocks.length () != 2)
3576 return;
3577
3578 /* The two partitionings must be disjoint. As we only have vector and
3579 worker partitioning, this is sufficient to guarantee the pars
3580 have adjacent partitioning. */
3581 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3582 /* This indicates malformed code generation. */
3583 return;
3584
3585 /* The outer forked insn should be immediately followed by the inner
3586 fork insn. */
3587 rtx_insn *forked = par->forked_insn;
3588 rtx_insn *fork = BB_END (par->forked_block);
3589
3590 if (NEXT_INSN (forked) != fork)
3591 return;
3592 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3593
3594 /* The outer joining insn must immediately follow the inner join
3595 insn. */
3596 rtx_insn *joining = par->joining_insn;
3597 rtx_insn *join = inner->join_insn;
3598 if (NEXT_INSN (join) != joining)
3599 return;
3600
3601 /* Preconditions met. Swallow the inner par. */
3602 if (dump_file)
3603 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3604 inner->mask, inner->forked_block->index,
3605 inner->join_block->index,
3606 par->mask, par->forked_block->index, par->join_block->index);
3607
3608 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3609
3610 par->blocks.reserve (inner->blocks.length ());
3611 while (inner->blocks.length ())
3612 par->blocks.quick_push (inner->blocks.pop ());
3613
3614 par->inner = inner->inner;
3615 inner->inner = NULL;
3616
3617 delete inner;
3618 }
3619
3620 /* Process the parallel PAR and all its contained
3621 parallels. We do everything but the neutering. Return mask of
3622 partitioned modes used within this parallel. */
3623
3624 static unsigned
3625 nvptx_process_pars (parallel *par)
3626 {
3627 if (nvptx_optimize)
3628 nvptx_optimize_inner (par);
3629
3630 unsigned inner_mask = par->mask;
3631
3632 /* Do the inner parallels first. */
3633 if (par->inner)
3634 {
3635 par->inner_mask = nvptx_process_pars (par->inner);
3636 inner_mask |= par->inner_mask;
3637 }
3638
3639 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3640 /* No propagation needed for a call. */;
3641 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3642 {
3643 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3644 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3645 /* Insert begin and end synchronizations. */
3646 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3647 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3648 }
3649 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3650 nvptx_vpropagate (par->forked_block, par->forked_insn);
3651
3652 /* Now do siblings. */
3653 if (par->next)
3654 inner_mask |= nvptx_process_pars (par->next);
3655 return inner_mask;
3656 }
3657
3658 /* Neuter the parallel described by PAR. We recurse in depth-first
3659 order. MODES are the partitioning of the execution and OUTER is
3660 the partitioning of the parallels we are contained in. */
3661
3662 static void
3663 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3664 {
3665 unsigned me = (par->mask
3666 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3667 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3668 unsigned skip_mask = 0, neuter_mask = 0;
3669
3670 if (par->inner)
3671 nvptx_neuter_pars (par->inner, modes, outer | me);
3672
3673 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3674 {
3675 if ((outer | me) & GOMP_DIM_MASK (mode))
3676 {} /* Mode is partitioned: no neutering. */
3677 else if (!(modes & GOMP_DIM_MASK (mode)))
3678 {} /* Mode is not used: nothing to do. */
3679 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3680 || !par->forked_insn)
3681 /* Partitioned in inner parallels, or we're not partitioned
3682 at all: neuter individual blocks. */
3683 neuter_mask |= GOMP_DIM_MASK (mode);
3684 else if (!par->parent || !par->parent->forked_insn
3685 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3686 /* Parent isn't a parallel, or already contains this partitioning:
3687 skip the parallel at this level. */
3688 skip_mask |= GOMP_DIM_MASK (mode);
3689 else
3690 {} /* Parent will skip this parallel itself. */
3691 }
3692
3693 if (neuter_mask)
3694 {
3695 int ix, len;
3696
3697 if (nvptx_optimize)
3698 {
3699 /* Neuter whole SESE regions. */
3700 bb_pair_vec_t regions;
3701
3702 nvptx_find_sese (par->blocks, regions);
3703 len = regions.length ();
3704 for (ix = 0; ix != len; ix++)
3705 {
3706 basic_block from = regions[ix].first;
3707 basic_block to = regions[ix].second;
3708
3709 if (from)
3710 nvptx_single (neuter_mask, from, to);
3711 else
3712 gcc_assert (!to);
3713 }
3714 }
3715 else
3716 {
3717 /* Neuter each BB individually. */
3718 len = par->blocks.length ();
3719 for (ix = 0; ix != len; ix++)
3720 {
3721 basic_block block = par->blocks[ix];
3722
3723 nvptx_single (neuter_mask, block, block);
3724 }
3725 }
3726 }
3727
3728 if (skip_mask)
3729 nvptx_skip_par (skip_mask, par);
3730
3731 if (par->next)
3732 nvptx_neuter_pars (par->next, modes, outer);
3733 }
3734
3735 /* PTX-specific reorganization
3736 - Split blocks at fork and join instructions
3737 - Compute live registers
3738 - Mark now-unused registers, so function begin doesn't declare
3739 unused registers.
3740 - Insert state propagation when entering partitioned mode
3741 - Insert neutering instructions when in single mode
3742 - Replace subregs with suitable sequences.
3743 */
3744
3745 static void
3746 nvptx_reorg (void)
3747 {
3748 /* We are freeing block_for_insn in the toplev to keep compatibility
3749 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3750 compute_bb_for_insn ();
3751
3752 thread_prologue_and_epilogue_insns ();
3753
3754 /* Split blocks and record interesting unspecs. */
3755 bb_insn_map_t bb_insn_map;
3756
3757 nvptx_split_blocks (&bb_insn_map);
3758
3759 /* Compute live regs */
3760 df_clear_flags (DF_LR_RUN_DCE);
3761 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3762 df_live_add_problem ();
3763 df_live_set_all_dirty ();
3764 df_analyze ();
3765 regstat_init_n_sets_and_refs ();
3766
3767 if (dump_file)
3768 df_dump (dump_file);
3769
3770 /* Mark unused regs as unused. */
3771 int max_regs = max_reg_num ();
3772 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3773 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3774 regno_reg_rtx[i] = const0_rtx;
3775
3776 /* Determine launch dimensions of the function. If it is not an
3777 offloaded function (i.e. this is a regular compiler), the
3778 function has no neutering. */
3779 tree attr = get_oacc_fn_attrib (current_function_decl);
3780 if (attr)
3781 {
3782 /* If we determined this mask before RTL expansion, we could
3783 elide emission of some levels of forks and joins. */
3784 unsigned mask = 0;
3785 tree dims = TREE_VALUE (attr);
3786 unsigned ix;
3787
3788 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3789 {
3790 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3791 tree allowed = TREE_PURPOSE (dims);
3792
3793 if (size != 1 && !(allowed && integer_zerop (allowed)))
3794 mask |= GOMP_DIM_MASK (ix);
3795 }
3796 /* If there is worker neutering, there must be vector
3797 neutering. Otherwise the hardware will fail. */
3798 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3799 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3800
3801 /* Discover & process partitioned regions. */
3802 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3803 nvptx_process_pars (pars);
3804 nvptx_neuter_pars (pars, mask, 0);
3805 delete pars;
3806 }
3807
3808 /* Replace subregs. */
3809 nvptx_reorg_subreg ();
3810
3811 regstat_free_n_sets_and_refs ();
3812
3813 df_finish_pass (true);
3814 }
3815 \f
3816 /* Handle a "kernel" attribute; arguments as in
3817 struct attribute_spec.handler. */
3818
3819 static tree
3820 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3821 int ARG_UNUSED (flags), bool *no_add_attrs)
3822 {
3823 tree decl = *node;
3824
3825 if (TREE_CODE (decl) != FUNCTION_DECL)
3826 {
3827 error ("%qE attribute only applies to functions", name);
3828 *no_add_attrs = true;
3829 }
3830
3831 else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
3832 {
3833 error ("%qE attribute requires a void return type", name);
3834 *no_add_attrs = true;
3835 }
3836
3837 return NULL_TREE;
3838 }
3839
3840 /* Table of valid machine attributes. */
3841 static const struct attribute_spec nvptx_attribute_table[] =
3842 {
3843 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3844 affects_type_identity } */
3845 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3846 { NULL, 0, 0, false, false, false, NULL, false }
3847 };
3848 \f
3849 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3850
3851 static HOST_WIDE_INT
3852 nvptx_vector_alignment (const_tree type)
3853 {
3854 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3855
3856 return MIN (align, BIGGEST_ALIGNMENT);
3857 }
3858
3859 /* Indicate that INSN cannot be duplicated. */
3860
3861 static bool
3862 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3863 {
3864 switch (recog_memoized (insn))
3865 {
3866 case CODE_FOR_nvptx_shufflesi:
3867 case CODE_FOR_nvptx_shufflesf:
3868 case CODE_FOR_nvptx_barsync:
3869 case CODE_FOR_nvptx_fork:
3870 case CODE_FOR_nvptx_forked:
3871 case CODE_FOR_nvptx_joining:
3872 case CODE_FOR_nvptx_join:
3873 return true;
3874 default:
3875 return false;
3876 }
3877 }
3878
3879 /* Section anchors do not work. Initialization for flag_section_anchors
3880 probes the existence of the anchoring target hooks and prevents
3881 anchoring if they don't exist. However, we may be used alongside
3882 a host-side compiler that does support anchoring, and hence see
3883 the anchor flag set (as it's not recalculated). So provide an
3884 implementation that denies anchoring. */
3885
3886 static bool
3887 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3888 {
3889 return false;
3890 }
3891 \f
3892 /* Record a symbol for mkoffload to enter into the mapping table. */
3893
3894 static void
3895 nvptx_record_offload_symbol (tree decl)
3896 {
3897 switch (TREE_CODE (decl))
3898 {
3899 case VAR_DECL:
3900 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3901 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3902 break;
3903
3904 case FUNCTION_DECL:
3905 {
3906 tree attr = get_oacc_fn_attrib (decl);
3907 tree dims = TREE_VALUE (attr);
3908 unsigned ix;
3909
3910 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3911 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3912
3913 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3914 {
3915 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3916
3917 gcc_assert (!TREE_PURPOSE (dims));
3918 fprintf (asm_out_file, ", %#x", size);
3919 }
3920
3921 fprintf (asm_out_file, "\n");
3922 }
3923 break;
3924
3925 default:
3926 gcc_unreachable ();
3927 }
3928 }
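/* A sketch of what this emits (names and dimension values here are
   illustrative only): a variable yields a line such as
     //:VAR_MAP "my_var"
   while a function with its three launch dimensions yields
     //:FUNC_MAP "my_func", 0x1, 0x20, 0x20
   mkoffload scans for these markers when building the offload
   mapping table. */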
3929
3930 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3931 at the start of a file. */
3932
3933 static void
3934 nvptx_file_start (void)
3935 {
3936 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3937 fputs ("\t.version\t3.1\n", asm_out_file);
3938 fputs ("\t.target\tsm_30\n", asm_out_file);
3939 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3940 fputs ("// END PREAMBLE\n", asm_out_file);
3941 }
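/* For example, a 64-bit (Pmode == DImode) compilation starts the
   assembler output with:
     // BEGIN PREAMBLE
     .version 3.1
     .target sm_30
     .address_size 64
     // END PREAMBLE  */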
3942
3943 /* Write out the function declarations we've collected and declare storage
3944 for the broadcast and worker-reduction buffers. */
3945
3946 static void
3947 nvptx_file_end (void)
3948 {
3949 hash_table<tree_hasher>::iterator iter;
3950 tree decl;
3951 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3952 nvptx_record_fndecl (decl);
3953 fputs (func_decls.str().c_str(), asm_out_file);
3954
3955 if (worker_bcast_size)
3956 {
3957 /* Define the broadcast buffer. */
3958
3959 worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
3960 & ~(worker_bcast_align - 1);
3961
3962 write_var_marker (asm_out_file, true, false, worker_bcast_name);
3963 fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
3964 worker_bcast_align,
3965 worker_bcast_name, worker_bcast_size);
3966 }
3967
3968 if (worker_red_size)
3969 {
3970 /* Define the reduction buffer. */
3971
3972 worker_red_size = ((worker_red_size + worker_red_align - 1)
3973 & ~(worker_red_align - 1));
3974
3975 write_var_marker (asm_out_file, true, false, worker_red_name);
3976 fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
3977 worker_red_align,
3978 worker_red_name, worker_red_size);
3979 }
3980 }
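/* For reference, each buffer above is emitted as a single line of the
   form (the name, alignment and size depend on what the broadcast and
   reduction expanders recorded for this translation unit):
     .shared .align <align> .u8 <name>[<size>];  */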
3981
3982 /* Expander for the shuffle builtins. */
3983
3984 static rtx
3985 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3986 {
3987 if (ignore)
3988 return target;
3989
3990 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3991 NULL_RTX, mode, EXPAND_NORMAL);
3992 if (!REG_P (src))
3993 src = copy_to_mode_reg (mode, src);
3994
3995 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3996 NULL_RTX, SImode, EXPAND_NORMAL);
3997 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3998 NULL_RTX, SImode, EXPAND_NORMAL);
3999
4000 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4001 idx = copy_to_mode_reg (SImode, idx);
4002
4003 rtx pat = nvptx_gen_shuffle (target, src, idx,
4004 (nvptx_shuffle_kind) INTVAL (op));
4005 if (pat)
4006 emit_insn (pat);
4007
4008 return target;
4009 }
4010
4011 /* Worker reduction address expander. */
4012
4013 static rtx
4014 nvptx_expand_worker_addr (tree exp, rtx target,
4015 machine_mode ARG_UNUSED (mode), int ignore)
4016 {
4017 if (ignore)
4018 return target;
4019
4020 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4021 if (align > worker_red_align)
4022 worker_red_align = align;
4023
4024 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4025 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4026 if (size + offset > worker_red_size)
4027 worker_red_size = size + offset;
4028
4029 rtx addr = worker_red_sym;
4030 if (offset)
4031 {
4032 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4033 addr = gen_rtx_CONST (Pmode, addr);
4034 }
4035
4036 emit_move_insn (target, addr);
4037
4038 return target;
4039 }
4040
4041 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4042 not require taking the address of any object, other than the memory
4043 cell being operated on. */
4044
4045 static rtx
4046 nvptx_expand_cmp_swap (tree exp, rtx target,
4047 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4048 {
4049 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4050
4051 if (!target)
4052 target = gen_reg_rtx (mode);
4053
4054 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4055 NULL_RTX, Pmode, EXPAND_NORMAL);
4056 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4057 NULL_RTX, mode, EXPAND_NORMAL);
4058 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4059 NULL_RTX, mode, EXPAND_NORMAL);
4060 rtx pat;
4061
4062 mem = gen_rtx_MEM (mode, mem);
4063 if (!REG_P (cmp))
4064 cmp = copy_to_mode_reg (mode, cmp);
4065 if (!REG_P (src))
4066 src = copy_to_mode_reg (mode, src);
4067
4068 if (mode == SImode)
4069 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4070 else
4071 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4072
4073 emit_insn (pat);
4074
4075 return target;
4076 }
4077
4078
4079 /* Codes for all the NVPTX builtins. */
4080 enum nvptx_builtins
4081 {
4082 NVPTX_BUILTIN_SHUFFLE,
4083 NVPTX_BUILTIN_SHUFFLELL,
4084 NVPTX_BUILTIN_WORKER_ADDR,
4085 NVPTX_BUILTIN_CMP_SWAP,
4086 NVPTX_BUILTIN_CMP_SWAPLL,
4087 NVPTX_BUILTIN_MAX
4088 };
4089
4090 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4091
4092 /* Return the NVPTX builtin for CODE. */
4093
4094 static tree
4095 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4096 {
4097 if (code >= NVPTX_BUILTIN_MAX)
4098 return error_mark_node;
4099
4100 return nvptx_builtin_decls[code];
4101 }
4102
4103 /* Set up all builtin functions for this target. */
4104
4105 static void
4106 nvptx_init_builtins (void)
4107 {
4108 #define DEF(ID, NAME, T) \
4109 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4110 = add_builtin_function ("__builtin_nvptx_" NAME, \
4111 build_function_type_list T, \
4112 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4113 #define ST sizetype
4114 #define UINT unsigned_type_node
4115 #define LLUINT long_long_unsigned_type_node
4116 #define PTRVOID ptr_type_node
4117
4118 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4119 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4120 DEF (WORKER_ADDR, "worker_addr",
4121 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4122 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4123 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4124
4125 #undef DEF
4126 #undef ST
4127 #undef UINT
4128 #undef LLUINT
4129 #undef PTRVOID
4130 }
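/* Usage sketch (signatures per the DEF lines above): the reduction
   machinery below calls these as, roughly,
     ptr = __builtin_nvptx_worker_addr (offset, size, align);
     old = __builtin_nvptx_cmp_swap (ptr, expected, desired);
   with the "ll" variants used for 64-bit quantities. */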
4131
4132 /* Expand an expression EXP that calls a built-in function,
4133 with result going to TARGET if that's convenient
4134 (and in mode MODE if that's convenient).
4135 SUBTARGET may be used as the target for computing one of EXP's operands.
4136 IGNORE is nonzero if the value is to be ignored. */
4137
4138 static rtx
4139 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4140 machine_mode mode, int ignore)
4141 {
4142 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4143 switch (DECL_FUNCTION_CODE (fndecl))
4144 {
4145 case NVPTX_BUILTIN_SHUFFLE:
4146 case NVPTX_BUILTIN_SHUFFLELL:
4147 return nvptx_expand_shuffle (exp, target, mode, ignore);
4148
4149 case NVPTX_BUILTIN_WORKER_ADDR:
4150 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4151
4152 case NVPTX_BUILTIN_CMP_SWAP:
4153 case NVPTX_BUILTIN_CMP_SWAPLL:
4154 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4155
4156 default: gcc_unreachable ();
4157 }
4158 }
4159 \f
4160 /* Define dimension sizes for known hardware. */
4161 #define PTX_VECTOR_LENGTH 32
4162 #define PTX_WORKER_LENGTH 32
4163
4164 /* Validate compute dimensions of an OpenACC offload or routine, fill
4165 in non-unity defaults. FN_LEVEL indicates the level at which a
4166 routine might spawn a loop. It is negative for non-routines. */
4167
4168 static bool
4169 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4170 {
4171 bool changed = false;
4172
4173 /* The vector size must be 32, unless this is a SEQ routine. */
4174 if (fn_level <= GOMP_DIM_VECTOR
4175 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4176 {
4177 if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
4178 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4179 dims[GOMP_DIM_VECTOR]
4180 ? "using vector_length (%d), ignoring %d"
4181 : "using vector_length (%d), ignoring runtime setting",
4182 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4183 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4184 changed = true;
4185 }
4186
4187 /* Check that the requested number of workers is not too large. */
4188 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4189 {
4190 warning_at (DECL_SOURCE_LOCATION (decl), 0,
4191 "using num_workers (%d), ignoring %d",
4192 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4193 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4194 changed = true;
4195 }
4196
4197 return changed;
4198 }
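/* As an example of the clamping above: an offloaded region requesting
   vector_length (64) and num_workers (64) is adjusted to use 32 and 32
   respectively, with each adjustment reported by the warnings emitted
   above. */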
4199
4200 /* Return maximum dimension size, or zero for unbounded. */
4201
4202 static int
4203 nvptx_dim_limit (int axis)
4204 {
4205 switch (axis)
4206 {
4207 case GOMP_DIM_WORKER:
4208 return PTX_WORKER_LENGTH;
4209
4210 case GOMP_DIM_VECTOR:
4211 return PTX_VECTOR_LENGTH;
4212
4213 default:
4214 break;
4215 }
4216 return 0;
4217 }
4218
4219 /* Determine whether fork & joins are needed. */
4220
4221 static bool
4222 nvptx_goacc_fork_join (gcall *call, const int dims[],
4223 bool ARG_UNUSED (is_fork))
4224 {
4225 tree arg = gimple_call_arg (call, 2);
4226 unsigned axis = TREE_INT_CST_LOW (arg);
4227
4228 /* We only care about worker and vector partitioning. */
4229 if (axis < GOMP_DIM_WORKER)
4230 return false;
4231
4232 /* If the size is 1, there's no partitioning. */
4233 if (dims[axis] == 1)
4234 return false;
4235
4236 return true;
4237 }
4238
4239 /* Generate a PTX builtin function call that returns the address in
4240 the worker reduction buffer at OFFSET. TYPE is the type of the
4241 data at that location. */
4242
4243 static tree
4244 nvptx_get_worker_red_addr (tree type, tree offset)
4245 {
4246 machine_mode mode = TYPE_MODE (type);
4247 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4248 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4249 tree align = build_int_cst (unsigned_type_node,
4250 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4251 tree call = build_call_expr (fndecl, 3, offset, size, align);
4252
4253 return fold_convert (build_pointer_type (type), call);
4254 }
4255
4256 /* Emit a SHFL.DOWN over distance SHIFT of VAR into DEST_VAR. This
4257 function will cast the variable if necessary. */
4258
4259 static void
4260 nvptx_generate_vector_shuffle (location_t loc,
4261 tree dest_var, tree var, unsigned shift,
4262 gimple_seq *seq)
4263 {
4264 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4265 tree_code code = NOP_EXPR;
4266 tree arg_type = unsigned_type_node;
4267 tree var_type = TREE_TYPE (var);
4268 tree dest_type = var_type;
4269
4270 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4271 var_type = TREE_TYPE (var_type);
4272
4273 if (TREE_CODE (var_type) == REAL_TYPE)
4274 code = VIEW_CONVERT_EXPR;
4275
4276 if (TYPE_SIZE (var_type)
4277 == TYPE_SIZE (long_long_unsigned_type_node))
4278 {
4279 fn = NVPTX_BUILTIN_SHUFFLELL;
4280 arg_type = long_long_unsigned_type_node;
4281 }
4282
4283 tree call = nvptx_builtin_decl (fn, true);
4284 tree bits = build_int_cst (unsigned_type_node, shift);
4285 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4286 tree expr;
4287
4288 if (var_type != dest_type)
4289 {
4290 /* Do real and imaginary parts separately. */
4291 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4292 real = fold_build1 (code, arg_type, real);
4293 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4294 real = fold_build1 (code, var_type, real);
4295
4296 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4297 imag = fold_build1 (code, arg_type, imag);
4298 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4299 imag = fold_build1 (code, var_type, imag);
4300
4301 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4302 }
4303 else
4304 {
4305 expr = fold_build1 (code, arg_type, var);
4306 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4307 expr = fold_build1 (code, dest_type, expr);
4308 }
4309
4310 gimplify_assign (dest_var, expr, seq);
4311 }
4312
4313 /* Lazily generate the global lock var decl and return its address. */
4314
4315 static tree
4316 nvptx_global_lock_addr ()
4317 {
4318 tree v = global_lock_var;
4319
4320 if (!v)
4321 {
4322 tree name = get_identifier ("__reduction_lock");
4323 tree type = build_qualified_type (unsigned_type_node,
4324 TYPE_QUAL_VOLATILE);
4325 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4326 global_lock_var = v;
4327 DECL_ARTIFICIAL (v) = 1;
4328 DECL_EXTERNAL (v) = 1;
4329 TREE_STATIC (v) = 1;
4330 TREE_PUBLIC (v) = 1;
4331 TREE_USED (v) = 1;
4332 mark_addressable (v);
4333 mark_decl_referenced (v);
4334 }
4335
4336 return build_fold_addr_expr (v);
4337 }
4338
4339 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4340 GSI. We use a lockless scheme for nearly all cases, which looks
4341 like:
4342 actual = initval(OP);
4343 do {
4344 guess = actual;
4345 write = guess OP myval;
4346 actual = cmp&swap (ptr, guess, write)
4347 } while (actual bit-different-to guess);
4348 return write;
4349
4350 This relies on a cmp&swap instruction, which is available for 32-
4351 and 64-bit types. Larger types must use a locking scheme. */
4352
4353 static tree
4354 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4355 tree ptr, tree var, tree_code op)
4356 {
4357 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4358 tree_code code = NOP_EXPR;
4359 tree arg_type = unsigned_type_node;
4360 tree var_type = TREE_TYPE (var);
4361
4362 if (TREE_CODE (var_type) == COMPLEX_TYPE
4363 || TREE_CODE (var_type) == REAL_TYPE)
4364 code = VIEW_CONVERT_EXPR;
4365
4366 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4367 {
4368 arg_type = long_long_unsigned_type_node;
4369 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4370 }
4371
4372 tree swap_fn = nvptx_builtin_decl (fn, true);
4373
4374 gimple_seq init_seq = NULL;
4375 tree init_var = make_ssa_name (arg_type);
4376 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4377 init_expr = fold_build1 (code, arg_type, init_expr);
4378 gimplify_assign (init_var, init_expr, &init_seq);
4379 gimple *init_end = gimple_seq_last (init_seq);
4380
4381 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4382
4383 /* Split the block just after the init stmts. */
4384 basic_block pre_bb = gsi_bb (*gsi);
4385 edge pre_edge = split_block (pre_bb, init_end);
4386 basic_block loop_bb = pre_edge->dest;
4387 pre_bb = pre_edge->src;
4388 /* Reset the iterator. */
4389 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4390
4391 tree expect_var = make_ssa_name (arg_type);
4392 tree actual_var = make_ssa_name (arg_type);
4393 tree write_var = make_ssa_name (arg_type);
4394
4395 /* Build and insert the reduction calculation. */
4396 gimple_seq red_seq = NULL;
4397 tree write_expr = fold_build1 (code, var_type, expect_var);
4398 write_expr = fold_build2 (op, var_type, write_expr, var);
4399 write_expr = fold_build1 (code, arg_type, write_expr);
4400 gimplify_assign (write_var, write_expr, &red_seq);
4401
4402 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4403
4404 /* Build & insert the cmp&swap sequence. */
4405 gimple_seq latch_seq = NULL;
4406 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4407 ptr, expect_var, write_var);
4408 gimplify_assign (actual_var, swap_expr, &latch_seq);
4409
4410 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4411 NULL_TREE, NULL_TREE);
4412 gimple_seq_add_stmt (&latch_seq, cond);
4413
4414 gimple *latch_end = gimple_seq_last (latch_seq);
4415 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4416
4417 /* Split the block just after the latch stmts. */
4418 edge post_edge = split_block (loop_bb, latch_end);
4419 basic_block post_bb = post_edge->dest;
4420 loop_bb = post_edge->src;
4421 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4422
4423 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4424 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4425 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4426 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4427
4428 gphi *phi = create_phi_node (expect_var, loop_bb);
4429 add_phi_arg (phi, init_var, pre_edge, loc);
4430 add_phi_arg (phi, actual_var, loop_edge, loc);
4431
4432 loop *loop = alloc_loop ();
4433 loop->header = loop_bb;
4434 loop->latch = loop_bb;
4435 add_loop (loop, loop_bb->loop_father);
4436
4437 return fold_build1 (code, var_type, write_var);
4438 }
4439
4440 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4441 GSI. This is necessary for types larger than 64 bits, where there
4442 is no cmp&swap instruction to implement a lockless scheme. We use
4443 a lock variable in global memory.
4444
4445 while (cmp&swap (&lock_var, 0, 1))
4446 continue;
4447 T accum = *ptr;
4448 accum = accum OP var;
4449 *ptr = accum;
4450 cmp&swap (&lock_var, 1, 0);
4451 return accum;
4452
4453 A lock in global memory is necessary to force execution engine
4454 descheduling and avoid resource starvation that can occur if the
4455 lock is in .shared memory. */
4456
4457 static tree
4458 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4459 tree ptr, tree var, tree_code op)
4460 {
4461 tree var_type = TREE_TYPE (var);
4462 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4463 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4464 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4465
4466 /* Split the block just before the gsi. Insert a gimple nop to make
4467 this easier. */
4468 gimple *nop = gimple_build_nop ();
4469 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4470 basic_block entry_bb = gsi_bb (*gsi);
4471 edge entry_edge = split_block (entry_bb, nop);
4472 basic_block lock_bb = entry_edge->dest;
4473 /* Reset the iterator. */
4474 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4475
4476 /* Build and insert the locking sequence. */
4477 gimple_seq lock_seq = NULL;
4478 tree lock_var = make_ssa_name (unsigned_type_node);
4479 tree lock_expr = nvptx_global_lock_addr ();
4480 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4481 uns_unlocked, uns_locked);
4482 gimplify_assign (lock_var, lock_expr, &lock_seq);
4483 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4484 NULL_TREE, NULL_TREE);
4485 gimple_seq_add_stmt (&lock_seq, cond);
4486 gimple *lock_end = gimple_seq_last (lock_seq);
4487 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4488
4489 /* Split the block just after the lock sequence. */
4490 edge locked_edge = split_block (lock_bb, lock_end);
4491 basic_block update_bb = locked_edge->dest;
4492 lock_bb = locked_edge->src;
4493 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4494
4495 /* Create the lock loop ... */
4496 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4497 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4498 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4499 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4500
4501 /* ... and the loop structure. */
4502 loop *lock_loop = alloc_loop ();
4503 lock_loop->header = lock_bb;
4504 lock_loop->latch = lock_bb;
4505 lock_loop->nb_iterations_estimate = 1;
4506 lock_loop->any_estimate = true;
4507 add_loop (lock_loop, entry_bb->loop_father);
4508
4509 /* Build and insert the reduction calculation. */
4510 gimple_seq red_seq = NULL;
4511 tree acc_in = make_ssa_name (var_type);
4512 tree ref_in = build_simple_mem_ref (ptr);
4513 TREE_THIS_VOLATILE (ref_in) = 1;
4514 gimplify_assign (acc_in, ref_in, &red_seq);
4515
4516 tree acc_out = make_ssa_name (var_type);
4517 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4518 gimplify_assign (acc_out, update_expr, &red_seq);
4519
4520 tree ref_out = build_simple_mem_ref (ptr);
4521 TREE_THIS_VOLATILE (ref_out) = 1;
4522 gimplify_assign (ref_out, acc_out, &red_seq);
4523
4524 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4525
4526 /* Build & insert the unlock sequence. */
4527 gimple_seq unlock_seq = NULL;
4528 tree unlock_expr = nvptx_global_lock_addr ();
4529 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4530 uns_locked, uns_unlocked);
4531 gimplify_and_add (unlock_expr, &unlock_seq);
4532 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4533
4534 return acc_out;
4535 }
4536
4537 /* Emit a sequence to update a reduction accumulator at *PTR with the
4538 value held in VAR using operator OP. Return the updated value.
4539
4540 TODO: optimize for atomic ops and independent complex ops. */
4541
4542 static tree
4543 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4544 tree ptr, tree var, tree_code op)
4545 {
4546 tree type = TREE_TYPE (var);
4547 tree size = TYPE_SIZE (type);
4548
4549 if (size == TYPE_SIZE (unsigned_type_node)
4550 || size == TYPE_SIZE (long_long_unsigned_type_node))
4551 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4552 else
4553 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4554 }
4555
4556 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4557
4558 static void
4559 nvptx_goacc_reduction_setup (gcall *call)
4560 {
4561 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4562 tree lhs = gimple_call_lhs (call);
4563 tree var = gimple_call_arg (call, 2);
4564 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4565 gimple_seq seq = NULL;
4566
4567 push_gimplify_context (true);
4568
4569 if (level != GOMP_DIM_GANG)
4570 {
4571 /* Copy the receiver object. */
4572 tree ref_to_res = gimple_call_arg (call, 1);
4573
4574 if (!integer_zerop (ref_to_res))
4575 var = build_simple_mem_ref (ref_to_res);
4576 }
4577
4578 if (level == GOMP_DIM_WORKER)
4579 {
4580 /* Store incoming value to worker reduction buffer. */
4581 tree offset = gimple_call_arg (call, 5);
4582 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4583 tree ptr = make_ssa_name (TREE_TYPE (call));
4584
4585 gimplify_assign (ptr, call, &seq);
4586 tree ref = build_simple_mem_ref (ptr);
4587 TREE_THIS_VOLATILE (ref) = 1;
4588 gimplify_assign (ref, var, &seq);
4589 }
4590
4591 if (lhs)
4592 gimplify_assign (lhs, var, &seq);
4593
4594 pop_gimplify_context (NULL);
4595 gsi_replace_with_seq (&gsi, seq, true);
4596 }
4597
4598 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4599
4600 static void
4601 nvptx_goacc_reduction_init (gcall *call)
4602 {
4603 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4604 tree lhs = gimple_call_lhs (call);
4605 tree var = gimple_call_arg (call, 2);
4606 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4607 enum tree_code rcode
4608 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4609 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4610 TREE_TYPE (var));
4611 gimple_seq seq = NULL;
4612
4613 push_gimplify_context (true);
4614
4615 if (level == GOMP_DIM_VECTOR)
4616 {
4617 /* Initialize the non-zero vector lanes to INIT_VAL (OP); lane zero
keeps the incoming value. */
4618 tree tid = make_ssa_name (integer_type_node);
4619 tree dim_vector = gimple_call_arg (call, 3);
4620 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4621 dim_vector);
4622 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4623 NULL_TREE, NULL_TREE);
4624
4625 gimple_call_set_lhs (tid_call, tid);
4626 gimple_seq_add_stmt (&seq, tid_call);
4627 gimple_seq_add_stmt (&seq, cond_stmt);
4628
4629 /* Split the block just after the call. */
4630 edge init_edge = split_block (gsi_bb (gsi), call);
4631 basic_block init_bb = init_edge->dest;
4632 basic_block call_bb = init_edge->src;
4633
4634 /* Fixup flags from call_bb to init_bb. */
4635 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4636
4637 /* Set the initialization stmts. */
4638 gimple_seq init_seq = NULL;
4639 tree init_var = make_ssa_name (TREE_TYPE (var));
4640 gimplify_assign (init_var, init, &init_seq);
4641 gsi = gsi_start_bb (init_bb);
4642 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4643
4644 /* Split block just after the init stmt. */
4645 gsi_prev (&gsi);
4646 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4647 basic_block dst_bb = inited_edge->dest;
4648
4649 /* Create false edge from call_bb to dst_bb. */
4650 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4651
4652 /* Create phi node in dst block. */
4653 gphi *phi = create_phi_node (lhs, dst_bb);
4654 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4655 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4656
4657 /* Reset dominator of dst bb. */
4658 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4659
4660 /* Reset the gsi. */
4661 gsi = gsi_for_stmt (call);
4662 }
4663 else
4664 {
4665 if (level == GOMP_DIM_GANG)
4666 {
4667 /* If there's no receiver object, propagate the incoming VAR. */
4668 tree ref_to_res = gimple_call_arg (call, 1);
4669 if (integer_zerop (ref_to_res))
4670 init = var;
4671 }
4672
4673 gimplify_assign (lhs, init, &seq);
4674 }
4675
4676 pop_gimplify_context (NULL);
4677 gsi_replace_with_seq (&gsi, seq, true);
4678 }
4679
4680 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4681
4682 static void
4683 nvptx_goacc_reduction_fini (gcall *call)
4684 {
4685 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4686 tree lhs = gimple_call_lhs (call);
4687 tree ref_to_res = gimple_call_arg (call, 1);
4688 tree var = gimple_call_arg (call, 2);
4689 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4690 enum tree_code op
4691 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4692 gimple_seq seq = NULL;
4693 tree r = NULL_TREE;
4694
4695 push_gimplify_context (true);
4696
4697 if (level == GOMP_DIM_VECTOR)
4698 {
4699 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
4700 but that requires a method of emitting a unified jump at the
4701 gimple level. */
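/* Schematically, with PTX_VECTOR_LENGTH == 32 the loop below unrolls
   into five combining steps (a conventional warp tree reduction):
     t = var OP shuffle_down (var, 16);
     t = t OP shuffle_down (t, 8);
     ...
     t = t OP shuffle_down (t, 1);
   after which lane 0 holds the reduction over all 32 vector lanes.
   (Pseudo-ops only; the real code goes through the
   __builtin_nvptx_shuffle builtins declared above.) */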
4702 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4703 {
4704 tree other_var = make_ssa_name (TREE_TYPE (var));
4705 nvptx_generate_vector_shuffle (gimple_location (call),
4706 other_var, var, shfl, &seq);
4707
4708 r = make_ssa_name (TREE_TYPE (var));
4709 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4710 var, other_var), &seq);
4711 var = r;
4712 }
4713 }
4714 else
4715 {
4716 tree accum = NULL_TREE;
4717
4718 if (level == GOMP_DIM_WORKER)
4719 {
4720 /* Get reduction buffer address. */
4721 tree offset = gimple_call_arg (call, 5);
4722 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4723 tree ptr = make_ssa_name (TREE_TYPE (call));
4724
4725 gimplify_assign (ptr, call, &seq);
4726 accum = ptr;
4727 }
4728 else if (integer_zerop (ref_to_res))
4729 r = var;
4730 else
4731 accum = ref_to_res;
4732
4733 if (accum)
4734 {
4735 /* UPDATE the accumulator. */
4736 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4737 seq = NULL;
4738 r = nvptx_reduction_update (gimple_location (call), &gsi,
4739 accum, var, op);
4740 }
4741 }
4742
4743 if (lhs)
4744 gimplify_assign (lhs, r, &seq);
4745 pop_gimplify_context (NULL);
4746
4747 gsi_replace_with_seq (&gsi, seq, true);
4748 }
4749
4750 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4751
4752 static void
4753 nvptx_goacc_reduction_teardown (gcall *call)
4754 {
4755 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4756 tree lhs = gimple_call_lhs (call);
4757 tree var = gimple_call_arg (call, 2);
4758 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4759 gimple_seq seq = NULL;
4760
4761 push_gimplify_context (true);
4762 if (level == GOMP_DIM_WORKER)
4763 {
4764 /* Read the worker reduction buffer. */
4765 tree offset = gimple_call_arg (call, 5);
4766 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4767 tree ptr = make_ssa_name (TREE_TYPE (call));
4768
4769 gimplify_assign (ptr, call, &seq);
4770 var = build_simple_mem_ref (ptr);
4771 TREE_THIS_VOLATILE (var) = 1;
4772 }
4773
4774 if (level != GOMP_DIM_GANG)
4775 {
4776 /* Write to the receiver object. */
4777 tree ref_to_res = gimple_call_arg (call, 1);
4778
4779 if (!integer_zerop (ref_to_res))
4780 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4781 }
4782
4783 if (lhs)
4784 gimplify_assign (lhs, var, &seq);
4785
4786 pop_gimplify_context (NULL);
4787
4788 gsi_replace_with_seq (&gsi, seq, true);
4789 }
4790
4791 /* NVPTX reduction expander. */
4792
4793 void
4794 nvptx_goacc_reduction (gcall *call)
4795 {
4796 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4797
4798 switch (code)
4799 {
4800 case IFN_GOACC_REDUCTION_SETUP:
4801 nvptx_goacc_reduction_setup (call);
4802 break;
4803
4804 case IFN_GOACC_REDUCTION_INIT:
4805 nvptx_goacc_reduction_init (call);
4806 break;
4807
4808 case IFN_GOACC_REDUCTION_FINI:
4809 nvptx_goacc_reduction_fini (call);
4810 break;
4811
4812 case IFN_GOACC_REDUCTION_TEARDOWN:
4813 nvptx_goacc_reduction_teardown (call);
4814 break;
4815
4816 default:
4817 gcc_unreachable ();
4818 }
4819 }
4820
4821 #undef TARGET_OPTION_OVERRIDE
4822 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4823
4824 #undef TARGET_ATTRIBUTE_TABLE
4825 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4826
4827 #undef TARGET_LEGITIMATE_ADDRESS_P
4828 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4829
4830 #undef TARGET_PROMOTE_FUNCTION_MODE
4831 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4832
4833 #undef TARGET_FUNCTION_ARG
4834 #define TARGET_FUNCTION_ARG nvptx_function_arg
4835 #undef TARGET_FUNCTION_INCOMING_ARG
4836 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4837 #undef TARGET_FUNCTION_ARG_ADVANCE
4838 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4839 #undef TARGET_FUNCTION_ARG_BOUNDARY
4840 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
4841 #undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
4842 #define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
4843 #undef TARGET_PASS_BY_REFERENCE
4844 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4845 #undef TARGET_FUNCTION_VALUE_REGNO_P
4846 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4847 #undef TARGET_FUNCTION_VALUE
4848 #define TARGET_FUNCTION_VALUE nvptx_function_value
4849 #undef TARGET_LIBCALL_VALUE
4850 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4851 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4852 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4853 #undef TARGET_GET_DRAP_RTX
4854 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4855 #undef TARGET_SPLIT_COMPLEX_ARG
4856 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4857 #undef TARGET_RETURN_IN_MEMORY
4858 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4859 #undef TARGET_OMIT_STRUCT_RETURN_REG
4860 #define TARGET_OMIT_STRUCT_RETURN_REG true
4861 #undef TARGET_STRICT_ARGUMENT_NAMING
4862 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4863 #undef TARGET_STATIC_CHAIN
4864 #define TARGET_STATIC_CHAIN nvptx_static_chain
4865
4866 #undef TARGET_CALL_ARGS
4867 #define TARGET_CALL_ARGS nvptx_call_args
4868 #undef TARGET_END_CALL_ARGS
4869 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4870
4871 #undef TARGET_ASM_FILE_START
4872 #define TARGET_ASM_FILE_START nvptx_file_start
4873 #undef TARGET_ASM_FILE_END
4874 #define TARGET_ASM_FILE_END nvptx_file_end
4875 #undef TARGET_ASM_GLOBALIZE_LABEL
4876 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4877 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4878 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4879 #undef TARGET_PRINT_OPERAND
4880 #define TARGET_PRINT_OPERAND nvptx_print_operand
4881 #undef TARGET_PRINT_OPERAND_ADDRESS
4882 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4883 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4884 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4885 #undef TARGET_ASM_INTEGER
4886 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4887 #undef TARGET_ASM_DECL_END
4888 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4889 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4890 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4891 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4892 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4893 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4894 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4895
4896 #undef TARGET_MACHINE_DEPENDENT_REORG
4897 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4898 #undef TARGET_NO_REGISTER_ALLOCATION
4899 #define TARGET_NO_REGISTER_ALLOCATION true
4900
4901 #undef TARGET_ENCODE_SECTION_INFO
4902 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
4903 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4904 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4905
4906 #undef TARGET_VECTOR_ALIGNMENT
4907 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4908
4909 #undef TARGET_CANNOT_COPY_INSN_P
4910 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4911
4912 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4913 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4914
4915 #undef TARGET_INIT_BUILTINS
4916 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4917 #undef TARGET_EXPAND_BUILTIN
4918 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4919 #undef TARGET_BUILTIN_DECL
4920 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4921
4922 #undef TARGET_GOACC_VALIDATE_DIMS
4923 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4924
4925 #undef TARGET_GOACC_DIM_LIMIT
4926 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4927
4928 #undef TARGET_GOACC_FORK_JOIN
4929 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4930
4931 #undef TARGET_GOACC_REDUCTION
4932 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4933
4934 struct gcc_target targetm = TARGET_INITIALIZER;
4935
4936 #include "gt-nvptx.h"